def print_meme_header(alph):
    """Write a MEME version 4 header for the alphabet *alph* to stdout.

    The header consists of, in order:
      * the "MEME version 4" banner;
      * the alphabet declaration: the compact ``ALPHABET= <symbols>`` line
        when *alph* compares equal to the predefined "DNA" or "Protein"
        alphabet, otherwise an ``ALPHABET "<name>"`` ... ``END ALPHABET``
        section wrapped around ``alph.asText()``;
      * a ``strands: + -`` line when ``alph.isComplementable()`` is true;
      * a uniform background: every symbol of ``alph.getSymbols()`` paired
        with the frequency ``1 / alph.getLen()``.

    Nothing is returned; all output goes to ``sys.stdout``.
    """
    emit = sys.stdout.write

    emit("\nMEME version 4\n\n")

    # Only the two predefined alphabets get the short one-line declaration.
    is_predefined = (alph == alphabet.getByName("DNA")
                     or alph == alphabet.getByName("Protein"))
    if not is_predefined:
        emit("ALPHABET {}\n".format(json.dumps(alph.getName())))
        emit(alph.asText())
        emit("END ALPHABET\n\n")
    else:
        emit("ALPHABET= {}\n\n".format("".join(alph.getSymbols())))

    if alph.isComplementable():
        emit("strands: + -\n\n")

    emit("Background letter frequencies (from uniform background):\n")
    # Every symbol shares the same frequency, so compute it once.
    uniform = 1.0 / alph.getLen()
    for letter in alph.getSymbols():
        emit("{:s} {:.4f} ".format(letter, uniform))
    emit("\n")
def __init__(self, sequence, alpha=None, name="", seqinfo=""):
    """Create a sequence with sequence data.

    Specifying the alphabet is optional, so is the name and info.

    Args:
        sequence: the symbols as a string, tuple or list. They are stored
            as a tuple in ``self.data``.
        alpha: how to choose the alphabet. ``None`` (default) asks
            ``alphabet.getBySeq`` to pick one from the symbols; a string is
            looked up with ``alphabet.getByName``; an ``alphabet.Alphabet``
            instance is used as given.
        name: stored as ``self.name``.
        seqinfo: stored as ``self.info``.

    Raises:
        RuntimeError: if *sequence* is not a string, tuple or list; if no
            alphabet can be identified or found by name; if *alpha* has an
            unexpected type; or if an explicitly given alphabet rejects
            the sequence (``alpha.isValidString`` is false).

    Example:
    >>> myseq = sequence.Sequence('MVSAKKVPAIAMSFGVSF')
    will create a sequence with name "", and assign one of the predefined
    alphabets on basis of what symbols were used.
    >>> myseq.getAlphabet().getSymbols()
    will most likely output the standard protein alphabet:
    ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
     'R', 'S', 'T', 'V', 'W', 'Y')
    """
    self.name = name
    self.info = seqinfo

    # isinstance (rather than an exact type comparison) also admits
    # subclasses; tuple() returns an exact tuple unchanged.
    if isinstance(sequence, (str, tuple, list)):
        self.data = tuple(sequence)
    else:
        raise RuntimeError(
            "Sequence data is not specified correctly: must be string or tuple")

    # Resolve choice of alphabet
    if alpha is None:
        # Alphabet is not set, attempt to set it automatically...
        alpha = alphabet.getBySeq(self.data)
        if alpha is None:
            raise RuntimeError("Could not identify alphabet from sequence")
        # An auto-detected alphabet is not validated against the data.
    else:
        if isinstance(alpha, basestring):
            alphaname = alpha
            alpha = alphabet.getByName(alphaname)
            if alpha is None:
                raise RuntimeError(
                    "No predefined alphabet with name \"" + alphaname + "\"")
        elif not isinstance(alpha, alphabet.Alphabet):
            raise RuntimeError("Unexpected type for alpha")
        # An explicitly requested alphabet (by name or by instance) must
        # accept every symbol of the sequence.
        if not alpha.isValidString(self.data):
            raise RuntimeError(
                "Invalid alphabet specified: " + "".join(alpha.getSymbols())
                + " is not compatible with sequence '"
                + "".join(self.data) + "'")
    # NOTE(review): within the code visible here the resolved `alpha` is
    # never stored on the instance, although the docstring example calls
    # getAlphabet() -- confirm whether an assignment is missing.
def __init__(self, sequence, alpha=None, name="", seqinfo=""):
    """Create a sequence with sequence data.

    Specifying the alphabet is optional, so is the name and info.

    Args:
        sequence: the symbols as a string, tuple or list. They are stored
            as a tuple in ``self.data``.
        alpha: how to choose the alphabet. ``None`` (default) asks
            ``alphabet.getBySeq`` to pick one from the symbols; a string is
            looked up with ``alphabet.getByName``; an ``alphabet.Alphabet``
            instance is used as given.
        name: stored as ``self.name``.
        seqinfo: stored as ``self.info``.

    Raises:
        RuntimeError: if *sequence* is not a string, tuple or list; if no
            alphabet can be identified or found by name; if *alpha* has an
            unexpected type; or if an explicitly given alphabet rejects
            the sequence (``alpha.isValidString`` is false).

    Example:
    >>> myseq = sequence.Sequence('MVSAKKVPAIAMSFGVSF')
    will create a sequence with name "", and assign one of the predefined
    alphabets on basis of what symbols were used.
    >>> myseq.getAlphabet().getSymbols()
    will most likely output the standard protein alphabet:
    ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
     'R', 'S', 'T', 'V', 'W', 'Y')
    """
    self.name = name
    self.info = seqinfo

    # isinstance (rather than an exact type comparison) also admits
    # subclasses; tuple() returns an exact tuple unchanged.
    if isinstance(sequence, (str, tuple, list)):
        self.data = tuple(sequence)
    else:
        raise RuntimeError(
            "Sequence data is not specified correctly: must be string or tuple")

    # Resolve choice of alphabet
    if alpha is None:
        # Alphabet is not set, attempt to set it automatically...
        alpha = alphabet.getBySeq(self.data)
        if alpha is None:
            raise RuntimeError("Could not identify alphabet from sequence")
        # An auto-detected alphabet is not validated against the data.
    else:
        if isinstance(alpha, basestring):
            alphaname = alpha
            alpha = alphabet.getByName(alphaname)
            if alpha is None:
                raise RuntimeError(
                    "No predefined alphabet with name \"" + alphaname + "\"")
        elif not isinstance(alpha, alphabet.Alphabet):
            raise RuntimeError("Unexpected type for alpha")
        # An explicitly requested alphabet (by name or by instance) must
        # accept every symbol of the sequence.
        if not alpha.isValidString(self.data):
            raise RuntimeError(
                "Invalid alphabet specified: " + "".join(alpha.getSymbols())
                + " is not compatible with sequence '"
                + "".join(self.data) + "'")
    # NOTE(review): within the code visible here the resolved `alpha` is
    # never stored on the instance, although the docstring example calls
    # getAlphabet() -- confirm whether an assignment is missing.
def convert_ambigs(strings, alph):
    """Convert aliases to prime symbol and ambiguous to wildcard
    in each of a list of strings.

    Every string is passed through ``str.translate`` with the table
    returned by ``alph.translator(False)``.

    Args:
        strings: mutable sequence of strings; changes are made in place.
        alph: object providing ``translator(False)``.

    Returns:
        The same *strings* object, after modification.
    """
    ms = alph.translator(False)
    # Assign by index so the caller's list itself is updated.
    for i, text in enumerate(strings):
        strings[i] = text.translate(ms)
    return strings


#------------------ Main method -------------------
# Executed if you run this file from the operating system prompt, e.g.
# > python sequence.py
if __name__ == '__main__':
    alpha = alphabet.getByName('DNA')
    #seqs = readFASTA('pos.fasta')
    seqs = []
    aln = readStrings('tmp0')
    #regexp = RegExp(alpha, '[AG]G.[DE]TT[AS].')
    pwm = PWM(alpha)
    pwm.setFromAlignment(aln)
    # Single-argument print(...) emits the same text under Python 2 and
    # also parses under Python 3.
    for row in pwm.pretty():
        print(row)
    for s in seqs:
        print("%s %s %s" % (s.getName(), s.getLen(),
                            s.getAlphabet().getSymbols()))
        # The regexp demo stays disabled together with the construction
        # above; calling regexp.match() while `regexp` is undefined would
        # raise a NameError. Re-enable both lines together.
        #for m in regexp.match(s):
        #    print("pos: %d pat: %s %4.2f" % (m[0], m[1], m[2]))
        for m in pwm.match(s):
            print("pos: %d pat: %s %4.2f" % (m[0], m[1], m[2]))
def convert_ambigs(strings, alph):
    """Convert aliases to prime symbol and ambiguous to wildcard
    in each of a list of strings.

    Every string is passed through ``str.translate`` with the table
    returned by ``alph.translator(False)``.

    Args:
        strings: mutable sequence of strings; changes are made in place.
        alph: object providing ``translator(False)``.

    Returns:
        The same *strings* object, after modification.
    """
    ms = alph.translator(False)
    # Assign by index so the caller's list itself is updated.
    for i, text in enumerate(strings):
        strings[i] = text.translate(ms)
    return strings


#------------------ Main method -------------------
# Executed if you run this file from the operating system prompt, e.g.
# > python sequence.py
if __name__ == '__main__':
    alpha = alphabet.getByName('DNA')
    #seqs = readFASTA('pos.fasta')
    seqs = []
    aln = readStrings('tmp0')
    #regexp = RegExp(alpha, '[AG]G.[DE]TT[AS].')
    pwm = PWM(alpha)
    pwm.setFromAlignment(aln)
    # Single-argument print(...) emits the same text under Python 2 and
    # also parses under Python 3.
    for row in pwm.pretty():
        print(row)
    for s in seqs:
        print("%s %s %s" % (s.getName(), s.getLen(),
                            s.getAlphabet().getSymbols()))
        # The regexp demo stays disabled together with the construction
        # above; calling regexp.match() while `regexp` is undefined would
        # raise a NameError. Re-enable both lines together.
        #for m in regexp.match(s):
        #    print("pos: %d pat: %s %4.2f" % (m[0], m[1], m[2]))
        for m in pwm.match(s):
            print("pos: %d pat: %s %4.2f" % (m[0], m[1], m[2]))