def test005():
    """Run CodeML with the FMutSel-F options on the bundled PAML test data.

    Reads the test FASTA file, parses the tree found on the second line of
    the test tree file, loads both into a ``paml.CodeML`` instance, runs it,
    calls ``putBranchRatesOnTree`` and finally ``cleanUp``.
    """
    print("** Test 005 **")
    (headers, seqs) = biofile.readFASTA("test-paml/paml-test-seqfile-0001.txt")
    # Only the second line of the tree file is parsed.  Read it inside a
    # context manager so the handle is closed instead of being leaked.
    with open("test-paml/paml-test-treefile-0001.txt", 'r') as tree_file:
        tree_line = tree_file.readlines()[1]
    tree = newick.tree.parseTree(tree_line)
    cm = paml.CodeML("codon", paml.CodeML.FMutSel_F_options)
    cm.loadSequences(seqs, headers, str(tree))
    cm.run()
    cm.putBranchRatesOnTree(headers, tree)
    cm.cleanUp()
def test005():
    """Exercise ``paml.CodeML`` (FMutSel-F options) on the PAML test files.

    The sequences come from the test FASTA file and the tree from the second
    line of the test tree file; after ``run()`` the branch rates are put on
    the tree and ``cleanUp()`` is called.
    """
    print("** Test 005 **")
    (headers, seqs) = biofile.readFASTA(
        "test-paml/paml-test-seqfile-0001.txt")
    # Close the tree file deterministically; the original left the handle
    # returned by file(...) open.
    with open("test-paml/paml-test-treefile-0001.txt", 'r') as tree_file:
        second_line = tree_file.readlines()[1]
    tree = newick.tree.parseTree(second_line)
    cm = paml.CodeML("codon", paml.CodeML.FMutSel_F_options)
    cm.loadSequences(seqs, headers, str(tree))
    cm.run()
    cm.putBranchRatesOnTree(headers, tree)
    cm.cleanUp()
# Echo every parsed option, sorted by name, as a commented header line.
optdict = vars(options)
for (k, v) in sorted(optdict.items()):
    data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))


def secondField(h):
    """Return ``biofile.secondField(h)``, or ``biofile.firstField(h)`` if that raises.

    Only ``Exception`` subclasses trigger the fallback, so KeyboardInterrupt
    and SystemExit are no longer swallowed as they were by the bare except.
    """
    try:
        return biofile.secondField(h)
    except Exception:
        return biofile.firstField(h)


# Read input
if not os.path.isfile(options.in_fname):
    raise IOError("# Error: file {} does not exist".format(options.in_fname))
# Use a context manager so the input handle is closed after parsing.
with open(options.in_fname, 'r') as inf:
    (headers, seqs) = biofile.readFASTA(inf)  #, key_fxn=biofile.secondField)
if options.translate_sequences:
    seqs = [translate.translate(s) for s in seqs]
if not options.dont_align_sequences:
    aligned_seqs = muscle.alignSequences(seqs)
    seqs = aligned_seqs
# Drop entries whose sequence is None before building the lookup tables.
zhs = [(h, s) for (h, s) in zip(headers, seqs) if s is not None]
all_keys = [biofile.firstField(h) for (h, s) in zhs]
(headers, seqs) = zip(*zhs)
prot_dict = dict([(biofile.firstField(h), s) for (h, s) in zhs])
gene_orf_dict = dict([(secondField(h), biofile.firstField(h)) for h in headers])
orf_gene_dict = dict([(v, k) for (k, v) in gene_orf_dict.items()])

# Write output
n_written = 0
data_outs.write("header\n")
# Write out parameters data_outs.write("# Run started {}\n".format(util.timestamp())) data_outs.write("# Command: {}\n".format(' '.join(sys.argv))) data_outs.write("# Parameters:\n") optdict = vars(options) for (k, v) in optdict.items(): data_outs.write("#\t{k}: {v}\n".format(k=k, v=v)) # Read input if not os.path.isfile(options.in_fname): raise IOError("# Error: file {} does not exist".format( options.in_fname)) with open(options.in_fname, 'r') as inf: # Read a FASTA file? (headers, seqs) = biofile.readFASTA(inf) info_outs.write("# Read {:d} sequences\n".format(len(seqs))) new_headers = headers new_seqs = seqs for append_fname in options.append_fasta: if not os.path.isfile(append_fname): raise IOError( "# Error: file {} does not exist".format(append_fname)) with open(append_fname, 'r') as inf: # Read a FASTA file? (app_headers, app_seqs) = biofile.readFASTA(inf) info_outs.write("# Read {:d} sequences\n".format(len(app_seqs))) assert len(app_seqs) == len(new_seqs) if options.check_headers: for (h1, h2) in zip(headers, app_headers):
random.seed(options.seed)

# Start up output: informational messages go to stdout, data goes to the
# requested output file or, when none is given, to stdout as well.
info_outs = util.OutStreams([sys.stdout])
data_outs = util.OutStreams()
multi_outs = util.OutStreams([info_outs, data_outs])
if options.out_fname is not None:
    # open() replaces the Python-2-only file() constructor.
    outf = open(options.out_fname, 'w')
    data_outs.addStream(outf)
else:
    data_outs.addStream(sys.stdout)

# Read sequences from a FASTA file?  If `seq` does not name an existing
# file it is treated as a literal sequence.
fname = os.path.expanduser(seq)
if os.path.isfile(fname):
    (headers, seq_list) = biofile.readFASTA(fname)
    # Materialise the pairs: len() is taken just below, and a lazy zip
    # object has no length.
    seqs = list(zip(headers, seq_list))
    info_outs.write("# Read {0:d} sequences from {1}\n".format(
        len(seqs), fname))
else:
    seqs = [("command-line input", seq.upper())]
    info_outs.write("# Read sequence L={:d} from standard input\n".format(
        len(seqs[0][1])))

# If reverse-translation is desired, do it.
if options.reverse_translate:
    new_seqs = []
    for (h, s) in seqs:
        rev_trans_seq = translate.reverseTranslate(s)
        new_seqs.append((h, rev_trans_seq))
    seqs = new_seqs
aas = None if not options.aas is None: if options.aas.lower() == 'all': aas = translate.AAs() else: aas = [aa for aa in options.aas] # Single sequence? if not options.sequence is None: headers = ['Input'] seqs = [options.sequence] else: if not options.in_fname is None: fname = os.path.expanduser(options.in_fname) #print(fname) (headers, seqs) = biofile.readFASTA(open(fname, 'r')) else: info_outs.write("# No sequence or file provided; exiting\n") sys.exit() #print("# Found", len(seqs), "sequences") #print("# Found", len(headers), "headers") ''' if options.report: # Write a long report per protein for (hdr, seq) in zip(headers,seqs): if options.degap: seq = seq.replace('-','') if not options.end_aa is None and options.end_aa<= len(seq): seq = seq[0:options.end_aa] #print options.end_aa, options.begin_aa seq = seq[options.begin_aa:] outs.write("length = {:d}\n".format(pp.getLength(seq)))
random.seed(options.seed)

# Start up output.  Data is written to options.out_fname when given,
# otherwise to stdout; info messages always go to stdout.
info_outs = util.OutStreams([sys.stdout])
data_outs = util.OutStreams()
multi_outs = util.OutStreams([info_outs, data_outs])
if options.out_fname is not None:
    # Use open(); the file() constructor exists only in Python 2.
    outf = open(options.out_fname, 'w')
    data_outs.addStream(outf)
else:
    data_outs.addStream(sys.stdout)

# Read sequences from a FASTA file?  Otherwise `seq` itself is the sequence.
fname = os.path.expanduser(seq)
if os.path.isfile(fname):
    (headers, seq_list) = biofile.readFASTA(fname)
    # Build a real list so that len(seqs) below is valid even where zip()
    # returns an iterator.
    seqs = list(zip(headers, seq_list))
    info_outs.write("# Read {0:d} sequences from {1}\n".format(len(seqs), fname))
else:
    seqs = [("command-line input", seq.upper())]
    info_outs.write("# Read sequence L={:d} from standard input\n".format(len(seqs[0][1])))

# If reverse-translation is desired, do it.
if options.reverse_translate:
    new_seqs = []
    for (h, s) in seqs:
        rev_trans_seq = translate.reverseTranslate(s)
        new_seqs.append((h, rev_trans_seq))
    seqs = new_seqs

# Obtain gene sequence using only optimal codons
if local_fname is None:
    # Pick a random temporary name.  The upper bound is an exact integer;
    # random.randint is specified for integer arguments and newer Python
    # versions reject the float 1e20.
    local_fname = "tmp{:d}".format(random.randint(0, 10**20))

# Fetch file from OrthoDB
if options.orthodb_id is not None:
    #local_fname = "uniprot-yeast.txt"
    remote_fname = "http://cegg.unige.ch/orthodb7/fasta.fasta?ogs={:s}".format(
        options.orthodb_id)
    urllib.urlretrieve(remote_fname, local_fname)
    print("# Downloaded {} to {}".format(remote_fname, local_fname))
    info_outs.write("# Downloaded {} to {}\n".format(
        remote_fname, local_fname))

# Read input
if not os.path.isfile(local_fname):
    raise IOError("# Error: file {} does not exist".format(local_fname))
with open(local_fname, 'r') as inf:
    # Read a FASTA file?
    (headers, seqs) = biofile.readFASTA(inf)

# The sequences are re-written to fasta_outs only when no FASTA output
# filename was given.
if options.fasta_out_fname is None:
    # Write data
    biofile.writeFASTA(seqs, fasta_outs, headers=headers)

# Write out stopping time
info_outs.write("# Run finished {}\n".format(util.timestamp()))

# Shut down output
if options.fasta_out_fname is not None:
    info_outs.write("# Fetched {} sequences to {}\n".format(
        len(headers), options.fasta_out_fname))
    outf.close()
import sys
import os
import math
import string
import biofile

if __name__ == '__main__':
    # Smoke test: the FASTA test file must yield 143 headers, and the
    # dictionary reader must return one key per header.
    (headers, seqs) = biofile.readFASTA('test-biofile/test-biofile-001.fa')
    assert len(headers) == 143
    dict_path = os.path.expanduser('test-biofile/test-biofile-001.fa')
    seq_dict = biofile.readFASTADict(dict_path)
    assert len(seq_dict.keys()) == len(headers)
frags = digestWithEnzyme(seq, "trypsin", 1) assert set(frags) == set(["VR", "VRK", "KT", "K", "T"]) print "\ttest006 passed" if __name__ == '__main__': fname = sys.argv[1] if fname == "__test__": print "Running tests..." test001() test002() test003() test004() test005() test006() print "All tests passed" sys.exit() patterns = sys.argv[2].split("/") complete = True if os.path.isfile(os.path.expanduser(fname)): (headers, seqs) = biofile.readFASTA(os.path.expanduser(fname)) else: seqs = [fname] for seq in seqs: frags = digest(seq, patterns, complete) #print "\n%s\n---" % seq for f in frags: print len(f), f if complete: assert ''.join(frags) == seq
aas = None if not options.aas is None: if options.aas.lower() == 'all': aas = translate.AAs() else: aas = [aa for aa in options.aas] # Single sequence? if not options.sequence is None: headers = ['Input'] seqs = [options.sequence] else: if not options.in_fname is None: fname = os.path.expanduser(options.in_fname) #print(fname) (headers,seqs) = biofile.readFASTA(open(fname, 'r')) else: info_outs.write("# No sequence or file provided; exiting\n") sys.exit() #print("# Found", len(seqs), "sequences") #print("# Found", len(headers), "headers") ''' if options.report: # Write a long report per protein for (hdr, seq) in zip(headers,seqs): if options.degap: seq = seq.replace('-','') if not options.end_aa is None and options.end_aa<= len(seq): seq = seq[0:options.end_aa] #print options.end_aa, options.begin_aa seq = seq[options.begin_aa:]
data_outs.addStream(sys.stdout) # Write out parameters data_outs.write("# Run started {}\n".format(util.timestamp())) data_outs.write("# Command: {}\n".format(' '.join(sys.argv))) data_outs.write("# Parameters:\n") optdict = vars(options) for (k,v) in optdict.items(): data_outs.write("#\t{k}: {v}\n".format(k=k, v=v)) # Read input if not os.path.isfile(options.in_fname): raise IOError("# Error: file {} does not exist".format(options.in_fname)) with open(options.in_fname,'r') as inf: # Read a FASTA file? (headers, seqs) = biofile.readFASTA(inf) for fname in options.other_fasta_fnames: if not os.path.isfile(fname): raise IOError("# Error: file {} does not exist".format(fname)) with open(fname,'r') as inf: # Read a FASTA file? (new_headers, new_seqs) = biofile.readFASTA(inf) headers = headers + new_headers seqs = seqs + new_seqs if not os.path.isfile(options.in_names_fname): raise IOError("# Error: file {} does not exist".format(options.in_names_fname)) with open(options.in_names_fname,'r') as inf: species = util.readTable(inf, header=True) def shorten(x):
import sys
import os
import math
import string
import biofile

if __name__ == '__main__':
    # Check that readFASTA finds 143 records in the test file and that
    # readFASTADict returns the same number of keys.
    fasta_records = biofile.readFASTA('test-biofile/test-biofile-001.fa')
    (hdrs, sequences) = fasta_records
    assert len(hdrs) == 143
    by_key = biofile.readFASTADict(
        os.path.expanduser('test-biofile/test-biofile-001.fa'))
    assert len(by_key.keys()) == len(hdrs)
optdict = vars(options) for (k,v) in sorted(optdict.items()): data_outs.write("#\t{k}: {v}\n".format(k=k, v=v)) def secondField(h): f = None try: f = biofile.secondField(h) except: f = biofile.firstField(h) return f prot_dict = {} # Read input if not os.path.isfile(options.in_fasta_fname): raise IOError("# Error: file {} does not exist".format(options.in_fasta_fname)) (headers, seqs) = biofile.readFASTA(file(options.in_fasta_fname, 'r')) all_keys = [biofile.firstField(h) for h in headers] if options.translate_sequences: seqs = [translate.translate(s) for s in seqs] zhs = [(h,s) for (h,s) in zip(headers,seqs) if not s is None] (headers, seqs) = zip(*zhs) prot_dict = dict([(biofile.firstField(h), s) for (h,s) in zhs]) gene_orf_dict = dict([(secondField(h), biofile.firstField(h)) for h in headers]) orf_gene_dict = dict([(o,g) for (g,o) in gene_orf_dict.items()]) query_keys = [] if not options.query_orf is []: # Specific ORF(s) query_keys += options.query_orf if not options.query_gene is []: # Specific gene(s)
# Set up some output: results go to options.out_fname when given,
# otherwise to stdout.
info_outs = util.OutStreams(sys.stdout)
outs = util.OutStreams()
params_outs = util.OutStreams([outs])
if options.out_fname is not None:
    outf = open(os.path.expanduser(options.out_fname), 'w')
    outs.addStream(outf)
else:
    outs.addStream(sys.stdout)

# Optional FASTA input: map first header field -> sequence, and second
# header field -> first header field.
orf_dict = None
gene_orf_map = None
if options.fasta_fname is not None:
    fname = os.path.expanduser(options.fasta_fname)
    (headers, sequences) = biofile.readFASTA(fname)
    orf_dict = dict(zip([biofile.firstField(h) for h in headers], sequences))
    gene_orf_map = dict([(biofile.secondField(h), biofile.firstField(h)) for h in headers])

# Set the weight matrix
try:
    matrix = motif.weight_matrices[options.pssm_name]
except KeyError:
    outs.write("# Unable to find weight matrix {}; try one of {}\n".format(options.pssm_name, ','.join(motif.weight_matrices.keys())))
    # Without a matrix nothing below can run; previously execution fell
    # through and failed with a NameError on `matrix`.
    sys.exit(1)

window_size = len(matrix['A']) #len(matrix.values()[0])
# for associating windows with residues, center them
mid_window = int(math.floor(window_size/2.0))

# Write out parameters
params_outs.write("# Run started {}\n".format(util.timestamp()))
assert set(frags) == set(["VR","VRK","KT","K","T"]) print "\ttest006 passed" if __name__ == '__main__': fname = sys.argv[1] if fname == "__test__": print "Running tests..." test001() test002() test003() test004() test005() test006() print "All tests passed" sys.exit() patterns = sys.argv[2].split("/") complete = True if os.path.isfile(os.path.expanduser(fname)): (headers, seqs) = biofile.readFASTA(os.path.expanduser(fname)) else: seqs = [fname] for seq in seqs: frags = digest(seq, patterns, complete) #print "\n%s\n---" % seq for f in frags: print len(f), f if complete: assert ''.join(frags) == seq
if not options.isolate_out_fname is None: isolate_outf = file(options.isolate_out_fname,'w') isolate_outs.addStream(isolate_outf) # Write out parameters params_outs.write("# Run started {}\n".format(util.timestamp())) params_outs.write("# Command: {}\n".format(' '.join(sys.argv))) params_outs.write("# Parameters:\n") optdict = vars(options) for (k,v) in optdict.items(): params_outs.write("#\t{k}: {v}\n".format(k=k, v=v)) # Read input if not os.path.isfile(options.in_fname): raise IOError("# Error: file {} does not exist".format(options.in_fname)) (headers, seqs) = biofile.readFASTA(file(options.in_fname, 'r')) #, key_fxn=biofile.secondField) if options.translate_sequences: seqs = [translate.translate(s) for s in seqs] zhs = [(h,s) for (h,s) in zip(headers,seqs) if not s is None] all_keys = [biofile.firstField(h) for (h,s) in zhs] (headers, seqs) = zip(*zhs) prot_dict = dict([(biofile.firstField(h), s) for (h,s) in zhs]) gene_orf_dict = dict([(biofile.secondOrFirstField(h), biofile.firstField(h)) for h in headers]) orf_gene_dict = dict([(v,k) for (k,v) in gene_orf_dict.items()]) # Select which genes to process query_keys = [] if not options.query_orf is []: # Specific ORF(s) query_keys += options.query_orf if not options.query_gene is []:
fasta_outs.addStream(sys.stdout)

# Write out parameters
data_outs.write("# Run started {}\n".format(util.timestamp()))
data_outs.write("# Command: {}\n".format(' '.join(sys.argv)))
data_outs.write("# Parameters:\n")
optdict = vars(options)
for (k, v) in optdict.items():
    data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))

# Read input
if not os.path.isfile(options.in_fname):
    raise IOError("# Error: file {} does not exist".format(options.in_fname))
with open(options.in_fname, 'r') as inf:
    # Read a FASTA file?
    (headers, seqs) = biofile.readFASTA(inf)
    info_outs.write("# Read {:d} sequences\n".format(len(seqs)))

new_headers = headers
new_seqs = seqs
for append_fname in options.append_fasta:
    if not os.path.isfile(append_fname):
        raise IOError("# Error: file {} does not exist".format(append_fname))
    with open(append_fname, 'r') as inf:
        # Read a FASTA file?
        (app_headers, app_seqs) = biofile.readFASTA(inf)
        info_outs.write("# Read {:d} sequences\n".format(len(app_seqs)))
    # Validate the inputs with explicit exceptions: assert statements are
    # removed when Python runs with -O, which would silently skip the checks.
    if len(app_seqs) != len(new_seqs):
        raise ValueError(
            "# Error: file {} has {:d} sequences, expected {:d}".format(
                append_fname, len(app_seqs), len(new_seqs)))
    if options.check_headers:
        for (h1, h2) in zip(headers, app_headers):
            if h1 != h2:
                raise ValueError("# Error: headers do not match:\n\t{}\n\t{}".format(h1, h2))
fasta_outs.addStream(sys.stdout)

# Write out parameters
data_outs.write("# Run started {}\n".format(util.timestamp()))
data_outs.write("# Command: {}\n".format(' '.join(sys.argv)))
data_outs.write("# Parameters:\n")
optdict = vars(options)
for (k, v) in optdict.items():
    data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))

# Read input
if not os.path.isfile(options.in_fname):
    raise IOError("# Error: file {} does not exist".format(options.in_fname))
with open(options.in_fname, 'r') as inf:
    # Read a FASTA file?
    (headers, seqs) = biofile.readFASTA(inf)
    info_outs.write("# Read {:d} sequences\n".format(len(seqs)))

# Find query sequence(s): indices of every header containing the query text.
query_ids = [xi for (xi, h) in enumerate(headers) if options.query in h]
if len(query_ids) == 0:
    info_outs.write("# Could not find sequences '{}'; exiting\n".format(options.query))
    sys.exit()
# Pick the first one
query_id = query_ids[0]
if len(query_ids) > 1:
    # Report the header that is actually used.  The original printed
    # headers[xi], i.e. the last header enumerated, not the first match.
    info_outs.write("# Found more than one sequence matching '{}'; using the first one: \n#\t{}\n".format(options.query, headers[query_id]))
parser.add_argument("in_fname", help="input filename")
parser.add_argument("-p", "--path", dest="muscle_path", default=const_default_muscle_exepath, help="path to Muscle binary")
parser.add_argument("-t", "--translate", dest="translate", action="store_true", default=False, help="translate the input sequences?")
parser.add_argument("-o", "--out", dest="out_fname", default=None, help="output filename")
options = parser.parse_args()

# Output goes to the requested file, or to stdout when none is given.
outs = util.OutStreams()
outf = None
if options.out_fname is not None:
    fname = os.path.expanduser(options.out_fname)
    outf = open(fname, 'w')
    outs.addStream(outf)
else:
    outs.addStream(sys.stdout)

try:
    # Expand "~" in the input path as is already done for the output path,
    # and close the input handle once the sequences are read.
    with open(os.path.expanduser(options.in_fname), 'r') as inf:
        (headers, seqs) = biofile.readFASTA(inf)
    seqs_to_align = seqs
    if options.translate:
        # Align the translations rather than the input sequences.
        seqs_to_align = [translate.translate(s) for s in seqs]
    alseqs = alignSequences(seqs_to_align, exepath=options.muscle_path)
    if options.translate:
        # Pass each original sequence with its aligned translation to
        # alignGeneFromProtein.
        alseqs = [alignGeneFromProtein(g, s) for (g, s) in zip(seqs, alseqs)]
    for (h, s) in zip(headers, alseqs):
        outs.write(">{}\n{}\n".format(h, s))
finally:
    # Close the output file even if reading or alignment failed.
    if outf is not None:
        outf.close()
data_outs.write("#\t{k}: {v}\n".format(k=k, v=v)) def secondField(h): f = None try: f = biofile.secondField(h) except: f = biofile.firstField(h) return f prot_dict = {} # Read input if not os.path.isfile(options.in_fasta_fname): raise IOError("# Error: file {} does not exist".format( options.in_fasta_fname)) (headers, seqs) = biofile.readFASTA(file(options.in_fasta_fname, 'r')) all_keys = [biofile.firstField(h) for h in headers] if options.translate_sequences: seqs = [translate.translate(s) for s in seqs] zhs = [(h, s) for (h, s) in zip(headers, seqs) if not s is None] (headers, seqs) = zip(*zhs) prot_dict = dict([(biofile.firstField(h), s) for (h, s) in zhs]) gene_orf_dict = dict([(secondField(h), biofile.firstField(h)) for h in headers]) orf_gene_dict = dict([(o, g) for (g, o) in gene_orf_dict.items()]) query_keys = [] if not options.query_orf is []: # Specific ORF(s) query_keys += options.query_orf if not options.query_gene is []: