print "failed to handle Genbank file" break else: print "...", seq_format = 'gbk' elif filename.find(".fas") > 0: # process fasta (for mfas, load first record) try: record = load_fasta(seq_dir+filename) except IOError: print "failed to load Fasta file as single-record file" break except Exception: try: record = load_multifasta(seq_dir+filename)[0] except IOError: print "failed to load Fasta file as multi-record file" break except Exception: print "failed to handle Fasta file" break print "...", seq_format = 'fas' else: # reject as bad format print "invalid file format" break if len(record) < int(min_size):
# script to translate sequences in multifasta files into proteins
from sys import argv
from libs.common import load_multifasta, write_fasta
from Bio.SeqRecord import SeqRecord

# Input lives at data/<argv[1]>/<argv[2]> (a multi-record fasta); the
# output is written alongside it with the 4-character extension
# (e.g. ".fas") replaced by "_aa.fas".
origin_dir = "data/" + argv[1] + "/"
in_file = origin_dir + argv[2]
outfile = in_file[:-4] + "_aa.fas"

# Translate every nucleotide record into its amino-acid counterpart,
# keeping the original record id.
proteins = [SeqRecord(id=nt_rec.id, seq=nt_rec.seq.translate())
            for nt_rec in load_multifasta(in_file)]
write_fasta(outfile, proteins)
data_dir = "data/"+argv[1]+"/" dir_in = data_dir+argv[2]+"/" infile = data_dir+argv[3] # must be a fasta file with query sequences file_ext = argv[4] blast_mode = argv[5] if len(argv) > 5: blast_mode = argv[5] else: blast_mode = 'n' # nucleotide blast by default blast_out = data_dir+"blast_out/" ensure_dir([blast_out]) queries = load_multifasta(infile) filenames = from_dir(dir_in, re.compile(r'.*\.'+file_ext)) for filename in filenames: rec_name = filename[:filename.find("."+file_ext)] print rec_name, genome_path = dir_in+filename dbfile_path = "data/blast_db/"+rec_name while True: if not path.exists(dbfile_path+".nhr"): if file_ext == 'gbk': try:
# script to rename contigs in multifasta files <<<<<<< HEAD from genomes import all as genomes from libs.common import load_multifasta, write_fasta for genome in genomes: print genome['file'] file_path = "data/genomes/"+genome['file'] outfile_path = "data/renamed/"+genome['file'] contigs = load_multifasta(file_path) renamed = [] counter = 1 for contig in contigs: contig.id = genome['name']+"_"+str(counter) contig_path = "data/contigs/"+contig.id+".fas" write_fasta(contig_path, contig) renamed.append(contig) counter +=1 write_fasta(outfile_path, renamed) ======= from sys import argv from libs.common import load_multifasta, write_fasta, ensure_dir from genomes import all as genome_list origin_dir = "data/"+argv[1]+"/" destin_dir = "data/"+argv[2]+"/" ensure_dir([destin_dir])
data_dir = "data/" + argv[1] + "/" dir_in = data_dir + argv[2] + "/" infile = data_dir + argv[3] # must be a fasta file with query sequences file_ext = argv[4] blast_mode = argv[5] if len(argv) > 5: blast_mode = argv[5] else: blast_mode = 'n' # nucleotide blast by default blast_out = data_dir + "blast_out/" ensure_dir([blast_out]) queries = load_multifasta(infile) filenames = from_dir(dir_in, re.compile(r'.*\.' + file_ext)) for filename in filenames: rec_name = filename[:filename.find("." + file_ext)] print rec_name, genome_path = dir_in + filename dbfile_path = "data/blast_db/" + rec_name while True: if not path.exists(dbfile_path + ".nhr"): if file_ext == 'gbk': try:
# Collect per-assembly contig statistics for every fasta file found in
# data/<argv[1]>.
# NOTE(review): this chunk is truncated mid-script -- n50s is only
# initialized here and is presumably filled and plotted (plt/np are
# imported) further down in the original file; confirm against the full
# source.
import re
from sys import argv
from libs.common import load_multifasta, from_dir
import matplotlib.pyplot as plt
import numpy as np

data_dir = "data/" + argv[1]
# every file whose name contains ".fas" (matches .fas, .fasta, .fas.gz, ...)
filenames = from_dir(data_dir, re.compile(r'.*\.fas.*'))
ctg_ns = []  # contig count per assembly, capped at 200
n50s = []  # filled later in the original script (not visible here)
for filename in filenames:
    # load contigs from file
    contig_list = load_multifasta(data_dir + "/" + filename)
    # count contigs
    ctg_count = len(contig_list)
    # cap the recorded count at 200 -- presumably so outliers do not
    # stretch a plot axis; TODO confirm against the plotting code below
    if ctg_count < 200:
        ctg_ns.append(ctg_count)
    else:
        ctg_ns.append(200)
    # sort contig list by size, largest first (sort ascending, then reverse)
    contig_list.sort(key=len)
    contig_list.reverse()
    # count full sequence length (sum of all contig lengths)
    full_seq_length = 0
    for contig in contig_list:
        full_seq_length += len(contig.seq)
print "failed to handle Genbank file" break else: print "...", seq_format = 'gbk' elif filename.find(".fas") > 0: # process fasta (for mfas, load first record) try: record = load_fasta(seq_dir + filename) except IOError: print "failed to load Fasta file as single-record file" break except Exception: try: record = load_multifasta(seq_dir + filename)[0] except IOError: print "failed to load Fasta file as multi-record file" break except Exception: print "failed to handle Fasta file" break print "...", seq_format = 'fas' else: # reject as bad format print "invalid file format" break if len(record) < int(min_size):
# script to translate sequences in multifasta files into proteins
from sys import argv
from libs.common import load_multifasta, write_fasta
from Bio.SeqRecord import SeqRecord

# data/<argv[1]>/<argv[2]> is the input multifasta; the output sits next
# to it with the last four characters (the extension) replaced by "_aa.fas".
origin_dir = "data/" + argv[1] + "/"
in_file = origin_dir + argv[2]
outfile = in_file[:-4] + "_aa.fas"

translated = []
for dna_record in load_multifasta(in_file):
    # translate the nucleotide sequence, preserving the record id
    translated.append(
        SeqRecord(id=dna_record.id, seq=dna_record.seq.translate()))
write_fasta(outfile, translated)
else: gbk_file = origin_dir+"/"+filename fas_file = gbk2fas(gbk_file) record = load_genbank(gbk_file) # run prediction annot_aa = annot_aa_dir+rec_name+"_ann.fas" annot_gbk = annot_gbk_dir+rec_name+"_ann.gbk" if not path.exists(trn_file): train_prodigal(fas_file, trn_file, "-q") if not path.exists(annot_aa): run_prodigal(fas_file, annot_gbk, annot_aa, trn_file, "-q") # collect orfs record.features = [] aa_record = load_multifasta(annot_aa) counter = 1 for aa_rec in aa_record: this_prot = rec_name+"_"+str(counter) # get feature details from description line # because prodigal output fails to load as valid genbank defline = aa_rec.description pattern = re.compile('.+#\s(\d+)\s#\s(\d+)\s#\s(\S*1)\s#\sID.+') match = pattern.match(defline) start_pos = int(match.group(1)) end_pos = int(match.group(2)) strand_pos = int(match.group(3)) feat_loc = FeatureLocation(start_pos, end_pos) l_tag = rec_name+"_"+str(counter) # consolidation feature annotations quals = {'note': defline, 'locus_tag': l_tag,
try: records = [load_genbank(origin_file)] except IOError: print "failed to load file" break elif genome['input'] == 'fas': try: records = [load_fasta(origin_file)] except IOError: print "failed to load file" break elif genome['input'] == 'mfas': try: records = load_multifasta(origin_file) except IOError: print "failed to load file" break else: print "input not recognized" break for record in records: try: write_fasta(destin_dir+record.id+".fas", record) except Exception: print "failed to write contig file" break else:
# Collect per-assembly contig statistics for every fasta file found in
# data/<argv[1]>.
# NOTE(review): this chunk is truncated mid-script -- n50s is only
# initialized here and is presumably filled and plotted (plt/np are
# imported) further down in the original file; confirm against the full
# source.
import re
from sys import argv
from libs.common import load_multifasta, from_dir
import matplotlib.pyplot as plt
import numpy as np

data_dir = "data/"+argv[1]
# every file whose name contains ".fas" (matches .fas, .fasta, .fas.gz, ...)
filenames = from_dir(data_dir, re.compile(r'.*\.fas.*'))
ctg_ns = []  # contig count per assembly, capped at 200
n50s = []  # filled later in the original script (not visible here)
for filename in filenames:
    # load contigs from file
    contig_list = load_multifasta(data_dir+"/"+filename)
    # count contigs
    ctg_count = len(contig_list)
    # cap the recorded count at 200 -- presumably so outliers do not
    # stretch a plot axis; TODO confirm against the plotting code below
    if ctg_count < 200:
        ctg_ns.append(ctg_count)
    else:
        ctg_ns.append(200)
    # sort contig list by size, largest first (sort ascending, then reverse)
    contig_list.sort(key=len)
    contig_list.reverse()
    # count full sequence length (sum of all contig lengths)
    full_seq_length = 0
    for contig in contig_list:
        full_seq_length += len(contig.seq)
record = load_genbank(gbk_file) assert record.id # run prediction annot_aa = annot_aa_dir+rec_name+"_ann.fas" annot_gbk = annot_gbk_dir+rec_name+"_ann.gbk" if not path.exists(trn_file): train_prodigal(fas_file, trn_file, "-q") if not path.exists(annot_aa): run_prodigal(fas_file, annot_gbk, annot_aa, trn_file, "-q") # blast the protein sequences against the remote DB record.features = [] evalue = 0.01 proteins = load_multifasta(annot_aa) for protein in proteins: print " ", protein.id rec_hits_dir = hits_dir+rec_name+"/" ensure_dir([rec_hits_dir]) hits_out = open(rec_hits_dir+protein.id+".txt", 'w') hits_out.write(" ".join([protein.id, "vs.", remote_prot_db, "@evalue =", str(evalue), "\n"])) temp_out = remote_blastp_2file(protein.seq, remote_prot_db, blast_dir+rec_name+"_temp.xml", evalue) #temp_out = blast_dir+rec_name+"_temp.xml" # collect best 10 hits rec_hits = collect_topNhits(temp_out, 10) for hit in rec_hits: if hasattr(hit, 'hsps'):
record = load_genbank(gbk_file) assert record.id # run prediction annot_aa = annot_aa_dir + rec_name + "_ann.fas" annot_gbk = annot_gbk_dir + rec_name + "_ann.gbk" if not path.exists(trn_file): train_prodigal(fas_file, trn_file, "-q") if not path.exists(annot_aa): run_prodigal(fas_file, annot_gbk, annot_aa, trn_file, "-q") # blast the protein sequences against the remote DB record.features = [] evalue = 0.01 proteins = load_multifasta(annot_aa) for protein in proteins: print " ", protein.id rec_hits_dir = hits_dir + rec_name + "/" ensure_dir([rec_hits_dir]) hits_out = open(rec_hits_dir + protein.id + ".txt", 'w') hits_out.write(" ".join([ protein.id, "vs.", remote_prot_db, "@evalue =", str(evalue), "\n" ])) temp_out = remote_blastp_2file(protein.seq, remote_prot_db, blast_dir + rec_name + "_temp.xml", evalue) #temp_out = blast_dir+rec_name+"_temp.xml" # collect best 10 hits rec_hits = collect_topNhits(temp_out, 10)