# NOTE(review): whitespace-mangled Python 2 fragment -- the original indentation
# (and so the exact nesting of the while/try/except/else/break chain) was lost
# when this chunk was flattened onto one line, and it ends mid-statement at
# "if blast_mode == 'n':", so the code is left byte-identical below.
# Purpose (from visible tokens): per-genome loop body that converts a GenBank
# record to FASTA if needed, builds a nucleotide BLAST DB ("<db>.nhr" is the
# makeblastdb output checked for), then prepares a BLAST run against it.
# Also note: everything after the first '#' on the flattened line ("# blastx
# against each genome DB ...") is now commented out -- original code was lost.
genome_path = dir_in+filename dbfile_path = "data/blast_db/"+rec_name while True: if not path.exists(dbfile_path+".nhr"): if file_ext == 'gbk': try: print "converting,", record = load_genbank(genome_path) except IOError: print "failed to load Genbank file" break else: try: genome_path = dir_in+rec_name+".fas" write_fasta(genome_path, record) except Exception: print "failed to write Fasta file" break try: print "making a DB,", make_blastDB(dbfile_path, genome_path, 'nucl') except IOError: print "failed to make DB" break try: # blastx against each genome DB outfile = blast_out+rec_name+".txt" prefs = {'evalue': 0.001, 'outfmt_pref': 6} print "blasting,", if blast_mode == 'n':
# NOTE(review): duplicate of the fragment above (same logic, spaces around
# operators) -- presumably two revisions of the same file concatenated here.
# Same caveats: indentation lost, ends mid-statement, tail commented out by
# the mid-line '#'. Kept byte-identical; consolidate the duplicates upstream.
genome_path = dir_in + filename dbfile_path = "data/blast_db/" + rec_name while True: if not path.exists(dbfile_path + ".nhr"): if file_ext == 'gbk': try: print "converting,", record = load_genbank(genome_path) except IOError: print "failed to load Genbank file" break else: try: genome_path = dir_in + rec_name + ".fas" write_fasta(genome_path, record) except Exception: print "failed to write Fasta file" break try: print "making a DB,", make_blastDB(dbfile_path, genome_path, 'nucl') except IOError: print "failed to make DB" break try: # blastx against each genome DB outfile = blast_out + rec_name + ".txt" prefs = {'evalue': 0.001, 'outfmt_pref': 6} print "blasting,", if blast_mode == 'n':
# NOTE(review): this whole chunk became a single comment line when flattened
# (it starts with '#'), so none of it executes as-is. It is a Python 2
# fragment of a "BatchFetcher" contig downloader: it zero-pads contig ids to
# 5 digits (this if/elif ladder could be str(n).zfill(5) once restored),
# fetches each via EFetcher, loads the GenBank result, then writes a combined
# FASTA. The trailing 'break' implies an enclosing per-record loop that is
# not visible here, so the structure cannot be safely reconstructed.
# fetch contig records ctg_count = 0 while ctg_count < ctg_num: # TODO: better formatting ctg_count += 1 if ctg_count < 10: ctg_id = base_code + '0000' + str(ctg_count) elif ctg_count < 100: ctg_id = base_code + '000' + str(ctg_count) elif ctg_count < 1000: ctg_id = base_code + '00' + str(ctg_count) else: # shouldn't happen but hey... ctg_id = base_code + '0' + str(ctg_count) # fetch contig record try: fname = EFetcher(ctg_id[3:], seqdir) # 3 if not NZ_ except Exception: print "Error retrieving record" else: try: records.append(load_genbank(fname)) except Exception: print "Error loading record" write_fasta(data_dir + rec_id + ".fas", records) print "OK" break # confirm complete stop print "BatchFetcher has downloaded " + str(counter) + " records to file."
# Translate every nucleotide record of a multifasta file into protein.
#
# Usage: <script> <subdir-of-data> <multifasta-filename>
# Output: written next to the input, with the 4-char extension replaced
# by "_aa.fas".
from sys import argv
from libs.common import load_multifasta, write_fasta
from Bio.SeqRecord import SeqRecord

src_dir = "data/"+argv[1]+"/"
nt_path = src_dir+argv[2]
aa_path = nt_path[:-4]+"_aa.fas"  # assumes a 4-character extension (e.g. ".fas")
# keep each record's id; translate its sequence to amino acids
proteins = [SeqRecord(id=nt_rec.id, seq=nt_rec.seq.translate())
            for nt_rec in load_multifasta(nt_path)]
write_fasta(aa_path, proteins)
# NOTE(review): duplicate of the BatchFetcher fragment above (operator-spacing
# revision only). Also fully commented out by the flattening (line starts with
# '#') and dependent on an enclosing loop that is not visible ('break'), so it
# is kept byte-identical. The two copies should be deduplicated upstream.
# fetch contig records ctg_count = 0 while ctg_count < ctg_num: # TODO: better formatting ctg_count += 1 if ctg_count < 10: ctg_id = base_code+'0000'+str(ctg_count) elif ctg_count < 100: ctg_id = base_code+'000'+str(ctg_count) elif ctg_count < 1000: ctg_id = base_code+'00'+str(ctg_count) else: # shouldn't happen but hey... ctg_id = base_code+'0'+str(ctg_count) # fetch contig record try: fname = EFetcher(ctg_id[3:], seqdir) # 3 if not NZ_ except Exception: print "Error retrieving record" else: try: records.append(load_genbank(fname)) except Exception: print "Error loading record" write_fasta(data_dir+rec_id+".fas", records) print "OK" break # confirm complete stop print "BatchFetcher has downloaded " + str(counter) + " records to file."
# NOTE(review): Python 2 fragment, flattened to one line and truncated at the
# start (the 'if' matching the visible 'else:' is missing -- compare the
# fuller duplicate later in this file, which begins 'outfile = ...; if not
# path.exists(outfile):'). It also references per-hit loop variables ('line',
# 'query', 'subject', 'counter') whose loop header is not visible, so the
# nesting cannot be reconstructed; kept byte-identical.
# Visible intent: append a tab-separated BLAST hit line to a per-query results
# file, slice the hit region out of the subject contig (line[8]/line[9] are
# 1-based coords; order reversed means minus strand -> reverse-complement),
# and collect the sub-records per query, then write one "<query>_nt.fas" each.
# create file out_handle = open(outfile, 'w') else: counter +=1 out_handle = open(outfile, 'a') out_handle.write("\t".join([str(item) for item in line])+"\n") # extract sequence to array rev_flag = False if line[8] < line[9]: q_start, q_stop = line[8]-1, line[9] rev_flag = False else: q_start, q_stop = line[9]-1, line[8] rev_flag = True master_seq = load_fasta("data/contigs_fas/"+subject+".fas") seq_bit = master_seq[q_start:q_stop] if rev_flag: seq_bit = seq_bit.reverse_complement() record = SeqRecord(id=subject+"_"+str(counter), seq=seq_bit.seq) if query not in records_dict.keys(): records_dict[query] = [record] else: records_dict[query].append(record) # write out sequences for query in records_dict.keys(): seqfile_nt = data_dir+query+"_nt.fas" write_fasta(seqfile_nt, records_dict[query])
# NOTE(review): this whole script became one comment line when flattened (it
# starts with '##'), so none of it executes as-is. It stitches numbered contig
# FASTA files together in the hard-coded 'order' list (tuple = (contig number,
# reverse-complement flag)) and writes the result under the name argv[2].
# It cannot be safely re-indented from this text: whether the 100-N spacer
# ('record += "NNN..."') sat inside the for-loop (spacer after every contig)
# or after it (single trailing spacer) changes the output -- TODO confirm
# against the original file before restoring. Kept byte-identical.
## script to combine several fasta sequences into a single one in a specific order from sys import argv from libs.common import load_fasta, write_fasta origin_dir = "data/"+argv[1]+"/" destin_file = origin_dir+argv[2]+".fas" base_name = argv[3] # adapt this part order = [(22, 0), (4, 0), (57, 1), (43, 1), (64, 0), (18, 0), (54, 0), (36, 1), (20, 1), (2, 1), (40, 1), (17, 1), (35, 1), (38, 1), (37, 1), (55, 1), (19, 1), (47, 1), (11, 0), (46, 0), (61, 0), (41, 1), (15, 0), (1, 1), (5, 1), (6, 0), (13, 1), (8, 0), (23, 0), (16, 1), (10, 0), (60, 0), (14, 0), (42, 0), (39, 0), (48, 0), (9, 1), (21, 0), (3, 1), (58, 1), (32, 0)] filename = origin_dir+base_name+str(order[0][0])+".fas" record = load_fasta(filename) if order[0][1]: record = record.reverse_complement() for index in order[1:]: filename = origin_dir+base_name+str(index[0])+".fas" new_rec = load_fasta(filename) if index[1]: new_rec = new_rec.reverse_complement() record += new_rec record += "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" record.id = argv[2] write_fasta(destin_file, record)
# NOTE(review): fuller duplicate of the blast-hit-extraction fragment above
# (operator-spacing revision; this copy keeps the leading 'outfile = ...' and
# the 'if not path.exists(outfile):' that the other copy lost). Still a
# whitespace-mangled Python 2 fragment whose enclosing per-hit loop (defining
# 'line', 'query', 'subject', 'counter', 'records_dict') is not visible, so
# the nesting cannot be reconstructed; kept byte-identical.
# Visible intent: log each BLAST hit row to "<query>_results.txt", cut the hit
# span out of the subject contig (reverse-complementing minus-strand hits),
# group the cut records by query, and finally write one "<query>_nt.fas" per
# query from the accumulated dictionary.
outfile = data_dir + query + "_results.txt" if not path.exists(outfile): # create file out_handle = open(outfile, 'w') else: counter += 1 out_handle = open(outfile, 'a') out_handle.write("\t".join([str(item) for item in line]) + "\n") # extract sequence to array rev_flag = False if line[8] < line[9]: q_start, q_stop = line[8] - 1, line[9] rev_flag = False else: q_start, q_stop = line[9] - 1, line[8] rev_flag = True master_seq = load_fasta("data/contigs_fas/" + subject + ".fas") seq_bit = master_seq[q_start:q_stop] if rev_flag: seq_bit = seq_bit.reverse_complement() record = SeqRecord(id=subject + "_" + str(counter), seq=seq_bit.seq) if query not in records_dict.keys(): records_dict[query] = [record] else: records_dict[query].append(record) # write out sequences for query in records_dict.keys(): seqfile_nt = data_dir + query + "_nt.fas" write_fasta(seqfile_nt, records_dict[query])
# NOTE(review): whitespace-mangled Python 2 fragment; the loop that defines
# 'line', 'subject', 'query', 'descript', 'capture_span', 'records' is not
# visible, and the final write_fasta(main_out, records) presumably sits after
# that invisible loop, so mixed nesting depths prevent safe re-indentation;
# kept byte-identical.
# Visible intent: like the hit-extraction fragment above, but widens the hit
# span by 'capture_span' on both sides (clamped to [0, len(contig)]) to
# capture genomic context, writes one "<subject>_<query>_ctxt.fas" per hit,
# and accumulates all context records for a combined 'main_out' multifasta.
rev_flag = False if line[8] < line[9]: q_start, q_stop = line[8] - 1, line[9] rev_flag = False else: q_start, q_stop = line[9] - 1, line[8] rev_flag = True c_start, c_stop = q_start - capture_span, q_stop + capture_span master_seq = load_fasta("data/contigs_fas/" + subject + ".fas") if c_start < 0: c_start = 0 if c_stop > len(master_seq.seq): c_stop = len(master_seq.seq) seq_bit = master_seq[c_start:c_stop] if rev_flag: seq_bit = seq_bit.reverse_complement() record = SeqRecord(id=subject, seq=seq_bit.seq, description=descript) records.append(record) rec_file = ctx_dir + subject + "_" + query + "_ctxt.fas" write_fasta(rec_file, record) write_fasta(main_out, records) ### TODO: modify bb tools to accept multifasta as sole input
# NOTE(review): duplicate of the context-capture fragment above (operator
# spacing removed -- presumably an earlier revision of the same file). Same
# caveats: enclosing loop not visible, mixed nesting depths, kept
# byte-identical. Deduplicate the two copies upstream.
rev_flag = False if line[8] < line[9]: q_start, q_stop = line[8]-1, line[9] rev_flag = False else: q_start, q_stop = line[9]-1, line[8] rev_flag = True c_start, c_stop = q_start-capture_span, q_stop+capture_span master_seq = load_fasta("data/contigs_fas/"+subject+".fas") if c_start < 0: c_start = 0 if c_stop > len(master_seq.seq): c_stop = len(master_seq.seq) seq_bit = master_seq[c_start:c_stop] if rev_flag: seq_bit = seq_bit.reverse_complement() record = SeqRecord(id=subject, seq=seq_bit.seq, description=descript) records.append(record) rec_file = ctx_dir+subject+"_"+query+"_ctxt.fas" write_fasta(rec_file, record) write_fasta(main_out, records) ### TODO: modify bb tools to accept multifasta as sole input
# NOTE(review): whitespace-mangled Python 2 fragment (script tail -- the
# definitions of data_dir/dir_in/feat_type/feat_name/feat_tag/main_out are
# outside this view). Re-indentation is not safe: the mid-line '#' comments
# ('# load data', '# scan annotations') commented out the rest of the line,
# and whether "print ''" sat inside or after the filename loop is ambiguous.
# Kept byte-identical.
# Visible intent: for every .gbk file in dir_in, scan features of type
# feat_type whose feat_tag qualifier contains feat_name (KeyError for a
# missing qualifier is deliberately swallowed), extract each match as
# "<file-stem>_<feat_name>", and write all matches to main_out as multifasta;
# "print '.','" is a progress indicator.
records = [] ensure_dir([data_dir]) filenames = from_dir(dir_in, re.compile(r'.*\.gbk')) for filename in filenames: rec_name = filename[:filename.find(".gbk")] print '.', # load data record = load_genbank(dir_in + "/" + filename) # scan annotations for feat in record.features: if feat.type == feat_type: try: if feat_name in feat.qualifiers[feat_tag]: print '\nfound', feat_name, 'in', rec_name # extract sequence new_rec = feat.extract(record) new_rec.id = rec_name + '_' + feat_name new_rec.description = "Extracted from " + new_rec.description records.append(new_rec) except KeyError: pass print '' write_fasta(main_out, records)
# NOTE(review): duplicate of the feature-extraction fragment above (operator
# spacing removed -- presumably an earlier revision of the same file). Same
# caveats: indentation lost, tails commented out by mid-line '#', kept
# byte-identical. Deduplicate the two copies upstream.
records = [] ensure_dir([data_dir]) filenames = from_dir(dir_in, re.compile(r'.*\.gbk')) for filename in filenames: rec_name = filename[:filename.find(".gbk")] print '.', # load data record = load_genbank(dir_in+"/"+filename) # scan annotations for feat in record.features: if feat.type == feat_type: try: if feat_name in feat.qualifiers[feat_tag]: print '\nfound', feat_name, 'in', rec_name # extract sequence new_rec = feat.extract(record) new_rec.id = rec_name+'_'+feat_name new_rec.description = "Extracted from "+new_rec.description records.append(new_rec) except KeyError: pass print '' write_fasta(main_out, records)
# NOTE(review): Python 2 fragment truncated at the start -- it opens with a
# dangling 'try:' and a visible 'elif genome['input'] == 'mfas':', so the
# 'if' branch (presumably genome['input'] == 'fas') is outside this view, as
# is the enclosing loop implied by the repeated 'break's. Cannot be safely
# re-indented; kept byte-identical.
# Visible intent: load a genome either as a single FASTA or a multifasta
# depending on genome['input'], then write each record out as its own
# "<record.id>.fas" contig file in destin_dir, reporting failures per step.
try: records = [load_fasta(origin_file)] except IOError: print "failed to load file" break elif genome['input'] == 'mfas': try: records = load_multifasta(origin_file) except IOError: print "failed to load file" break else: print "input not recognized" break for record in records: try: write_fasta(destin_dir+record.id+".fas", record) except Exception: print "failed to write contig file" break else: print record.id, print "OK" break
# NOTE(review): Python 2 fragment truncated at the start -- it begins in the
# middle of a list comprehension ('select = [' is outside this view; compare
# the fuller duplicate later in this file). References to genome, init_DB,
# new_DB, symbolDB, sym_cnt, g_vector come from invisible context, and the
# chunk ends mid-loop, so it cannot be re-indented; kept byte-identical.
# Visible intent: walk features of type feat_type, lazily (re)build a
# nucleotide BLAST DB from one reference record per known symbol, and on the
# first pass assign each feature a fresh 'N<n>' symbol recorded in symbolDB
# and in the per-genome symbol vector g_vector.
feat for feat in record.features if feat.type == feat_type ] feat_cnt = 0 # cycle through selected features for feat in select: feat_cnt += 1 rec = feat.extract(record) rec.description = genome['name'] + '_' + feat_type + '_' + str( feat_cnt) # initialize or update blast DB if init_DB: ref_records = [value[0] for value in symbolDB.values()] write_fasta(db_file, ref_records) try: make_blastDB(db_path, db_file, 'nucl') except Exception: print "failed to make blast DB" exit() init_DB = False # first go: add all features as new symbols if new_DB: sym_cnt += 1 symbol = 'N' + str(sym_cnt) rec.id = symbol symbolDB[symbol] = [rec] g_vector.append(symbol)
# NOTE(review): DEFECT -- this chunk contains an unresolved git merge conflict
# ('<<<<<<< HEAD' and '=======' are visible; the closing '>>>>>>>' and the
# rest of the second branch are outside this view), so it cannot be resolved
# here. Both branches rename contigs: HEAD iterates the hard-coded genome
# list from 'genomes.all', renames each contig to "<name>_<n>", writes one
# file per contig under data/contigs/ plus a combined renamed multifasta; the
# incoming branch parameterizes the directories via argv and is truncated at
# 'for genome in genome_list:'. Resolve the conflict against the repository
# before this file can run; kept byte-identical.
<<<<<<< HEAD from genomes import all as genomes from libs.common import load_multifasta, write_fasta for genome in genomes: print genome['file'] file_path = "data/genomes/"+genome['file'] outfile_path = "data/renamed/"+genome['file'] contigs = load_multifasta(file_path) renamed = [] counter = 1 for contig in contigs: contig.id = genome['name']+"_"+str(counter) contig_path = "data/contigs/"+contig.id+".fas" write_fasta(contig_path, contig) renamed.append(contig) counter +=1 write_fasta(outfile_path, renamed) ======= from sys import argv from libs.common import load_multifasta, write_fasta, ensure_dir from genomes import all as genome_list origin_dir = "data/"+argv[1]+"/" destin_dir = "data/"+argv[2]+"/" ensure_dir([destin_dir]) for genome in genome_list:
## Combine every FASTA/GenBank file in a directory into one multifasta.
##
## Usage: <script> <subdir-of-data> <output-basename> <extension: fas|gbk>
import re
from sys import argv
from libs.common import from_dir, load_fasta, load_genbank, write_fasta

src_dir = "data/" + argv[1]
out_path = src_dir + "/" + argv[2] + ".fas"
ext = argv[3]
# choose the loader once, outside the loop; any other extension value means
# nothing is loaded (the filename is still echoed), as in the original
# if/elif chain
loaders = {'fas': load_fasta, 'gbk': load_genbank}
combined = []
for fname in from_dir(src_dir, re.compile(r'.*\.' + ext)):
    if ext in loaders:
        combined.append(loaders[ext](src_dir + "/" + fname))
    print(fname)
write_fasta(out_path, combined)
# script to translate sequences in multifasta files into proteins
#
# Usage: <script> <subdir-of-data> <multifasta-filename>
# Writes the translations next to the input, with the 4-character
# extension (e.g. ".fas") replaced by "_aa.fas".
from sys import argv
from libs.common import load_multifasta, write_fasta
from Bio.SeqRecord import SeqRecord

origin_dir = "data/" + argv[1] + "/"
in_file = origin_dir + argv[2]
outfile = in_file[:-4] + "_aa.fas"  # assumes a 4-char extension -- TODO confirm
proteins = []
for record in load_multifasta(in_file):
    # FIX: pass description="" -- without it Biopython's SeqRecord defaults
    # the description to "<unknown description>", which then gets written
    # into every FASTA header of the output file.
    aa_rec = SeqRecord(id=record.id,
                       description="",
                       seq=record.seq.translate())
    proteins.append(aa_rec)
write_fasta(outfile, proteins)
# NOTE(review): fuller duplicate of the symbol-DB fragment above (this copy
# keeps the leading 'record = load_genbank(...)' and the complete 'select'
# comprehension). Still a whitespace-mangled Python 2 fragment: genome,
# seq_dir, feat_type, init_DB, new_DB, symbolDB, sym_cnt and g_vector are
# defined outside this view and the chunk ends mid-loop, so it cannot be
# re-indented; kept byte-identical. Deduplicate the two copies upstream.
# Visible intent: load one genome's GenBank record, select its feat_type
# features, lazily (re)build a nucleotide BLAST DB from one reference record
# per known symbol, and on a first pass register every feature under a fresh
# 'N<n>' symbol in symbolDB and the per-genome vector g_vector.
record = load_genbank(seq_dir+genome['file']) select = [feat for feat in record.features if feat.type == feat_type] feat_cnt = 0 # cycle through selected features for feat in select: feat_cnt +=1 rec = feat.extract(record) rec.description = genome['name']+'_'+feat_type+'_'+str(feat_cnt) # initialize or update blast DB if init_DB: ref_records = [value[0] for value in symbolDB.values()] write_fasta(db_file, ref_records) try: make_blastDB(db_path, db_file, 'nucl') except Exception: print "failed to make blast DB" exit() init_DB = False # first go: add all features as new symbols if new_DB: sym_cnt +=1 symbol = 'N'+str(sym_cnt) rec.id = symbol symbolDB[symbol] = [rec] g_vector.append(symbol)