set_lines = ["all = ["] filenames = from_dir(seq_dir, re.compile(r'.*\..*')) counter = 1 for filename in filenames: print filename, while True: if filename.find(".gbk") > 0: # process genbank try: record = load_genbank(seq_dir+filename) except IOError: print "failed to load Genbank file" break except Exception: print "failed to handle Genbank file" break else: print "...", seq_format = 'gbk' elif filename.find(".fas") > 0: # process fasta (for mfas, load first record) try: record = load_fasta(seq_dir+filename) except IOError:
except Exception: print "Error retrieving record" break else: if rec_id[0:2] == 'NZ': # disposition for WGS record sets print "fetching WGS dataset", # create a dedicated directory seqdir = data_dir + rec_id + "/" ensure_dir([seqdir]) # open genome record stub to get the contig count fname = data_dir + rec_id + ".gbk" try: stub = load_genbank(fname) except IOError: print "Error loading", fname break base_code = stub.annotations['wgs'][0][:10] # 7 if not NZ_ ctg_num = int(stub.annotations['wgs'][-1][10:]) # 7 records = [] # fetch contig records ctg_count = 0 while ctg_count < ctg_num: # TODO: better formatting ctg_count += 1 if ctg_count < 10:
# Prepare a nucleotide BLAST database for each genome file that from_dir
# returns for dir_in and the ".<file_ext>" pattern.
# NOTE(review): this fragment was recovered from a whitespace-collapsed
# source; the nesting below is reconstructed and the tail of the
# 'while True' body (including its exit on the success path) is not visible.
filenames = from_dir(dir_in, re.compile(r'.*\.'+file_ext))
for filename in filenames:
    # record name = file name up to the first ".<file_ext>"
    rec_name = filename[:filename.find("."+file_ext)]
    print rec_name,
    genome_path = dir_in+filename
    dbfile_path = "data/blast_db/"+rec_name
    # 'break' inside this loop abandons the current file after a failure
    while True:
        # only build when "<dbfile_path>.nhr" is absent
        # (presumably one of the files make_blastDB produces -- confirm)
        if not path.exists(dbfile_path+".nhr"):
            if file_ext == 'gbk':
                # GenBank input is first rewritten as "<rec_name>.fas" in
                # dir_in, and genome_path is repointed to that file
                try:
                    print "converting,",
                    record = load_genbank(genome_path)
                except IOError:
                    print "failed to load Genbank file"
                    break
                else:
                    try:
                        genome_path = dir_in+rec_name+".fas"
                        write_fasta(genome_path, record)
                    except Exception:
                        print "failed to write Fasta file"
                        break
            try:
                print "making a DB,",
                make_blastDB(dbfile_path, genome_path, 'nucl')
            except IOError:
                print "failed to make DB"
except Exception: print "Error retrieving record" break else: if rec_id[0:2] == 'NZ': # disposition for WGS record sets print "fetching WGS dataset", # create a dedicated directory seqdir = data_dir+rec_id+"/" ensure_dir([seqdir]) # open genome record stub to get the contig count fname = data_dir+rec_id+".gbk" try: stub = load_genbank(fname) except IOError: print "Error loading", fname break base_code = stub.annotations['wgs'][0][:10] # 7 if not NZ_ ctg_num = int(stub.annotations['wgs'][-1][10:]) # 7 records = [] # fetch contig records ctg_count = 0 while ctg_count < ctg_num: # TODO: better formatting ctg_count += 1 if ctg_count < 10:
# Prepare a nucleotide BLAST database for each genome file that from_dir
# returns for dir_in and the ".<file_ext>" pattern.
# NOTE(review): this fragment was recovered from a whitespace-collapsed
# source; the nesting below is reconstructed and the tail of the
# 'while True' body (including its exit on the success path) is not visible.
filenames = from_dir(dir_in, re.compile(r'.*\.' + file_ext))
for filename in filenames:
    # record name = file name up to the first ".<file_ext>"
    rec_name = filename[:filename.find("." + file_ext)]
    print rec_name,
    genome_path = dir_in + filename
    dbfile_path = "data/blast_db/" + rec_name
    # 'break' inside this loop abandons the current file after a failure
    while True:
        # only build when "<dbfile_path>.nhr" is absent
        # (presumably one of the files make_blastDB produces -- confirm)
        if not path.exists(dbfile_path + ".nhr"):
            if file_ext == 'gbk':
                # GenBank input is first rewritten as "<rec_name>.fas" in
                # dir_in, and genome_path is repointed to that file
                try:
                    print "converting,",
                    record = load_genbank(genome_path)
                except IOError:
                    print "failed to load Genbank file"
                    break
                else:
                    try:
                        genome_path = dir_in + rec_name + ".fas"
                        write_fasta(genome_path, record)
                    except Exception:
                        print "failed to write Fasta file"
                        break
            try:
                print "making a DB,",
                make_blastDB(dbfile_path, genome_path, 'nucl')
            except IOError:
                print "failed to make DB"
## script to combine several fasta files into a single one
##
## usage: <script> <data_subdir> <output_name> <fas|gbk>
##   argv[1]  sub-directory of data/ holding the input files
##   argv[2]  base name of the combined file, written as
##            data/<data_subdir>/<output_name>.fas
##   argv[3]  extension of the input files to combine ('fas' or 'gbk')

import re
import sys
from sys import argv
from libs.common import from_dir, load_fasta, load_genbank, write_fasta

# supported input extensions and the libs.common loader used for each
loaders = {'fas': load_fasta, 'gbk': load_genbank}

if len(argv) < 4:
    # fail with a usage line instead of a bare IndexError
    sys.exit("usage: " + argv[0] + " <data_subdir> <output_name> <fas|gbk>")

origin_dir = "data/" + argv[1]
destin_name = argv[2] + ".fas"
destin_file = origin_dir + "/" + destin_name
file_ext = argv[3]

if file_ext not in loaders:
    # an unknown extension used to load nothing and still write an
    # empty combined file; stop before touching the output instead
    sys.exit("unsupported file extension: " + file_ext)

filenames = from_dir(origin_dir, re.compile(r'.*\.' + file_ext))
load_record = loaders[file_ext]

records = []
for filename in filenames:
    if filename == destin_name:
        # the combined file of an earlier run sits in the same directory
        # and matches the 'fas' pattern; do not merge it into itself
        continue
    # load record
    records.append(load_record(origin_dir + "/" + filename))
    print(filename)

write_fasta(destin_file, records)
set_lines = ["all = ["] filenames = from_dir(seq_dir, re.compile(r'.*\..*')) counter = 1 for filename in filenames: print filename, while True: if filename.find(".gbk") > 0: # process genbank try: record = load_genbank(seq_dir + filename) except IOError: print "failed to load Genbank file" break except Exception: print "failed to handle Genbank file" break else: print "...", seq_format = 'gbk' elif filename.find(".fas") > 0: # process fasta (for mfas, load first record) try: record = load_fasta(seq_dir + filename) except IOError:
trn_file = origin_dir+"prodigal.trn" ensure_dir([annot_gbk_dir, annot_aa_dir]) filenames = from_dir(origin_dir, re.compile(r'.*\.'+file_ext+'.*')) for filename in filenames: rec_name = filename[:filename.find("."+file_ext)] print rec_name, "...", # load data if file_ext == 'fas': fas_file = origin_dir+"/"+filename gbk_file = fas2gbk(fas_file) record = load_genbank(gbk_file) else: gbk_file = origin_dir+"/"+filename fas_file = gbk2fas(gbk_file) record = load_genbank(gbk_file) # run prediction annot_aa = annot_aa_dir+rec_name+"_ann.fas" annot_gbk = annot_gbk_dir+rec_name+"_ann.gbk" if not path.exists(trn_file): train_prodigal(fas_file, trn_file, "-q") if not path.exists(annot_aa): run_prodigal(fas_file, annot_gbk, annot_aa, trn_file, "-q") # collect orfs record.features = []
## script to combine several fasta files into a single one
##
## usage: <script> <data_subdir> <output_name> <fas|gbk>
##   argv[1]  sub-directory of data/ holding the input files
##   argv[2]  base name of the combined file, written as
##            data/<data_subdir>/<output_name>.fas
##   argv[3]  extension of the input files to combine ('fas' or 'gbk')

import re
import sys
from sys import argv
from libs.common import from_dir, load_fasta, load_genbank, write_fasta

# supported input extensions and the libs.common loader used for each
loaders = {'fas': load_fasta, 'gbk': load_genbank}

if len(argv) < 4:
    # fail with a usage line instead of a bare IndexError
    sys.exit("usage: " + argv[0] + " <data_subdir> <output_name> <fas|gbk>")

origin_dir = "data/" + argv[1]
destin_name = argv[2] + ".fas"
destin_file = origin_dir + "/" + destin_name
file_ext = argv[3]

if file_ext not in loaders:
    # an unknown extension used to load nothing and still write an
    # empty combined file; stop before touching the output instead
    sys.exit("unsupported file extension: " + file_ext)

filenames = from_dir(origin_dir, re.compile(r'.*\.' + file_ext))
load_record = loaders[file_ext]

records = []
for filename in filenames:
    if filename == destin_name:
        # the combined file of an earlier run sits in the same directory
        # and matches the 'fas' pattern; do not merge it into itself
        continue
    # load record
    records.append(load_record(origin_dir + "/" + filename))
    print(filename)

write_fasta(destin_file, records)
feat_tag = argv[4] feat_name = argv[5] main_out = data_dir + feat_name + "_seqs.fas" records = [] ensure_dir([data_dir]) filenames = from_dir(dir_in, re.compile(r'.*\.gbk')) for filename in filenames: rec_name = filename[:filename.find(".gbk")] print '.', # load data record = load_genbank(dir_in + "/" + filename) # scan annotations for feat in record.features: if feat.type == feat_type: try: if feat_name in feat.qualifiers[feat_tag]: print '\nfound', feat_name, 'in', rec_name # extract sequence new_rec = feat.extract(record) new_rec.id = rec_name + '_' + feat_name new_rec.description = "Extracted from " + new_rec.description records.append(new_rec) except KeyError: pass
feat_tag = argv[4] feat_name = argv[5] main_out = data_dir+feat_name+"_seqs.fas" records = [] ensure_dir([data_dir]) filenames = from_dir(dir_in, re.compile(r'.*\.gbk')) for filename in filenames: rec_name = filename[:filename.find(".gbk")] print '.', # load data record = load_genbank(dir_in+"/"+filename) # scan annotations for feat in record.features: if feat.type == feat_type: try: if feat_name in feat.qualifiers[feat_tag]: print '\nfound', feat_name, 'in', rec_name # extract sequence new_rec = feat.extract(record) new_rec.id = rec_name+'_'+feat_name new_rec.description = "Extracted from "+new_rec.description records.append(new_rec) except KeyError: pass
for genome in genome_list: print genome['name'], origin_file = origin_dir+genome['file'] while True: if genome['input'] == 'cgbk': print "ignoring cgbk file" break elif genome['input'] == 'gbk': try: records = [load_genbank(origin_file)] except IOError: print "failed to load file" break elif genome['input'] == 'fas': try: records = [load_fasta(origin_file)] except IOError: print "failed to load file" break elif genome['input'] == 'mfas': try: records = load_multifasta(origin_file) except IOError:
for genome in genomes: g_vector = [] print genome['name'], while True: try: assert genome['input'] == 'gbk' except ValueError: print "bad format (skipping)" break # load genome file to extract features (to proteins in mfas file) record = load_genbank(seq_dir + genome['file']) select = [ feat for feat in record.features if feat.type == feat_type ] feat_cnt = 0 # cycle through selected features for feat in select: feat_cnt += 1 rec = feat.extract(record) rec.description = genome['name'] + '_' + feat_type + '_' + str( feat_cnt) # initialize or update blast DB if init_DB:
for genome in genomes: g_vector = [] print genome['name'], while True: try: assert genome['input'] == 'gbk' except ValueError: print "bad format (skipping)" break # load genome file to extract features (to proteins in mfas file) record = load_genbank(seq_dir+genome['file']) select = [feat for feat in record.features if feat.type == feat_type] feat_cnt = 0 # cycle through selected features for feat in select: feat_cnt +=1 rec = feat.extract(record) rec.description = genome['name']+'_'+feat_type+'_'+str(feat_cnt) # initialize or update blast DB if init_DB: ref_records = [value[0] for value in symbolDB.values()] write_fasta(db_file, ref_records)