def annot_ref(ref_name, ctg_fas, prot_db_name, fixed_dirs, project_id,
              blast_prefs):
    """Return the annotated Genbank record of a reference contig.

    When the Genbank file of the reference already exists it is simply
    loaded.  Otherwise the contig is annotated through annot_ctg(), the
    resulting record is labelled, written to that file and returned.
    """
    # protein (COG) database used for the functional assignment
    prot_db = fixed_dirs['ref_dbs_dir']+prot_db_name
    # working directories (one sub-directory per reference, except for
    # the training files which share a single directory)
    ref_sub = ref_name+"/"
    gbk_dir = fixed_dirs['gbk_contigs_dir']+ref_sub
    cds_dir = fixed_dirs['ctg_cds_dir']+ref_sub
    prot_dir = fixed_dirs['ctg_prot_dir']+ref_sub
    blast_dir = fixed_dirs['ctg_blast_dir']+ref_sub
    trn_dir = fixed_dirs['annot_trn_dir']
    ensure_dir([gbk_dir, cds_dir, prot_dir, blast_dir, trn_dir])
    # file names
    contig_tag = ref_name+"_1"
    trn_file = trn_dir+ref_name+"_annot.trn"
    g_ctg_gbk = gbk_dir+contig_tag+".gbk"
    annot_gbk = cds_dir+contig_tag+"_cds.gbk"
    annot_aa = prot_dir+contig_tag+"_aa.fas"
    blast_out = blast_dir+contig_tag+".xml"
    # an empty BLAST output file is removed before going any further
    if path.exists(blast_out):
        if os.stat(blast_out).st_size == 0:
            os.remove(blast_out)
    # nothing to compute if the annotated contig is already on disk
    if path.exists(g_ctg_gbk):
        return load_genbank(g_ctg_gbk)
    record = annot_ctg(ctg_fas, ctg_fas, annot_gbk, annot_aa, trn_file,
                       prot_db, blast_out, contig_tag, blast_prefs)
    record.description = ref_name+"_re-annotated"
    record.name = contig_tag
    record.dbxrefs = ["".join(["Project: ", project_id, "/", ref_name,
                               "-like backbones"])]
    record.seq.alphabet = generic_dna
    write_genbank(g_ctg_gbk, record)
    return record
def fas2gbk(fas_file):
    """Convert a FastA file to Genbank format.

    The record is read with load_fasta(), its sequence alphabet is set to
    generic_dna and it is written with write_genbank() under the same
    path as the input, with the extension replaced by '.gbk'.

    fas_file -- path of the FastA file to convert

    Returns the path of the Genbank file that was written.
    """
    import os  # local import so the block does not rely on file-level names
    record = load_fasta(fas_file)
    cut = fas_file.find('.fas')
    if cut != -1:
        # usual case ('.fas', '.fasta'): drop everything from the marker on
        stem = fas_file[:cut]
    else:
        # find() returned -1: slicing with it would silently chop the last
        # character off the path, so strip whatever extension is present
        stem = os.path.splitext(fas_file)[0]
    gbk_file = stem+'.gbk'
    record.seq.alphabet = generic_dna
    write_genbank(gbk_file, record)
    return gbk_file
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator, genomes, run_id, timestamp, mtype, mode): """Build a scaffold of contigs based on the reference. This takes contigs that gave positive hits when blasted with reference segments. The contigs were aligned against the complete reference in a previous step for mapping purposes. Now the output of that step is re-used determine their position. A caveat is that if there are natural local rearrangements in the sequence relative to the reference, they may not be resolved appropriately. The problem is somewhat moderated by the fact that this function takes the best (usually the largest) hit region as "anchor" to position the contig within the scaffold. But if the rearranged region takes up a significant portion of the contig length, the anchoring will probably not be called correctly. Visual inspection of the finalized maps should help diagnose any such problems. The order can be fixed manually using the Mauve Contig Mover, which is part of Mauve 2. Note that not all hit contigs are "real" hits, so filtering should be applied before scaffolding to generate constructs. Model-based filtering produces a list of contigs that will be passed to the scaffolder. If filtering manually by looking at the maps, there are two options available: either select exclusively OR exclude a subset of contigs for the scaffolding process. This is done by listing their ID number in the genome dictionaries in the config file then resuming the pipeline from this step. 
""" # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir + run_id + "/" ctgs_root = run_root + run_dirs['run_gbk_ctgs_dir'] + ref_n + "/" mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/" scaffolds_dir = run_root + run_dirs['scaffolds_dir'] + ref_n + "/" print " ", ref_n # log logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs g_name = genome['name'] ctgs_dir = ctgs_root + g_name + "/" print "\t", g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # set outputs mauve_dir = mauve_root + g_name + "/" ensure_dir([mauve_dir, scaffolds_dir]) scaff_fas = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.fas" scaff_gbk = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.gbk" # list genbank files in matches directory dir_contents = listdir(ctgs_dir) anchors_array = np.zeros(1, dtype=[('ctg', 'i4'), ('start', 'i4'), ('end', 'i4'), ('orient', 'i2')]) # identify contigs we want to select subset = [] for item in dir_contents: pattern = re.compile(r'.*_(\d*)\.gbk$') match = pattern.match(item) if match: ctg_num = match.group(1) if mode == "exclude": try: if int(ctg_num) in genome[mode]: msg = "(" + ctg_num + ")" print msg, run_ref.log(msg) else: subset.append(ctg_num) except KeyError: msg = "WARNING: no ignored segments list, including all" print msg msg = ctg_num print msg, subset.append(ctg_num) run_ref.log(msg) elif mode == "select": try: if int(ctg_num) in genome[mode]: msg = ctg_num print msg, run_ref.log(msg) subset.append(ctg_num) else: msg = "(" + ctg_num + ")" print msg, run_ref.log(msg) except KeyError: msg = "WARNING: no selected segments list, including all" print msg msg = ctg_num print msg, subset.append(ctg_num) run_ref.log(msg) # at this point we should have a subset of contigs selected for ctg_num in subset: logstring = "".join(["\t", ctg_num]) run_ref.log(logstring) # set inputs mauve_file 
= mauve_dir + ctg_num + ".mauve" bb_file = mauve_file + ".backbone" try: # parse Mauve output coords = mauver_load2_k0(bb_file, prox_D, mtype) # determine which segment to use as anchor anchor_seg = get_anchor_loc(coords) anchors_array = np.insert( anchors_array, 0, (ctg_num, anchor_seg['start'], anchor_seg['end'], anchor_seg['orient'])) except IOError: msg = "\tERROR: Mauve alignment not found\n\t" print msg run_ref.log(msg) except Exception: msg = "\tERROR: Iteration failure\n\t" print msg run_ref.log(msg) # abort if there is no valid contig to proceed with try: assert len(anchors_array) > 1 # always 1 left from stub except AssertionError: msg = "\tWARNING: Contig list empty\n\t" print msg run_ref.log(msg) else: # order contigs by anchor location anchors_array = np.sort(anchors_array, order='start') # load contig records from the genbank files in the matches directory ctg_list = [] for ctg_anchor in anchors_array: ctg_num = ctg_anchor['ctg'] if ctg_num > 0: contig_gbk = ctgs_dir + g_name + "_" + str( ctg_num) + ".gbk" record = load_genbank(contig_gbk) if ctg_anchor['orient'] == -1: # flip record record = record.reverse_complement(id=True, name=True, annotations=True, description=True) ctg_list.append(record) else: # workaround for having 0 value leftover from stub pass # having it might come in handy in later dev # output scaffold files write_fasta(scaff_fas, ctg_list) scaff_record = SeqRecord('', id='temp') scaff_bumper = SeqRecord(separator, id='join') for record in ctg_list: feat_start = len(scaff_record.seq) scaff_record += record feat_stop = len(scaff_record.seq) scaff_record += scaff_bumper feat_loc = FeatureLocation(feat_start, feat_stop) pattern = re.compile(r'.*_(\d*)$') match = pattern.match(record.id) try: ctg_num = match.group(1) except Exception: ctg_num = 'N' feature = SeqFeature(location=feat_loc, type='contig', qualifiers={'id': ctg_num}) scaff_record.features.append(feature) scaff_record.description = g_name + " scaffold from " + ref_n try: 
scaff_record.id = g_name write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper except ValueError: scaff_record.id = g_name[:10] write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper print ""
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs, run_id, timestamp, prot_db_name, project_id): """Re-annotate contig and extract reference segments using coordinates.""" # set inputs and outputs run_root = r_root_dir + run_id + "/" ref_name = ref['name'] in_file = fixed_dirs['ori_g_dir'] + ref['file'] seg_out_root = run_root + run_dirs['ref_seg_dir'] + ref_name + "/" gen_fas_root = fixed_dirs['fas_contigs_dir'] + ref_name + "/" if ref_annot_flag: ref_gbk = run_root + run_dirs[ 'ref_gbk_dir'] + ref_name + "_re-annot.gbk" else: ## bypass re-annotated ONLY IF ORIGINAL INPUT IS GBK #todo: fix ref_gbk = in_file ref_fas = run_root + run_dirs['ref_fas_dir'] + ref_name + ".fas" genome_fas = gen_fas_root + ref_name + "_1.fas" report_root = run_root + run_dirs['reports'] + ref_name + "/" ref_log = report_root + run_id + "_" + ref_name + "_log.txt" ensure_dir([seg_out_root, report_root, gen_fas_root]) print " ", ref_name, "...", # initialize run_ref object run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'], ref['capture'], ref_fas, ref_gbk, seg_out_root, ref_log) # initialize reference log cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"] open(ref_log, 'w').write(" ".join(cl_header)) # open record and ensure we have a fasta in the right place if not path.exists(ref_fas): if run_ref.input == 'fas': copyfile(in_file, ref_fas) elif run_ref.input == 'gbk': record = load_genbank(in_file) record.id = ref_name write_fasta(ref_fas, record) else: msg = "ERROR: Input not recognized for " + ref_name run_ref.log(msg) raise Exception(msg) # make a BLAST DB make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs) copyfile(ref_fas, genome_fas) # re-annotate ref contig if ref_annot_flag: record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs, project_id) else: ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix record = load_genbank(in_file) # load or generate segment definitions if run_ref.seg_mode == 
'chop': run_ref.get_segs_from_chop(len(record.seq), ref['chop_size']) elif run_ref.seg_mode == 'list': run_ref.get_segs_from_list(ref['segs']) elif run_ref.seg_mode == 'feats': run_ref.get_segs_from_feats(ref['feat_type']) # extract segment sequences rec_annot = run_ref.extract_segs_seqs(record, seg_out_root) # write re-annotated reference sequence to file write_genbank(ref_gbk, rec_annot) # report results logstring = " ".join([str(len(run_ref.segs)), "segments"]) print logstring run_ref.log(logstring) return run_ref
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs, run_id, timestamp, prot_db_name, project_id): """Re-annotate contig and extract reference segments using coordinates.""" # set inputs and outputs run_root = r_root_dir+run_id+"/" ref_name = ref['name'] in_file = fixed_dirs['ori_g_dir']+ref['file'] seg_out_root = run_root+run_dirs['ref_seg_dir']+ref_name+"/" gen_fas_root = fixed_dirs['fas_contigs_dir']+ref_name+"/" if ref_annot_flag: ref_gbk = run_root+run_dirs['ref_gbk_dir']+ref_name+"_re-annot.gbk" else: ## bypass re-annotated ONLY IF ORIGINAL INPUT IS GBK #todo: fix ref_gbk = in_file ref_fas = run_root+run_dirs['ref_fas_dir']+ref_name+".fas" genome_fas = gen_fas_root+ref_name+"_1.fas" report_root = run_root+run_dirs['reports']+ref_name+"/" ref_log = report_root+run_id+"_"+ref_name+"_log.txt" ensure_dir([seg_out_root, report_root, gen_fas_root]) print " ", ref_name, "...", # initialize run_ref object run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'], ref['capture'], ref_fas, ref_gbk, seg_out_root, ref_log) # initialize reference log cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"] open(ref_log, 'w').write(" ".join(cl_header)) # open record and ensure we have a fasta in the right place if not path.exists(ref_fas): if run_ref.input == 'fas': copyfile(in_file, ref_fas) elif run_ref.input == 'gbk': record = load_genbank(in_file) record.id = ref_name write_fasta(ref_fas, record) else: msg = "ERROR: Input not recognized for "+ref_name run_ref.log(msg) raise Exception(msg) # make a BLAST DB make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs) copyfile(ref_fas, genome_fas) # re-annotate ref contig if ref_annot_flag: record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs, project_id) else: ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix record = load_genbank(in_file) # load or generate segment definitions if run_ref.seg_mode == 'chop': run_ref.get_segs_from_chop(len(record.seq), 
ref['chop_size']) elif run_ref.seg_mode == 'list': run_ref.get_segs_from_list(ref['segs']) elif run_ref.seg_mode == 'feats': run_ref.get_segs_from_feats(ref['feat_type']) # extract segment sequences rec_annot = run_ref.extract_segs_seqs(record, seg_out_root) # write re-annotated reference sequence to file write_genbank(ref_gbk, rec_annot) # report results logstring = " ".join([str(len(run_ref.segs)), "segments"]) print logstring run_ref.log(logstring) return run_ref
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator, genomes, run_id, timestamp, mtype, mode): """Build a scaffold of contigs based on the reference. This takes contigs that gave positive hits when blasted with reference segments. The contigs were aligned against the complete reference in a previous step for mapping purposes. Now the output of that step is re-used determine their position. A caveat is that if there are natural local rearrangements in the sequence relative to the reference, they may not be resolved appropriately. The problem is somewhat moderated by the fact that this function takes the best (usually the largest) hit region as "anchor" to position the contig within the scaffold. But if the rearranged region takes up a significant portion of the contig length, the anchoring will probably not be called correctly. Visual inspection of the finalized maps should help diagnose any such problems. The order can be fixed manually using the Mauve Contig Mover, which is part of Mauve 2. Note that not all hit contigs are "real" hits, so filtering should be applied before scaffolding to generate constructs. Model-based filtering produces a list of contigs that will be passed to the scaffolder. If filtering manually by looking at the maps, there are two options available: either select exclusively OR exclude a subset of contigs for the scaffolding process. This is done by listing their ID number in the genome dictionaries in the config file then resuming the pipeline from this step. 
""" # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/" mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/" scaffolds_dir = run_root+run_dirs['scaffolds_dir']+ref_n+"/" print " ", ref_n # log logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs g_name = genome['name'] ctgs_dir = ctgs_root+g_name+"/" print "\t", g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # set outputs mauve_dir = mauve_root+g_name+"/" ensure_dir([mauve_dir, scaffolds_dir]) scaff_fas = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.fas" scaff_gbk = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.gbk" # list genbank files in matches directory dir_contents = listdir(ctgs_dir) anchors_array = np.zeros(1, dtype=[('ctg', 'i4'), ('start', 'i4'), ('end', 'i4'), ('orient', 'i2')]) # identify contigs we want to select subset = [] for item in dir_contents: pattern = re.compile(r'.*_(\d*)\.gbk$') match = pattern.match(item) if match: ctg_num = match.group(1) if mode == "exclude": try: if int(ctg_num) in genome[mode]: msg = "("+ctg_num+")" print msg, run_ref.log(msg) else: subset.append(ctg_num) except KeyError: msg = "WARNING: no ignored segments list, including all" print msg msg = ctg_num print msg, subset.append(ctg_num) run_ref.log(msg) elif mode == "select": try: if int(ctg_num) in genome[mode]: msg = ctg_num print msg, run_ref.log(msg) subset.append(ctg_num) else: msg = "("+ctg_num+")" print msg, run_ref.log(msg) except KeyError: msg = "WARNING: no selected segments list, including all" print msg msg = ctg_num print msg, subset.append(ctg_num) run_ref.log(msg) # at this point we should have a subset of contigs selected for ctg_num in subset: logstring = "".join(["\t", ctg_num]) run_ref.log(logstring) # set inputs mauve_file = mauve_dir+ctg_num+".mauve" bb_file = 
mauve_file+".backbone" try: # parse Mauve output coords = mauver_load2_k0(bb_file, prox_D, mtype) # determine which segment to use as anchor anchor_seg = get_anchor_loc(coords) anchors_array = np.insert(anchors_array, 0, (ctg_num, anchor_seg['start'], anchor_seg['end'], anchor_seg['orient'])) except IOError: msg = "\tERROR: Mauve alignment not found\n\t" print msg run_ref.log(msg) except Exception: msg = "\tERROR: Iteration failure\n\t" print msg run_ref.log(msg) # abort if there is no valid contig to proceed with try: assert len(anchors_array) > 1 # always 1 left from stub except AssertionError: msg = "\tWARNING: Contig list empty\n\t" print msg run_ref.log(msg) else: # order contigs by anchor location anchors_array = np.sort(anchors_array, order='start') # load contig records from the genbank files in the matches directory ctg_list = [] for ctg_anchor in anchors_array: ctg_num = ctg_anchor['ctg'] if ctg_num > 0: contig_gbk = ctgs_dir+g_name+"_"+str(ctg_num)+".gbk" record = load_genbank(contig_gbk) if ctg_anchor['orient'] == -1: # flip record record = record.reverse_complement(id=True, name=True, annotations=True, description=True) ctg_list.append(record) else: # workaround for having 0 value leftover from stub pass # having it might come in handy in later dev # output scaffold files write_fasta(scaff_fas, ctg_list) scaff_record = SeqRecord('', id='temp') scaff_bumper = SeqRecord(separator, id='join') for record in ctg_list: feat_start = len(scaff_record.seq) scaff_record += record feat_stop = len(scaff_record.seq) scaff_record += scaff_bumper feat_loc = FeatureLocation(feat_start, feat_stop) pattern = re.compile(r'.*_(\d*)$') match = pattern.match(record.id) try: ctg_num = match.group(1) except Exception: ctg_num = 'N' feature = SeqFeature(location=feat_loc, type='contig', qualifiers={'id': ctg_num}) scaff_record.features.append(feature) scaff_record.description = g_name+" scaffold from "+ref_n try: scaff_record.id = g_name write_genbank(scaff_gbk, 
scaff_record[:-100]) # rm last bumper except ValueError: scaff_record.id = g_name[:10] write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper print ""
def annot_genome_contigs(run_ref, prot_db_name, fixed_dirs, r_root_dir, run_id, run_dirs, genomes, project_id, timestamp, blast_prefs): """Annotate genome contigs (predict ORFs and assign function).""" # locate the COG database prot_db = fixed_dirs['ref_dbs_dir']+prot_db_name # TODO: add other DB / pfams? # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" fas_ctgs_root = run_root+run_dirs['match_out_dir']+ref_n+"/" ctg_cds_root = fixed_dirs['ctg_cds_dir'] ctg_prot_root = fixed_dirs['ctg_prot_dir'] ctg_blast_root = fixed_dirs['ctg_blast_dir'] g_gbk_ctgs_root = fixed_dirs['gbk_contigs_dir'] r_gbk_ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/" annot_trn_root = fixed_dirs['annot_trn_dir'] print " ", ref_n # log logstring = "".join(["\n\n# Annotate genome contigs @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs g_name = genome['name'] fas_ctgs_dir = fas_ctgs_root+g_name+"/" g_file = fixed_dirs['ori_g_dir']+genome['file'] print '\t', g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # set output files training_file = annot_trn_root+g_name+"_annot.trn" # set output dirs ctg_cds_dir = ctg_cds_root+g_name+"/" ctg_prot_dir = ctg_prot_root+g_name+"/" ctg_blast_dir = ctg_blast_root+g_name+"/" g_gbk_ctgs_dir = g_gbk_ctgs_root+g_name+"/" r_gbk_ctgs_dir = r_gbk_ctgs_root+g_name+"/" ensure_dir([ctg_cds_dir, ctg_prot_dir, ctg_blast_dir, g_gbk_ctgs_dir, r_gbk_ctgs_dir]) # list fasta files in matches directory dir_contents = listdir(fas_ctgs_dir) for item in dir_contents: pattern = re.compile(r'.*_(\d*)\.fas$') match = pattern.match(item) if match: ctg_num = match.group(1) print ctg_num, logstring = "".join(["\t", ctg_num]) run_ref.log(logstring) # set inputs and outputs ctg_fas = fas_ctgs_dir+item g_ctg_gbk = g_gbk_ctgs_dir+g_name+"_"+ctg_num+".gbk" r_ctg_gbk = r_gbk_ctgs_dir+g_name+"_"+ctg_num+".gbk" annot_gbk = ctg_cds_dir+g_name+"_"+ctg_num+"_cds.gbk" 
annot_aa = ctg_prot_dir+g_name+"_"+ctg_num+"_aa.fas" blast_out = ctg_blast_dir+g_name+"_"+ctg_num+".xml" if path.exists(blast_out) and os.stat(blast_out)[6]==0: os.remove(blast_out) if not path.exists(r_ctg_gbk): if not path.exists(g_ctg_gbk): l_tag_base = g_name+"_"+ctg_num record = annot_ctg(g_file, ctg_fas, annot_gbk, annot_aa, training_file, prot_db, blast_out, l_tag_base, blast_prefs) record.description = g_name+"_"+ctg_num record.name = g_name+"_"+ctg_num record.dbxrefs = ["Project: "+project_id+"/"+ref_n +"-like backbones"] record.seq.alphabet = generic_dna write_genbank(g_ctg_gbk, record) copyfile(g_ctg_gbk, r_ctg_gbk) print ""
def batch_contig_annot(dataset): """Extract and annotate contigs.""" # identify dataset contig file contigs_file = dirs['assembly_dir']+dataset['f_nick']+'/'+'contigs.fa' # locate the COG database cog_db = dirs['blast_db_dir']+'Cog_LE/Cog' # make the training file training_file = dirs['annot_dir']+dataset['f_nick']+'/'+'contigs.trn' #train_prodigal(contigs_file, training_file) # set output dirs fas_out_dir = dirs['annot_dir']+dataset['f_nick']+'/fasta/' gbk_out_dir = dirs['annot_dir']+dataset['f_nick']+'/predict/' aa_out_dir = dirs['annot_dir']+dataset['f_nick']+'/aa/' blast_out_dir = dirs['annot_dir']+dataset['f_nick']+'/rpsblast/' solid_out_dir = dirs['annot_dir']+dataset['f_nick']+'/genbank/' maps_out_dir = dirs['annot_dir']+dataset['f_nick']+'/maps/' ensure_dir(fas_out_dir) ensure_dir(gbk_out_dir) ensure_dir(aa_out_dir) ensure_dir(blast_out_dir) ensure_dir(solid_out_dir) # set phage hit collector contig_hits = {} sp_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\ +dataset['f_nick']+'_kw_hits.html' all_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\ +dataset['f_nick']+'_all_hits.html' sp_hit_list_handle = open(sp_hit_list, 'w') all_hit_list_handle = open(all_hit_list, 'w') sp_hit_list_handle.write("<ul>") all_hit_list_handle.write("<ul>") # load all contigs contigs_list = load_multifasta(contigs_file) # cycle through contigs ctg_count = 0 gene_count = 0 for contig in contigs_list: ctg_count +=1 # use regex to acquire relevant record ID info pattern = re.compile(r'NODE_(\d*)_length_(\d*)_cov_(\d*)') match = pattern.match(contig.id) nick = match.group(1)+'_'+match.group(2)+'_'+match.group(3) contig.id = nick fasta_out = fas_out_dir+nick+'.fas' # write record to file write_fasta(fasta_out, contig) # create contig entry in dict contig_hits[nick] = [] # run the annotation annot_gbk = gbk_out_dir+nick+'.gbk' annot_aa = aa_out_dir+nick+'.fas' #run_prodigal(fasta_out, annot_gbk, annot_aa, training_file) # blast the amino acids against COG print '\tblasting', 
dataset['f_nick'], nick blast_out = blast_out_dir+nick+'.xml' if path.isfile(blast_out): print "\t\talready blasted" else: local_rpsblast_2file(annot_aa, cog_db, blast_out, blast_prefs) # collect best hits rec_cogs = collect_cogs(blast_out) map_file = maps_out_dir+nick+'.pdf' # consolidate annotated genbank file record = load_fasta(fasta_out) aa_defs = load_multifasta(annot_aa) features = [] counter = 1 ctg_flag_1 = 0 ctg_flag_2 = 0 for protein in aa_defs: gene_count +=1 # get feature details from description line # necessary because the prodigal output is not parser-friendly pattern = re.compile(r'\d+_\d+_\d+_\d+_\d+\s+\S+\s+(\d+)\s+\S+\s+(\d+)\s+\S+\s+(\S*\d)') match = pattern.match(protein.description) start_pos = int(match.group(1)) end_pos = int(match.group(2)) strand_pos = int(match.group(3)) feat_loc = FeatureLocation(start_pos, end_pos) annotation = rec_cogs['Query_'+str(counter)] if ctg_flag_1 is 0: all_hit_list_handle.write("</ul><br><a href='" +"../../../../" +map_file +"'>Contig " +nick+"</a><ul>") ctg_flag_1 = 1 all_hit_list_handle.write("<li>"+str(counter) +'. '+annotation+"</li>") # detect phage content in annotation phi_pattern = re.compile(r".+(COG\d+).+" "(phage|capsid|muramidase|tail|" "replication|helicase|polymerase|" "integrase|recombinase" "suppressor|hydrolase|transposase).+", re.IGNORECASE) phi_match = phi_pattern.match(annotation) if phi_match: hit_flag = 'on' hit_dict = {'CDS': counter, 'annot': annotation, 'COGs': phi_match.group} contig_hits[nick].append(hit_dict) # write out to summary file if ctg_flag_2 is 0: sp_hit_list_handle.write("</ul><br><a href='" +"../../../../" +map_file +"'>Contig " +nick+"</a><ul>") ctg_flag_2 = 1 sp_hit_list_handle.write("<li>"+str(counter) +'. 
'+annotation+"</li>") else: hit_flag = 'off' # consolidation feature annotations quals = {'note': protein.description, 'fct': annotation, 'flag': hit_flag} feature = SeqFeature(location=feat_loc, strand=strand_pos, id=protein.id, type='CDS', qualifiers=quals) features.append(feature) counter +=1 record.features = features record.description = dataset['f_nick']+'_contig_'+nick record.name = nick record.dbxrefs = ['Project:np1'] record.seq.alphabet = generic_dna gbk_out = solid_out_dir+nick+'.gbk' write_genbank(gbk_out, record) # generate graphical map ContigDraw(nick, gbk_out, map_file) sp_hit_list_handle.write("</ul>") all_hit_list_handle.write("</ul>") sp_hit_list_handle.close() all_hit_list_handle.close() print "\t", gene_count, "predicted genes in", ctg_count, "contigs"