def fas2gbk(fas_file):
    """Convert a FastA file to Genbank format.

    fas_file -- path to the input FastA file; the Genbank output is
    written next to it with a '.gbk' extension.
    Returns the path of the Genbank file that was written.
    """
    record = load_fasta(fas_file)
    # Derive the output name by stripping everything from '.fas' onward.
    # BUGFIX: the original used fas_file.find('.fas') unconditionally;
    # find() returns -1 when the substring is absent, which silently
    # chopped the last character off extension-less filenames.  Fall back
    # to appending '.gbk' in that case.
    ext_pos = fas_file.find('.fas')
    if ext_pos >= 0:
        gbk_file = fas_file[:ext_pos]+'.gbk'
    else:
        gbk_file = fas_file+'.gbk'
    # Genbank output requires an explicit molecule alphabet
    record.seq.alphabet = generic_dna
    write_genbank(gbk_file, record)
    return gbk_file
def __init__(self, genome, seq_dir): self.name = genome['name'] self.fas = None self.gbk = None self.offset = genome['offset'] self.nudge = genome['nudge']+1 self.invert = False if genome['input'] == 'fas': self.fas = seq_dir+genome['file'] self.gbk = fas2gbk(seq_dir+genome['file']) elif genome['input'] == 'gbk': self.gbk = seq_dir+genome['file'] self.fas = gbk2fas(seq_dir+genome['file']) else: print "ERROR in input format: FastA or Genbank required" sys.exit() self.len = len(load_fasta(self.fas).seq)
def annot_ctg(g_file, ctg_fas, annot_gbk, annot_aa, trn_file, prot_db,
              blast_out, l_tag_base, blast_prefs):
    """Do functional annotation of contig from Fasta file, return record."""
    # gene prediction -- each step is skipped when its output already exists
    if not path.exists(trn_file):
        train_prodigal(g_file, trn_file, "-q")
    if not path.exists(annot_aa):
        run_prodigal(ctg_fas, annot_gbk, annot_aa, trn_file, "-q")
    # blast the amino acids against COG (also cached on disk)
    if not path.exists(blast_out):
        local_blastp_2file(annot_aa, prot_db, blast_out, blast_prefs)
    # best hit per query, keyed 'Query_<n>'
    rec_cogs = collect_cogs(blast_out)
    # rebuild an annotated record on top of the bare contig sequence
    record = load_fasta(ctg_fas)
    record.features = []
    # Coordinates are parsed from the Prodigal defline because the
    # prodigal genbank output fails to load as valid genbank.
    coord_pattern = re.compile('.+#\s(\d+)\s#\s(\d+)\s#\s(\S*1)\s#\sID.+')
    for idx, aa_rec in enumerate(load_multifasta(annot_aa), 1):
        annotation = rec_cogs['Query_'+str(idx)]
        defline = aa_rec.description
        coords = coord_pattern.match(defline)
        location = FeatureLocation(int(coords.group(1)),
                                   int(coords.group(2)))
        strand = int(coords.group(3))
        # consolidate feature annotations
        quals = {'note': defline,
                 'locus_tag': l_tag_base+"_"+str(idx),
                 'fct': annotation,
                 'translation': aa_rec.seq}
        record.features.append(SeqFeature(location=location,
                                          strand=strand,
                                          id='cds_'+str(idx),
                                          type='CDS',
                                          qualifiers=quals))
    return record
def __init__(self, genome, seq_dir): self.name = genome['name'] self.fas = None self.gbk = None self.offset = genome['offset'] self.nudge = genome['nudge'] + 1 self.invert = False self.dir = seq_dir + genome['cat'] + '/' if genome['input'] == 'fas': self.fas = self.dir + genome['file'] self.gbk = fas2gbk(self.dir + genome['file']) elif genome['input'] == 'gbk': self.gbk = self.dir + genome['file'] self.fas = gbk2fas(self.dir + genome['file']) else: print "ERROR in input format: FastA or Genbank required" sys.exit() self.len = len(load_fasta(self.fas).seq)
def basic_batch_blast(genomes, run_ref, blast_mode, r_root_dir, run_dirs, fixed_dirs, blast_prefs, run_id, timestamp): """Send batch jobs to Blast. Muxes to multiple reference DBs.""" # load inputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" in_root = run_root+run_dirs['ref_seg_dir']+ref_n+"/" print " ", ref_n # log logstring = "".join(["\n\n# Blast segs to genomes @", timestamp, "\n\n"]) run_ref.log(logstring) # do blast for seg in run_ref.segs: input_file = in_root+ref_n+"_"+seg['name']+".fas" # translate if required if blast_mode == 'tn': record = load_fasta(input_file) record.seq = record.seq.translate() input_file = in_root+ref_n+"_"+seg['name']+"_aa.fas" # substitute write_fasta(input_file, record) out_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg['name']+"/" ensure_dir([out_dir]) print "\t", seg['name'], for genome in genomes: g_name = genome['name'] db_path = fixed_dirs['blast_db_dir']+g_name outfile = out_dir+g_name+"_out.txt" print ".", if blast_mode == 'n': local_blastn_2file(input_file, db_path, outfile, blast_prefs) elif blast_mode == 'tx': local_tblastx_2file(input_file, db_path, outfile, blast_prefs) elif blast_mode == 'tn': local_tblastn_2file(input_file, db_path, outfile, blast_prefs) print "" run_ref.log("All OK") return "OK"
def glompX_blast_out(genomes, run_ref, blast_mode, r_root_dir, run_dirs, run_id, fixed_dirs, blast_dtypes, references, min_nt_match, min_nt_score, min_nt_idp, min_aa_match, min_aa_score, min_aa_idp, capture_span, timestamp): """Collect Blast results and extract match contigs.""" # load inputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" match_root = run_root+run_dirs['match_out_dir']+ref_n+"/" capture_root = run_root+run_dirs['capture_dir']+ref_n+"/" print " ", ref_n # log logstring = "".join(["\n\n# Collect Blast results @", timestamp, "\n\n"]) run_ref.log(logstring) # collect results ref_hits = {} control_scores = [] run_ref.log("Segs/Gs\t") run_ref.log("\t".join([genome['name'] for genome in genomes])) for seg in run_ref.segs: seg_n = seg['name'] print "\t", seg_n, "...", run_ref.log("".join(["\n", seg_n])) blast_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg_n+"/" capture_dir = capture_root+"/"+seg_n+"/" ensure_dir([blast_dir, capture_dir]) ref_flag = True for genome in genomes: g_name = genome['name'] print "|", # process if g_name not in ref_hits.keys(): ref_hits[g_name] = {} matches_dir = match_root+g_name+"/" ensure_dir([matches_dir]) blast_infile = blast_dir+g_name+"_out.txt" genome_ctg_dir = fixed_dirs['fas_contigs_dir']+g_name+"/" rec_array = read_array(blast_infile, blast_dtypes) if len(rec_array) > 0: # take qualified hits p_cnt = 0 n_cnt = 0 if g_name in [ref['name'] for ref in references]: copyfile(genome_ctg_dir+g_name+"_1.fas", matches_dir+g_name+".fas") if ref_flag: # positive control TODO: better solution control_scores.append(rec_array[0][11]) ref_flag = False for line in rec_array: idp = line[2] q_start, q_stop = line[8], line[9] score = line[11] length = abs(q_stop-q_start) # check the blast mode to use the right thresholds if blast_mode == 'n' or blast_mode == 'tx': min_match = min_nt_match min_score = min_nt_score min_idp = min_nt_idp elif blast_mode == 'tn': min_match = min_aa_match min_score = min_aa_score min_idp = 
min_aa_idp else: # default to nucleotide mode min_match = min_nt_match min_score = min_nt_score min_idp = min_nt_idp if length>min_match and score>min_score and idp>min_idp: print "+", p_cnt +=1 contig_id = line[1] if contig_id not in ref_hits[g_name].keys(): ref_hits[g_name][contig_id] = {seg_n: score} else: ref_hits[g_name][contig_id][seg_n] = score pattern = re.compile(r'('+contig_id+')\.fas') for item in listdir(genome_ctg_dir): match = re.match(pattern, item) if match: fas_file = matches_dir+match.group(1)+".fas" if not path.exists(fas_file): copyfile(genome_ctg_dir+item, fas_file) # context capture capture_flag = False while True: try: if int(seg_n) in run_ref.capture: capture_flag = True else: break except ValueError: if seg_n in run_ref.capture: capture_flag = True else: break else: break if capture_flag: # load the sequence contig_file = matches_dir+contig_id+".fas" contig_rec = load_fasta(contig_file) # check orientation if q_start < q_stop: c_start = q_start-capture_span c_stop = q_stop+capture_span else: c_start = q_stop-capture_span c_stop = q_start+capture_span print c_start, c_stop # check limits if c_start < 0: c_start = 1 if c_stop > len(contig_rec.seq): c_stop = len(contig_rec.seq) # proceed cxt_file = capture_dir+g_name+"_"+contig_id+".fas" cxt_rec = SeqRecord(id=contig_id+"_" +str(c_start)+"_" +str(c_stop), seq=contig_rec.seq [c_start:c_stop]) write_fasta(cxt_file, cxt_rec) else: print "-", n_cnt +=1 if n_cnt > 0: logstring = "".join(["\t", str(p_cnt), " (", str(n_cnt), ")"]) else: logstring = "".join(["\t", str(p_cnt)]) run_ref.log(logstring) else: print "-", run_ref.log("".join(["\t", "0"])) print "" return ref_hits, control_scores
def glompX_blast_out(genomes, run_ref, blast_mode, r_root_dir, run_dirs, run_id, fixed_dirs, blast_dtypes, references, min_nt_match, min_nt_score, min_nt_idp, min_aa_match, min_aa_score, min_aa_idp, capture_span, timestamp): """Collect Blast results and extract match contigs.""" # load inputs ref_n = run_ref.name run_root = r_root_dir + run_id + "/" match_root = run_root + run_dirs['match_out_dir'] + ref_n + "/" capture_root = run_root + run_dirs['capture_dir'] + ref_n + "/" print " ", ref_n # log logstring = "".join(["\n\n# Collect Blast results @", timestamp, "\n\n"]) run_ref.log(logstring) # collect results ref_hits = {} control_scores = [] run_ref.log("Segs/Gs\t") run_ref.log("\t".join([genome['name'] for genome in genomes])) for seg in run_ref.segs: seg_n = seg['name'] print "\t", seg_n, "...", run_ref.log("".join(["\n", seg_n])) blast_dir = run_root + run_dirs[ 'blast_out_dir'] + ref_n + "/" + seg_n + "/" capture_dir = capture_root + "/" + seg_n + "/" ensure_dir([blast_dir, capture_dir]) ref_flag = True for genome in genomes: g_name = genome['name'] print "|", # process if g_name not in ref_hits.keys(): ref_hits[g_name] = {} matches_dir = match_root + g_name + "/" ensure_dir([matches_dir]) blast_infile = blast_dir + g_name + "_out.txt" genome_ctg_dir = fixed_dirs['fas_contigs_dir'] + g_name + "/" rec_array = read_array(blast_infile, blast_dtypes) if len(rec_array) > 0: # take qualified hits p_cnt = 0 n_cnt = 0 if g_name in [ref['name'] for ref in references]: copyfile(genome_ctg_dir + g_name + "_1.fas", matches_dir + g_name + ".fas") if ref_flag: # positive control TODO: better solution control_scores.append(rec_array[0][11]) ref_flag = False for line in rec_array: idp = line[2] q_start, q_stop = line[8], line[9] score = line[11] length = abs(q_stop - q_start) # check the blast mode to use the right thresholds if blast_mode == 'n' or blast_mode == 'tx': min_match = min_nt_match min_score = min_nt_score min_idp = min_nt_idp elif blast_mode == 'tn': min_match = 
min_aa_match min_score = min_aa_score min_idp = min_aa_idp else: # default to nucleotide mode min_match = min_nt_match min_score = min_nt_score min_idp = min_nt_idp if length > min_match and score > min_score and idp > min_idp: print "+", p_cnt += 1 contig_id = line[1] if contig_id not in ref_hits[g_name].keys(): ref_hits[g_name][contig_id] = {seg_n: score} else: ref_hits[g_name][contig_id][seg_n] = score pattern = re.compile(r'(' + contig_id + ')\.fas') for item in listdir(genome_ctg_dir): match = re.match(pattern, item) if match: fas_file = matches_dir + match.group( 1) + ".fas" if not path.exists(fas_file): copyfile(genome_ctg_dir + item, fas_file) # context capture capture_flag = False while True: try: if int(seg_n) in run_ref.capture: capture_flag = True else: break except ValueError: if seg_n in run_ref.capture: capture_flag = True else: break else: break if capture_flag: # load the sequence contig_file = matches_dir + contig_id + ".fas" contig_rec = load_fasta(contig_file) # check orientation if q_start < q_stop: c_start = q_start - capture_span c_stop = q_stop + capture_span else: c_start = q_stop - capture_span c_stop = q_start + capture_span print c_start, c_stop # check limits if c_start < 0: c_start = 1 if c_stop > len(contig_rec.seq): c_stop = len(contig_rec.seq) # proceed cxt_file = capture_dir + g_name + "_" + contig_id + ".fas" cxt_rec = SeqRecord( id=contig_id + "_" + str(c_start) + "_" + str(c_stop), seq=contig_rec.seq[c_start:c_stop]) write_fasta(cxt_file, cxt_rec) else: print "-", n_cnt += 1 if n_cnt > 0: logstring = "".join( ["\t", str(p_cnt), " (", str(n_cnt), ")"]) else: logstring = "".join(["\t", str(p_cnt)]) run_ref.log(logstring) else: print "-", run_ref.log("".join(["\t", "0"])) print "" return ref_hits, control_scores
def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes,
                  mauve_exec, max_size, chop_mode, mtype):
    """Align contigs pairwise to the reference contig.

    For every genome, each matched contig file (named *_<num>.fas) is
    aligned against the reference contig with Mauve; the backbone output
    is parsed, over-long segments are chopped, and detailed pairwise
    alignments of the segments are written under the segments directory.
    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/"
    segments_root = run_root + run_dirs['aln_seg_dir'] + ref_n + "/contigs/"
    q_ctgs_root = run_root + run_dirs['match_out_dir'] + ref_n + "/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        ctgs_fas_dir = q_ctgs_root + g_name + "/"
        mauve_dir = mauve_root + g_name + "/"
        aln_segs_root = segments_root + g_name + "/"
        ensure_dir([mauve_dir])
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_fas_dir)
        for item in dir_contents:
            # only process files matching the *_<number>.fas pattern
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                q_contig = ctgs_fas_dir + item
                file_list = (ref_ctg_file, q_contig)
                mauve_outfile = mauve_dir + ctg_num + ".mauve"
                aln_segs_dir = aln_segs_root + ctg_num + "/"
                ensure_dir([aln_segs_dir])
                segfile = aln_segs_dir + ctg_num + "_" + ref_n + "_segs.txt"
                # truncate/create the segment file before alignment
                # NOTE(review): handle not explicitly closed
                open(segfile, 'w').write('')
                # do Mauve alignment
                # existence probe: open both inputs to catch missing files
                # before invoking Mauve
                # NOTE(review): these handles are never closed or used
                try:
                    open(ref_ctg_file, 'r')
                    open(q_contig, 'r')
                except IOError:
                    msg = "\nERROR: File missing, cannot align\n\t\t\t"
                    run_ref.log(msg)
                    print msg
                else:
                    align_mauve(file_list, mauve_outfile, mauve_exec)
                    try:
                        # parse Mauve output (without initial clumping)
                        coords = mauver_load2_k0(mauve_outfile + ".backbone",
                                                 0, mtype)
                        # chop segments that are too long
                        chop_array = chop_rows(coords, max_size, chop_mode,
                                               mtype)
                        # make detailed pairwise alignments of the segments
                        ref_rec = load_genbank(ref_ctg_file)
                        query_rec = load_fasta(q_contig)
                        iter_align(chop_array, ref_rec, query_rec,
                                   aln_segs_dir, segfile)
                    except IOError:
                        msg = "\nERROR: Mauve alignment failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
                    except Exception:
                        msg = "\nERROR: Iteration failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
        # newline to close this genome's progress line
        print ""
def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes,
                  mauve_exec, max_size, chop_mode, mtype):
    """Align contigs pairwise to the reference contig.

    For every genome, each matched contig file (named *_<num>.fas) is
    aligned against the reference contig with Mauve; the backbone output
    is parsed, over-long segments are chopped, and detailed pairwise
    alignments of the segments are written under the segments directory.
    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/contigs/"
    q_ctgs_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        ctgs_fas_dir = q_ctgs_root+g_name+"/"
        mauve_dir = mauve_root+g_name+"/"
        aln_segs_root = segments_root+g_name+"/"
        ensure_dir([mauve_dir])
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_fas_dir)
        for item in dir_contents:
            # only process files matching the *_<number>.fas pattern
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                q_contig = ctgs_fas_dir+item
                file_list = (ref_ctg_file, q_contig)
                mauve_outfile = mauve_dir+ctg_num+".mauve"
                aln_segs_dir = aln_segs_root+ctg_num+"/"
                ensure_dir([aln_segs_dir])
                segfile = aln_segs_dir+ctg_num+"_"+ref_n+"_segs.txt"
                # truncate/create the segment file before alignment
                # NOTE(review): handle not explicitly closed
                open(segfile, 'w').write('')
                # do Mauve alignment
                # existence probe: open both inputs to catch missing files
                # before invoking Mauve
                # NOTE(review): these handles are never closed or used
                try:
                    open(ref_ctg_file, 'r')
                    open(q_contig, 'r')
                except IOError:
                    msg = "\nERROR: File missing, cannot align\n\t\t\t"
                    run_ref.log(msg)
                    print msg
                else:
                    align_mauve(file_list, mauve_outfile, mauve_exec)
                    try:
                        # parse Mauve output (without initial clumping)
                        coords = mauver_load2_k0(mauve_outfile+".backbone",
                                                 0, mtype)
                        # chop segments that are too long
                        chop_array = chop_rows(coords, max_size, chop_mode,
                                               mtype)
                        # make detailed pairwise alignments of the segments
                        ref_rec = load_genbank(ref_ctg_file)
                        query_rec = load_fasta(q_contig)
                        iter_align(chop_array, ref_rec, query_rec,
                                   aln_segs_dir, segfile)
                    except IOError:
                        msg = "\nERROR: Mauve alignment failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
                    except Exception:
                        msg = "\nERROR: Iteration failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
        # newline to close this genome's progress line
        print ""
def filter_contigs(run_ref, run_id, genomes, norm_matches, seg_size,
                   threshold, r_root_dir, run_dirs, fixed_dirs, timestamp):
    """Filter contigs.

    Subtracts the negative-control background from each test genome's
    per-contig match-score vectors, then scores every contig for total
    similarity (s_score), hit clustering (c_score) and backbone/cargo
    burden, and prints whether the genome's summed s_score clears
    `threshold`.  Assumes norm_matches[g_name] provides parallel
    'ctg_scores' arrays and 'ctg_names' -- TODO confirm against caller.
    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    fas_root = fixed_dirs['fas_contigs_dir']
    report_root = run_root+run_dirs['reports']+ref_n+"/"
    ensure_dir([report_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Filter contigs @", timestamp, "\n"])
    run_ref.log(logstring)
    # process
    # evaluate segment specificity using negative controls
    neg_controls = [genome['name'] for genome in genomes
                    if ('ctrl' in genome.keys() and genome['ctrl'] == 'neg')]
    neg_dat = [norm_matches[g_name]['ctg_scores'] for g_name in neg_controls]
    # per-segment mean background signal across all negative controls
    neg_RA = np.vstack(neg_dat)
    neg_mean = nanmean(neg_RA, axis=0)
    # process the genomes we're testing (anything without a 'ctrl' key)
    test_genomes = [genome['name'] for genome in genomes
                    if not ('ctrl' in genome.keys())]
    for g_name in test_genomes:
        print "\t", g_name,
        ctg_hits = norm_matches[g_name]['ctg_scores']
        ctg_stats = {}
        #process individual contigs
        counter = 0
        for ctg_RA in ctg_hits:
            # identify this contig by name
            ctg_name = norm_matches[g_name]['ctg_names'][counter]
            counter += 1
            # subtract background signal from match scores
            recal_ctg_RA = np.subtract(ctg_RA, neg_mean)
            recal_ctg_RA = recal_ctg_RA.clip(min=0)
            # compute total similarity score
            s_score = np.sum(recal_ctg_RA)
            # compute clustering score (primitive): consecutive positive
            # segments ("streaks") are rewarded, streak breaks penalized
            streak = False
            c_score = 0
            for hit in recal_ctg_RA:
                if hit == 0:
                    if streak == True:
                        c_score += -1
                        streak = False
                    else:
                        c_score += 0
                elif hit > 0:
                    if streak == True:
                        c_score += 2
                    else:
                        c_score += 1
                    streak = True
            # compute backbone vs. cargo burden
            # backbone = bases covered by positive segments; cargo = rest
            ctg_rec = load_fasta(fas_root+g_name+"/"+ctg_name+".fas")
            bbone = np.sum(np.ma.make_mask(recal_ctg_RA))*seg_size
            if bbone > len(ctg_rec):
                # workaround for last segment being always a little short
                bbone = len(ctg_rec)
            cargo = len(ctg_rec) - bbone
            # make inverted array mask (used for redundancy detection)
            ctg_mask = np.ma.getmaskarray(np.ma.masked_equal(recal_ctg_RA,0))
            # consolidate contig information
            ctg_stats[ctg_name] = {'s_score': s_score, 'c_score': c_score,
                                   'vector': recal_ctg_RA,
                                   'inv_mask':ctg_mask,
                                   'bbone': bbone, 'cargo': cargo}
        # detect redundant contigs
        ### use np.ma.mask_or(m1, m2)
        ### if any elements returns false there is a redundancy between two contigs
        ### if so evaluate which has better c_score and s_score
        # compute overall stats for the genome
        gs_score = sum([ctg_stats[contig]['s_score'] for contig in ctg_stats])
        gc_score = sum([ctg_stats[contig]['c_score'] for contig in ctg_stats])
        g_bbone = sum([ctg_stats[contig]['bbone'] for contig in ctg_stats])
        g_cargo = sum([ctg_stats[contig]['cargo'] for contig in ctg_stats])
        print gs_score, gc_score, g_bbone, g_cargo,
        #
        if gs_score > threshold:
            ## run plotters again
            ## pass the genome on to the next step (others will be dropped)
            print "MATCH"
        else:
            print "(-)"
def batch_contig_annot(dataset): """Extract and annotate contigs.""" # identify dataset contig file contigs_file = dirs['assembly_dir']+dataset['f_nick']+'/'+'contigs.fa' # locate the COG database cog_db = dirs['blast_db_dir']+'Cog_LE/Cog' # make the training file training_file = dirs['annot_dir']+dataset['f_nick']+'/'+'contigs.trn' #train_prodigal(contigs_file, training_file) # set output dirs fas_out_dir = dirs['annot_dir']+dataset['f_nick']+'/fasta/' gbk_out_dir = dirs['annot_dir']+dataset['f_nick']+'/predict/' aa_out_dir = dirs['annot_dir']+dataset['f_nick']+'/aa/' blast_out_dir = dirs['annot_dir']+dataset['f_nick']+'/rpsblast/' solid_out_dir = dirs['annot_dir']+dataset['f_nick']+'/genbank/' maps_out_dir = dirs['annot_dir']+dataset['f_nick']+'/maps/' ensure_dir(fas_out_dir) ensure_dir(gbk_out_dir) ensure_dir(aa_out_dir) ensure_dir(blast_out_dir) ensure_dir(solid_out_dir) # set phage hit collector contig_hits = {} sp_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\ +dataset['f_nick']+'_kw_hits.html' all_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\ +dataset['f_nick']+'_all_hits.html' sp_hit_list_handle = open(sp_hit_list, 'w') all_hit_list_handle = open(all_hit_list, 'w') sp_hit_list_handle.write("<ul>") all_hit_list_handle.write("<ul>") # load all contigs contigs_list = load_multifasta(contigs_file) # cycle through contigs ctg_count = 0 gene_count = 0 for contig in contigs_list: ctg_count +=1 # use regex to acquire relevant record ID info pattern = re.compile(r'NODE_(\d*)_length_(\d*)_cov_(\d*)') match = pattern.match(contig.id) nick = match.group(1)+'_'+match.group(2)+'_'+match.group(3) contig.id = nick fasta_out = fas_out_dir+nick+'.fas' # write record to file write_fasta(fasta_out, contig) # create contig entry in dict contig_hits[nick] = [] # run the annotation annot_gbk = gbk_out_dir+nick+'.gbk' annot_aa = aa_out_dir+nick+'.fas' #run_prodigal(fasta_out, annot_gbk, annot_aa, training_file) # blast the amino acids against COG print '\tblasting', 
dataset['f_nick'], nick blast_out = blast_out_dir+nick+'.xml' if path.isfile(blast_out): print "\t\talready blasted" else: local_rpsblast_2file(annot_aa, cog_db, blast_out, blast_prefs) # collect best hits rec_cogs = collect_cogs(blast_out) map_file = maps_out_dir+nick+'.pdf' # consolidate annotated genbank file record = load_fasta(fasta_out) aa_defs = load_multifasta(annot_aa) features = [] counter = 1 ctg_flag_1 = 0 ctg_flag_2 = 0 for protein in aa_defs: gene_count +=1 # get feature details from description line # necessary because the prodigal output is not parser-friendly pattern = re.compile(r'\d+_\d+_\d+_\d+_\d+\s+\S+\s+(\d+)\s+\S+\s+(\d+)\s+\S+\s+(\S*\d)') match = pattern.match(protein.description) start_pos = int(match.group(1)) end_pos = int(match.group(2)) strand_pos = int(match.group(3)) feat_loc = FeatureLocation(start_pos, end_pos) annotation = rec_cogs['Query_'+str(counter)] if ctg_flag_1 is 0: all_hit_list_handle.write("</ul><br><a href='" +"../../../../" +map_file +"'>Contig " +nick+"</a><ul>") ctg_flag_1 = 1 all_hit_list_handle.write("<li>"+str(counter) +'. '+annotation+"</li>") # detect phage content in annotation phi_pattern = re.compile(r".+(COG\d+).+" "(phage|capsid|muramidase|tail|" "replication|helicase|polymerase|" "integrase|recombinase" "suppressor|hydrolase|transposase).+", re.IGNORECASE) phi_match = phi_pattern.match(annotation) if phi_match: hit_flag = 'on' hit_dict = {'CDS': counter, 'annot': annotation, 'COGs': phi_match.group} contig_hits[nick].append(hit_dict) # write out to summary file if ctg_flag_2 is 0: sp_hit_list_handle.write("</ul><br><a href='" +"../../../../" +map_file +"'>Contig " +nick+"</a><ul>") ctg_flag_2 = 1 sp_hit_list_handle.write("<li>"+str(counter) +'. 
'+annotation+"</li>") else: hit_flag = 'off' # consolidation feature annotations quals = {'note': protein.description, 'fct': annotation, 'flag': hit_flag} feature = SeqFeature(location=feat_loc, strand=strand_pos, id=protein.id, type='CDS', qualifiers=quals) features.append(feature) counter +=1 record.features = features record.description = dataset['f_nick']+'_contig_'+nick record.name = nick record.dbxrefs = ['Project:np1'] record.seq.alphabet = generic_dna gbk_out = solid_out_dir+nick+'.gbk' write_genbank(gbk_out, record) # generate graphical map ContigDraw(nick, gbk_out, map_file) sp_hit_list_handle.write("</ul>") all_hit_list_handle.write("</ul>") sp_hit_list_handle.close() all_hit_list_handle.close() print "\t", gene_count, "predicted genes in", ctg_count, "contigs"