def annot_ctg(g_file, ctg_fas, annot_gbk, annot_aa, trn_file, prot_db,
              blast_out, l_tag_base, blast_prefs):
    """Do functional annotation of contig from Fasta file, return record.

    The pipeline is resumable: each external step is only run when its
    output file does not exist yet.

    Arguments:
    g_file      -- genome file used to train prodigal (only when trn_file
                   is missing)
    ctg_fas     -- Fasta file of the contig to annotate
    annot_gbk   -- path handed to run_prodigal for its GenBank-style output
    annot_aa    -- multifasta file of predicted proteins (prodigal output)
    trn_file    -- prodigal training file
    prot_db     -- protein database passed to local_blastp_2file
    blast_out   -- file receiving the blast results
    l_tag_base  -- prefix for locus tags; tags are built as
                   <l_tag_base>_<n>, with n starting at 1
    blast_prefs -- blast settings, passed through to local_blastp_2file

    Returns the record loaded from ctg_fas, with its features replaced by
    one CDS SeqFeature per protein found in annot_aa.

    Raises ValueError when a protein description line does not have the
    expected '# start # end # strand # ID...' layout. A KeyError is raised
    if collect_cogs returned no entry for 'Query_<n>'.

    """
    # gene prediction
    if not path.exists(trn_file):
        train_prodigal(g_file, trn_file, "-q")
    if not path.exists(annot_aa):
        run_prodigal(ctg_fas, annot_gbk, annot_aa, trn_file, "-q")
    # blast the amino acids against COG
    if not path.exists(blast_out):
        local_blastp_2file(annot_aa, prot_db, blast_out, blast_prefs)
    # collect best hits
    rec_cogs = collect_cogs(blast_out)
    # consolidate annotated genbank file
    record = load_fasta(ctg_fas)
    record.features = []
    aa_record = load_multifasta(annot_aa)
    # feature details are parsed from the description line because the
    # prodigal output fails to load as valid genbank; the pattern is
    # loop-invariant, so it is compiled once (raw string keeps the regex
    # escapes away from Python's own string-escape handling)
    pattern = re.compile(r'.+#\s(\d+)\s#\s(\d+)\s#\s(\S*1)\s#\sID.+')
    for counter, aa_rec in enumerate(aa_record, 1):
        # annotations are looked up by position: the n-th protein is
        # expected under the key 'Query_<n>'
        annotation = rec_cogs['Query_' + str(counter)]
        defline = aa_rec.description
        match = pattern.match(defline)
        if match is None:
            # fail with a readable message rather than an AttributeError
            # on None further down
            raise ValueError("Cannot parse feature coordinates from "
                             "description line: " + str(defline))
        start_pos = int(match.group(1))
        end_pos = int(match.group(2))
        strand_pos = int(match.group(3))  # captured as '1' or '-1'
        feat_loc = FeatureLocation(start_pos, end_pos)
        l_tag = l_tag_base + "_" + str(counter)
        # consolidation feature annotations
        quals = {'note': defline,
                 'locus_tag': l_tag,
                 'fct': annotation,
                 'translation': aa_rec.seq}
        feature = SeqFeature(location=feat_loc,
                             strand=strand_pos,
                             id='cds_' + str(counter),
                             type='CDS',
                             qualifiers=quals)
        record.features.append(feature)
    return record
def unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds): """Unpack genome files. Here, unpacking means extracting data and producing specific files to standardize how the information is made available to downstream analysis. Depending on the input file format, different unpacking methods are invoked. In all cases, this ensures that for each genome, there is a multifasta file of the contigs all together as well as a separate Genbank file for each contig. Supported input file formats are the following: - mfas: Basic whole genome sequence in multifasta file of contigs. This can be used to process a finished genome in a single Fasta file as well. - cgbk: All contigs concatenated in a single GenBank file (Genoscope, French WGS). This can be used to process a finished genome in a single GanBank file as well. # TODO: provide support for other possible input formats Unpacking 'cgbk' genomes involves an initial step to detect occurrences of the sequence separator and collect the start and stop coordinates of each contig. Each pair of coordinates can then be used to extract the contig sequence and create a SeqRecord for that contig, which SeqIO normally does when it unpacks multifasta files. 
""" # set up inputs infile = genome['file'] #TODO: make GUI input loader (upstream) inpath = fixed_dirs['ori_g_dir'] + infile g_name = genome['name'] print " ", g_name, "...", # prep output destinations mfas_dir = fixed_dirs['mfas_contigs_dir'] fas_dir = fixed_dirs['fas_contigs_dir'] + g_name + "/" ensure_dir([mfas_dir, fas_dir]) mfas_file = mfas_dir + g_name + "_contigs.fas" records = [] # select unpacking method if genome['input'] is 'fas': try: path.exists(inpath) is True except ValueError: raise Exception("Bad input file path") genome_recs = load_multifasta(inpath) # generate GenBank files counter = 0 for rec in genome_recs: counter += 1 ctg_num = str(counter) new_id = g_name + "_" + ctg_num # workaround for long ids new_seq = rec.seq new_seq.alphabet = generic_dna new_rec = SeqRecord(seq=new_seq, id=new_id) records.append(new_rec) # for multifasta output fas_file = fas_dir + new_id + ".fas" write_fasta(fas_file, new_rec) elif genome['input'] is 'gbk': # load in genome data genome_rec = load_genbank(inpath) g_string = genome_rec.seq # find split coordinates coord_pairs = multisplit_finder(g_string, separator) # split record counter = 0 for (start, stop) in coord_pairs: counter += 1 ctg_num = str(counter) new_record = genome_rec[start:stop] new_record.id = g_name + "_" + ctg_num records.append(new_record) # for multifasta output fas_file = fas_dir + g_name + "_" + ctg_num + ".fas" write_fasta(fas_file, new_record) else: xmsg = "Input file format " + genome[ 'input'] + " unspecified/unsupported" raise Exception(xmsg) print counter, "contigs" # write master file write_fasta(mfas_file, records) # pass records to stats logger ctg_stats(g_name, fixed_dirs, ctg_thresholds, records)
def unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds): """Unpack genome files. Here, unpacking means extracting data and producing specific files to standardize how the information is made available to downstream analysis. Depending on the input file format, different unpacking methods are invoked. In all cases, this ensures that for each genome, there is a multifasta file of the contigs all together as well as a separate Genbank file for each contig. Supported input file formats are the following: - mfas: Basic whole genome sequence in multifasta file of contigs. This can be used to process a finished genome in a single Fasta file as well. - cgbk: All contigs concatenated in a single GenBank file (Genoscope, French WGS). This can be used to process a finished genome in a single GanBank file as well. # TODO: provide support for other possible input formats Unpacking 'cgbk' genomes involves an initial step to detect occurrences of the sequence separator and collect the start and stop coordinates of each contig. Each pair of coordinates can then be used to extract the contig sequence and create a SeqRecord for that contig, which SeqIO normally does when it unpacks multifasta files. 
""" # set up inputs infile = genome['file'] #TODO: make GUI input loader (upstream) inpath = fixed_dirs['ori_g_dir']+infile g_name = genome['name'] print " ", g_name, "...", # prep output destinations mfas_dir = fixed_dirs['mfas_contigs_dir'] fas_dir = fixed_dirs['fas_contigs_dir']+g_name+"/" ensure_dir([mfas_dir, fas_dir]) mfas_file = mfas_dir+g_name+"_contigs.fas" records = [] # select unpacking method if genome['input'] is 'fas': try: path.exists(inpath) is True except ValueError: raise Exception("Bad input file path") genome_recs = load_multifasta(inpath) # generate GenBank files counter = 0 for rec in genome_recs: counter +=1 ctg_num = str(counter) new_id = g_name+"_"+ctg_num # workaround for long ids new_seq = rec.seq new_seq.alphabet = generic_dna new_rec = SeqRecord(seq=new_seq, id=new_id) records.append(new_rec) # for multifasta output fas_file = fas_dir+new_id+".fas" write_fasta(fas_file, new_rec) elif genome['input'] is 'gbk': # load in genome data genome_rec = load_genbank(inpath) g_string = genome_rec.seq # find split coordinates coord_pairs = multisplit_finder(g_string, separator) # split record counter = 0 for (start, stop) in coord_pairs: counter +=1 ctg_num = str(counter) new_record = genome_rec[start:stop] new_record.id = g_name+"_"+ctg_num records.append(new_record) # for multifasta output fas_file = fas_dir+g_name+"_"+ctg_num+".fas" write_fasta(fas_file, new_record) else: xmsg = "Input file format "+genome['input']+" unspecified/unsupported" raise Exception(xmsg) print counter, "contigs" # write master file write_fasta(mfas_file, records) # pass records to stats logger ctg_stats(g_name, fixed_dirs, ctg_thresholds, records)
def batch_contig_annot(dataset): """Extract and annotate contigs.""" # identify dataset contig file contigs_file = dirs['assembly_dir']+dataset['f_nick']+'/'+'contigs.fa' # locate the COG database cog_db = dirs['blast_db_dir']+'Cog_LE/Cog' # make the training file training_file = dirs['annot_dir']+dataset['f_nick']+'/'+'contigs.trn' #train_prodigal(contigs_file, training_file) # set output dirs fas_out_dir = dirs['annot_dir']+dataset['f_nick']+'/fasta/' gbk_out_dir = dirs['annot_dir']+dataset['f_nick']+'/predict/' aa_out_dir = dirs['annot_dir']+dataset['f_nick']+'/aa/' blast_out_dir = dirs['annot_dir']+dataset['f_nick']+'/rpsblast/' solid_out_dir = dirs['annot_dir']+dataset['f_nick']+'/genbank/' maps_out_dir = dirs['annot_dir']+dataset['f_nick']+'/maps/' ensure_dir(fas_out_dir) ensure_dir(gbk_out_dir) ensure_dir(aa_out_dir) ensure_dir(blast_out_dir) ensure_dir(solid_out_dir) # set phage hit collector contig_hits = {} sp_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\ +dataset['f_nick']+'_kw_hits.html' all_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\ +dataset['f_nick']+'_all_hits.html' sp_hit_list_handle = open(sp_hit_list, 'w') all_hit_list_handle = open(all_hit_list, 'w') sp_hit_list_handle.write("<ul>") all_hit_list_handle.write("<ul>") # load all contigs contigs_list = load_multifasta(contigs_file) # cycle through contigs ctg_count = 0 gene_count = 0 for contig in contigs_list: ctg_count +=1 # use regex to acquire relevant record ID info pattern = re.compile(r'NODE_(\d*)_length_(\d*)_cov_(\d*)') match = pattern.match(contig.id) nick = match.group(1)+'_'+match.group(2)+'_'+match.group(3) contig.id = nick fasta_out = fas_out_dir+nick+'.fas' # write record to file write_fasta(fasta_out, contig) # create contig entry in dict contig_hits[nick] = [] # run the annotation annot_gbk = gbk_out_dir+nick+'.gbk' annot_aa = aa_out_dir+nick+'.fas' #run_prodigal(fasta_out, annot_gbk, annot_aa, training_file) # blast the amino acids against COG print '\tblasting', 
dataset['f_nick'], nick blast_out = blast_out_dir+nick+'.xml' if path.isfile(blast_out): print "\t\talready blasted" else: local_rpsblast_2file(annot_aa, cog_db, blast_out, blast_prefs) # collect best hits rec_cogs = collect_cogs(blast_out) map_file = maps_out_dir+nick+'.pdf' # consolidate annotated genbank file record = load_fasta(fasta_out) aa_defs = load_multifasta(annot_aa) features = [] counter = 1 ctg_flag_1 = 0 ctg_flag_2 = 0 for protein in aa_defs: gene_count +=1 # get feature details from description line # necessary because the prodigal output is not parser-friendly pattern = re.compile(r'\d+_\d+_\d+_\d+_\d+\s+\S+\s+(\d+)\s+\S+\s+(\d+)\s+\S+\s+(\S*\d)') match = pattern.match(protein.description) start_pos = int(match.group(1)) end_pos = int(match.group(2)) strand_pos = int(match.group(3)) feat_loc = FeatureLocation(start_pos, end_pos) annotation = rec_cogs['Query_'+str(counter)] if ctg_flag_1 is 0: all_hit_list_handle.write("</ul><br><a href='" +"../../../../" +map_file +"'>Contig " +nick+"</a><ul>") ctg_flag_1 = 1 all_hit_list_handle.write("<li>"+str(counter) +'. '+annotation+"</li>") # detect phage content in annotation phi_pattern = re.compile(r".+(COG\d+).+" "(phage|capsid|muramidase|tail|" "replication|helicase|polymerase|" "integrase|recombinase" "suppressor|hydrolase|transposase).+", re.IGNORECASE) phi_match = phi_pattern.match(annotation) if phi_match: hit_flag = 'on' hit_dict = {'CDS': counter, 'annot': annotation, 'COGs': phi_match.group} contig_hits[nick].append(hit_dict) # write out to summary file if ctg_flag_2 is 0: sp_hit_list_handle.write("</ul><br><a href='" +"../../../../" +map_file +"'>Contig " +nick+"</a><ul>") ctg_flag_2 = 1 sp_hit_list_handle.write("<li>"+str(counter) +'. 
'+annotation+"</li>") else: hit_flag = 'off' # consolidation feature annotations quals = {'note': protein.description, 'fct': annotation, 'flag': hit_flag} feature = SeqFeature(location=feat_loc, strand=strand_pos, id=protein.id, type='CDS', qualifiers=quals) features.append(feature) counter +=1 record.features = features record.description = dataset['f_nick']+'_contig_'+nick record.name = nick record.dbxrefs = ['Project:np1'] record.seq.alphabet = generic_dna gbk_out = solid_out_dir+nick+'.gbk' write_genbank(gbk_out, record) # generate graphical map ContigDraw(nick, gbk_out, map_file) sp_hit_list_handle.write("</ul>") all_hit_list_handle.write("</ul>") sp_hit_list_handle.close() all_hit_list_handle.close() print "\t", gene_count, "predicted genes in", ctg_count, "contigs"