def gbk2fas(gbk_file):
    """Convert a GenBank file to FASTA format.

    The record is loaded from gbk_file and written next to it under the
    same name with the '.gbk' part replaced by '.fas'.

    Returns the path of the FASTA file that was written.
    """
    record = load_genbank(gbk_file)
    # Use the LAST '.gbk' so a directory name containing '.gbk' cannot
    # truncate the path, and guard the "not found" case: str.find/rfind
    # return -1 there, which would silently chop the final character.
    ext_pos = gbk_file.rfind('.gbk')
    if ext_pos == -1:
        base_name = gbk_file
    else:
        base_name = gbk_file[:ext_pos]
    fas_file = base_name + '.fas'
    write_fasta(fas_file, record)
    return fas_file
def iter_align(coord_array, ref_rec, query_rec, aln_dir, segs_file): """Iterate through array of coordinates to make pairwise alignments.""" # set up the root subdirectories seqs = aln_dir + "input_seqs/" alns = aln_dir + "output_alns/" ensure_dir([seqs, alns]) aln_id = 0 aln_len = 0 # cycle through segments for segment_pair in coord_array: print ".", xa, xb, xc, xd = segment_pair # extract the corresponding sequence slices ref_seq = ref_rec[abs(xa):abs(xb)] query_seq = query_rec[abs(xc):abs(xd)] # reverse-complement sequences with negative sign if xa < 0: ref_seq = ref_seq.reverse_complement() if xc < 0: query_seq = query_seq.reverse_complement() # write sequences to file mscl_in = seqs + str(xa) + "_" + str(xb) + "_" + str(xc) + "_" + str( xd) + ".fas" write_fasta(mscl_in, [ref_seq, query_seq]) # skip segments that are too small to align if abs(abs(xa) - abs(xb)) < 10: idp = 0 else: # set up outfiles mscl_out = alns + str(xa) + "_" + str(xb) + "_" + str( xc) + "_" + str(xd) + ".aln" logfile = aln_dir + "muscle_log.txt" # perform alignment align_muscle(mscl_in, mscl_out, logfile) idntot = parse_clustal_idstars(mscl_out) idp = int((float(idntot) / len(query_seq)) * 100) aln_id += idntot aln_len += len(query_seq) # write details out to segments file line = "\t".join([str(xa), str(xb), str(xc), str(xd), str(idp) + "\n"]) open(segs_file, 'a').write(line) overall_id = int((float(aln_id) / aln_len) * 100) print "" return overall_id
def iter_align(coord_array, ref_rec, query_rec, aln_dir, segs_file): """Iterate through array of coordinates to make pairwise alignments.""" # set up the root subdirectories seqs = aln_dir+"input_seqs/" alns = aln_dir+"output_alns/" ensure_dir([seqs, alns]) aln_id = 0 aln_len = 0 # cycle through segments for segment_pair in coord_array: print ".", xa, xb, xc, xd = segment_pair # extract the corresponding sequence slices ref_seq = ref_rec[abs(xa):abs(xb)] query_seq = query_rec[abs(xc):abs(xd)] # reverse-complement sequences with negative sign if xa < 0 : ref_seq = ref_seq.reverse_complement() if xc < 0 : query_seq = query_seq.reverse_complement() # write sequences to file mscl_in = seqs+str(xa)+"_"+str(xb)+"_"+str(xc)+"_"+str(xd)+".fas" write_fasta(mscl_in, [ref_seq, query_seq]) # skip segments that are too small to align if abs(abs(xa)-abs(xb)) < 10: idp = 0 else: # set up outfiles mscl_out = alns+str(xa)+"_"+str(xb)+"_"+str(xc)+"_"+str(xd)+".aln" logfile = aln_dir+"muscle_log.txt" # perform alignment align_muscle(mscl_in, mscl_out, logfile) idntot = parse_clustal_idstars(mscl_out) idp = int((float(idntot)/len(query_seq))*100) aln_id += idntot aln_len += len(query_seq) # write details out to segments file line = "\t".join([str(xa), str(xb), str(xc), str(xd), str(idp)+"\n"]) open(segs_file, 'a').write(line) overall_id = int((float(aln_id)/aln_len)*100) print "" return overall_id
def extract_segs_seqs(self, record, out_dir):
    """Write each segment of this reference to its own FASTA file.

    For every entry of self.segs, the slice of record given by the first
    two items of seg['coords'] is extracted, reverse-complemented when
    seg['strand'] is negative, given the id "<self.name>_<seg name>" and
    written to "<out_dir><self.name>_<seg name>.fas".

    A 'ref_seg' feature spanning the same coordinates (qualifier 'id' set
    to the segment name) is appended to record.features for each segment.
    Note that record is modified in place; the same object is returned.
    """
    for seg in self.segs:
        # unpack segment coords
        seg_start, seg_stop = seg['coords'][0], seg['coords'][1]
        # extract segment sequence
        segment = record[seg_start:seg_stop]
        if seg['strand'] < 0:
            segment = segment.reverse_complement()
        # the same label serves as record id and as file base name
        seg_label = self.name + "_" + seg['name']
        segment.id = seg_label
        # write to individual file
        out_file = out_dir + seg_label + ".fas"
        write_fasta(out_file, segment)
        # record segment feature
        feat_loc = FeatureLocation(seg_start, seg_stop)
        feature = SeqFeature(location=feat_loc,
                             type='ref_seg',
                             qualifiers={'id': seg['name']})
        record.features.append(feature)
    return record
def extract_segs_seqs(self, record, out_dir):
    """Write each segment of this reference to its own FASTA file.

    For every entry of self.segs, the slice of record given by the first
    two items of seg['coords'] is extracted, reverse-complemented when
    seg['strand'] is negative, given the id "<self.name>_<seg name>" and
    written to "<out_dir><self.name>_<seg name>.fas".

    A 'ref_seg' feature spanning the same coordinates (qualifier 'id' set
    to the segment name) is appended to record.features for each segment.
    Note that record is modified in place; the same object is returned.
    """
    for seg in self.segs:
        # unpack segment coords
        seg_start, seg_stop = seg['coords'][0], seg['coords'][1]
        # extract segment sequence
        segment = record[seg_start:seg_stop]
        if seg['strand'] < 0:
            segment = segment.reverse_complement()
        # the same label serves as record id and as file base name
        seg_label = self.name + "_" + seg['name']
        segment.id = seg_label
        # write to individual file
        out_file = out_dir + seg_label + ".fas"
        write_fasta(out_file, segment)
        # record segment feature
        feat_loc = FeatureLocation(seg_start, seg_stop)
        feature = SeqFeature(location=feat_loc,
                             type='ref_seg',
                             qualifiers={'id': seg['name']})
        record.features.append(feature)
    return record
def basic_batch_blast(genomes, run_ref, blast_mode, r_root_dir, run_dirs, fixed_dirs, blast_prefs, run_id, timestamp): """Send batch jobs to Blast. Muxes to multiple reference DBs.""" # load inputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" in_root = run_root+run_dirs['ref_seg_dir']+ref_n+"/" print " ", ref_n # log logstring = "".join(["\n\n# Blast segs to genomes @", timestamp, "\n\n"]) run_ref.log(logstring) # do blast for seg in run_ref.segs: input_file = in_root+ref_n+"_"+seg['name']+".fas" # translate if required if blast_mode == 'tn': record = load_fasta(input_file) record.seq = record.seq.translate() input_file = in_root+ref_n+"_"+seg['name']+"_aa.fas" # substitute write_fasta(input_file, record) out_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg['name']+"/" ensure_dir([out_dir]) print "\t", seg['name'], for genome in genomes: g_name = genome['name'] db_path = fixed_dirs['blast_db_dir']+g_name outfile = out_dir+g_name+"_out.txt" print ".", if blast_mode == 'n': local_blastn_2file(input_file, db_path, outfile, blast_prefs) elif blast_mode == 'tx': local_tblastx_2file(input_file, db_path, outfile, blast_prefs) elif blast_mode == 'tn': local_tblastn_2file(input_file, db_path, outfile, blast_prefs) print "" run_ref.log("All OK") return "OK"
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator, genomes, run_id, timestamp, mtype, mode): """Build a scaffold of contigs based on the reference. This takes contigs that gave positive hits when blasted with reference segments. The contigs were aligned against the complete reference in a previous step for mapping purposes. Now the output of that step is re-used determine their position. A caveat is that if there are natural local rearrangements in the sequence relative to the reference, they may not be resolved appropriately. The problem is somewhat moderated by the fact that this function takes the best (usually the largest) hit region as "anchor" to position the contig within the scaffold. But if the rearranged region takes up a significant portion of the contig length, the anchoring will probably not be called correctly. Visual inspection of the finalized maps should help diagnose any such problems. The order can be fixed manually using the Mauve Contig Mover, which is part of Mauve 2. Note that not all hit contigs are "real" hits, so filtering should be applied before scaffolding to generate constructs. Model-based filtering produces a list of contigs that will be passed to the scaffolder. If filtering manually by looking at the maps, there are two options available: either select exclusively OR exclude a subset of contigs for the scaffolding process. This is done by listing their ID number in the genome dictionaries in the config file then resuming the pipeline from this step. 
""" # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir + run_id + "/" ctgs_root = run_root + run_dirs['run_gbk_ctgs_dir'] + ref_n + "/" mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/" scaffolds_dir = run_root + run_dirs['scaffolds_dir'] + ref_n + "/" print " ", ref_n # log logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs g_name = genome['name'] ctgs_dir = ctgs_root + g_name + "/" print "\t", g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # set outputs mauve_dir = mauve_root + g_name + "/" ensure_dir([mauve_dir, scaffolds_dir]) scaff_fas = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.fas" scaff_gbk = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.gbk" # list genbank files in matches directory dir_contents = listdir(ctgs_dir) anchors_array = np.zeros(1, dtype=[('ctg', 'i4'), ('start', 'i4'), ('end', 'i4'), ('orient', 'i2')]) # identify contigs we want to select subset = [] for item in dir_contents: pattern = re.compile(r'.*_(\d*)\.gbk$') match = pattern.match(item) if match: ctg_num = match.group(1) if mode == "exclude": try: if int(ctg_num) in genome[mode]: msg = "(" + ctg_num + ")" print msg, run_ref.log(msg) else: subset.append(ctg_num) except KeyError: msg = "WARNING: no ignored segments list, including all" print msg msg = ctg_num print msg, subset.append(ctg_num) run_ref.log(msg) elif mode == "select": try: if int(ctg_num) in genome[mode]: msg = ctg_num print msg, run_ref.log(msg) subset.append(ctg_num) else: msg = "(" + ctg_num + ")" print msg, run_ref.log(msg) except KeyError: msg = "WARNING: no selected segments list, including all" print msg msg = ctg_num print msg, subset.append(ctg_num) run_ref.log(msg) # at this point we should have a subset of contigs selected for ctg_num in subset: logstring = "".join(["\t", ctg_num]) run_ref.log(logstring) # set inputs mauve_file 
= mauve_dir + ctg_num + ".mauve" bb_file = mauve_file + ".backbone" try: # parse Mauve output coords = mauver_load2_k0(bb_file, prox_D, mtype) # determine which segment to use as anchor anchor_seg = get_anchor_loc(coords) anchors_array = np.insert( anchors_array, 0, (ctg_num, anchor_seg['start'], anchor_seg['end'], anchor_seg['orient'])) except IOError: msg = "\tERROR: Mauve alignment not found\n\t" print msg run_ref.log(msg) except Exception: msg = "\tERROR: Iteration failure\n\t" print msg run_ref.log(msg) # abort if there is no valid contig to proceed with try: assert len(anchors_array) > 1 # always 1 left from stub except AssertionError: msg = "\tWARNING: Contig list empty\n\t" print msg run_ref.log(msg) else: # order contigs by anchor location anchors_array = np.sort(anchors_array, order='start') # load contig records from the genbank files in the matches directory ctg_list = [] for ctg_anchor in anchors_array: ctg_num = ctg_anchor['ctg'] if ctg_num > 0: contig_gbk = ctgs_dir + g_name + "_" + str( ctg_num) + ".gbk" record = load_genbank(contig_gbk) if ctg_anchor['orient'] == -1: # flip record record = record.reverse_complement(id=True, name=True, annotations=True, description=True) ctg_list.append(record) else: # workaround for having 0 value leftover from stub pass # having it might come in handy in later dev # output scaffold files write_fasta(scaff_fas, ctg_list) scaff_record = SeqRecord('', id='temp') scaff_bumper = SeqRecord(separator, id='join') for record in ctg_list: feat_start = len(scaff_record.seq) scaff_record += record feat_stop = len(scaff_record.seq) scaff_record += scaff_bumper feat_loc = FeatureLocation(feat_start, feat_stop) pattern = re.compile(r'.*_(\d*)$') match = pattern.match(record.id) try: ctg_num = match.group(1) except Exception: ctg_num = 'N' feature = SeqFeature(location=feat_loc, type='contig', qualifiers={'id': ctg_num}) scaff_record.features.append(feature) scaff_record.description = g_name + " scaffold from " + ref_n try: 
scaff_record.id = g_name write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper except ValueError: scaff_record.id = g_name[:10] write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper print ""
def unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds): """Unpack genome files. Here, unpacking means extracting data and producing specific files to standardize how the information is made available to downstream analysis. Depending on the input file format, different unpacking methods are invoked. In all cases, this ensures that for each genome, there is a multifasta file of the contigs all together as well as a separate Genbank file for each contig. Supported input file formats are the following: - mfas: Basic whole genome sequence in multifasta file of contigs. This can be used to process a finished genome in a single Fasta file as well. - cgbk: All contigs concatenated in a single GenBank file (Genoscope, French WGS). This can be used to process a finished genome in a single GanBank file as well. # TODO: provide support for other possible input formats Unpacking 'cgbk' genomes involves an initial step to detect occurrences of the sequence separator and collect the start and stop coordinates of each contig. Each pair of coordinates can then be used to extract the contig sequence and create a SeqRecord for that contig, which SeqIO normally does when it unpacks multifasta files. 
""" # set up inputs infile = genome['file'] #TODO: make GUI input loader (upstream) inpath = fixed_dirs['ori_g_dir'] + infile g_name = genome['name'] print " ", g_name, "...", # prep output destinations mfas_dir = fixed_dirs['mfas_contigs_dir'] fas_dir = fixed_dirs['fas_contigs_dir'] + g_name + "/" ensure_dir([mfas_dir, fas_dir]) mfas_file = mfas_dir + g_name + "_contigs.fas" records = [] # select unpacking method if genome['input'] is 'fas': try: path.exists(inpath) is True except ValueError: raise Exception("Bad input file path") genome_recs = load_multifasta(inpath) # generate GenBank files counter = 0 for rec in genome_recs: counter += 1 ctg_num = str(counter) new_id = g_name + "_" + ctg_num # workaround for long ids new_seq = rec.seq new_seq.alphabet = generic_dna new_rec = SeqRecord(seq=new_seq, id=new_id) records.append(new_rec) # for multifasta output fas_file = fas_dir + new_id + ".fas" write_fasta(fas_file, new_rec) elif genome['input'] is 'gbk': # load in genome data genome_rec = load_genbank(inpath) g_string = genome_rec.seq # find split coordinates coord_pairs = multisplit_finder(g_string, separator) # split record counter = 0 for (start, stop) in coord_pairs: counter += 1 ctg_num = str(counter) new_record = genome_rec[start:stop] new_record.id = g_name + "_" + ctg_num records.append(new_record) # for multifasta output fas_file = fas_dir + g_name + "_" + ctg_num + ".fas" write_fasta(fas_file, new_record) else: xmsg = "Input file format " + genome[ 'input'] + " unspecified/unsupported" raise Exception(xmsg) print counter, "contigs" # write master file write_fasta(mfas_file, records) # pass records to stats logger ctg_stats(g_name, fixed_dirs, ctg_thresholds, records)
def batch_contig_annot(dataset): """Extract and annotate contigs.""" # identify dataset contig file contigs_file = dirs['assembly_dir']+dataset['f_nick']+'/'+'contigs.fa' # locate the COG database cog_db = dirs['blast_db_dir']+'Cog_LE/Cog' # make the training file training_file = dirs['annot_dir']+dataset['f_nick']+'/'+'contigs.trn' #train_prodigal(contigs_file, training_file) # set output dirs fas_out_dir = dirs['annot_dir']+dataset['f_nick']+'/fasta/' gbk_out_dir = dirs['annot_dir']+dataset['f_nick']+'/predict/' aa_out_dir = dirs['annot_dir']+dataset['f_nick']+'/aa/' blast_out_dir = dirs['annot_dir']+dataset['f_nick']+'/rpsblast/' solid_out_dir = dirs['annot_dir']+dataset['f_nick']+'/genbank/' maps_out_dir = dirs['annot_dir']+dataset['f_nick']+'/maps/' ensure_dir(fas_out_dir) ensure_dir(gbk_out_dir) ensure_dir(aa_out_dir) ensure_dir(blast_out_dir) ensure_dir(solid_out_dir) # set phage hit collector contig_hits = {} sp_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\ +dataset['f_nick']+'_kw_hits.html' all_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\ +dataset['f_nick']+'_all_hits.html' sp_hit_list_handle = open(sp_hit_list, 'w') all_hit_list_handle = open(all_hit_list, 'w') sp_hit_list_handle.write("<ul>") all_hit_list_handle.write("<ul>") # load all contigs contigs_list = load_multifasta(contigs_file) # cycle through contigs ctg_count = 0 gene_count = 0 for contig in contigs_list: ctg_count +=1 # use regex to acquire relevant record ID info pattern = re.compile(r'NODE_(\d*)_length_(\d*)_cov_(\d*)') match = pattern.match(contig.id) nick = match.group(1)+'_'+match.group(2)+'_'+match.group(3) contig.id = nick fasta_out = fas_out_dir+nick+'.fas' # write record to file write_fasta(fasta_out, contig) # create contig entry in dict contig_hits[nick] = [] # run the annotation annot_gbk = gbk_out_dir+nick+'.gbk' annot_aa = aa_out_dir+nick+'.fas' #run_prodigal(fasta_out, annot_gbk, annot_aa, training_file) # blast the amino acids against COG print '\tblasting', 
dataset['f_nick'], nick blast_out = blast_out_dir+nick+'.xml' if path.isfile(blast_out): print "\t\talready blasted" else: local_rpsblast_2file(annot_aa, cog_db, blast_out, blast_prefs) # collect best hits rec_cogs = collect_cogs(blast_out) map_file = maps_out_dir+nick+'.pdf' # consolidate annotated genbank file record = load_fasta(fasta_out) aa_defs = load_multifasta(annot_aa) features = [] counter = 1 ctg_flag_1 = 0 ctg_flag_2 = 0 for protein in aa_defs: gene_count +=1 # get feature details from description line # necessary because the prodigal output is not parser-friendly pattern = re.compile(r'\d+_\d+_\d+_\d+_\d+\s+\S+\s+(\d+)\s+\S+\s+(\d+)\s+\S+\s+(\S*\d)') match = pattern.match(protein.description) start_pos = int(match.group(1)) end_pos = int(match.group(2)) strand_pos = int(match.group(3)) feat_loc = FeatureLocation(start_pos, end_pos) annotation = rec_cogs['Query_'+str(counter)] if ctg_flag_1 is 0: all_hit_list_handle.write("</ul><br><a href='" +"../../../../" +map_file +"'>Contig " +nick+"</a><ul>") ctg_flag_1 = 1 all_hit_list_handle.write("<li>"+str(counter) +'. '+annotation+"</li>") # detect phage content in annotation phi_pattern = re.compile(r".+(COG\d+).+" "(phage|capsid|muramidase|tail|" "replication|helicase|polymerase|" "integrase|recombinase" "suppressor|hydrolase|transposase).+", re.IGNORECASE) phi_match = phi_pattern.match(annotation) if phi_match: hit_flag = 'on' hit_dict = {'CDS': counter, 'annot': annotation, 'COGs': phi_match.group} contig_hits[nick].append(hit_dict) # write out to summary file if ctg_flag_2 is 0: sp_hit_list_handle.write("</ul><br><a href='" +"../../../../" +map_file +"'>Contig " +nick+"</a><ul>") ctg_flag_2 = 1 sp_hit_list_handle.write("<li>"+str(counter) +'. 
'+annotation+"</li>") else: hit_flag = 'off' # consolidation feature annotations quals = {'note': protein.description, 'fct': annotation, 'flag': hit_flag} feature = SeqFeature(location=feat_loc, strand=strand_pos, id=protein.id, type='CDS', qualifiers=quals) features.append(feature) counter +=1 record.features = features record.description = dataset['f_nick']+'_contig_'+nick record.name = nick record.dbxrefs = ['Project:np1'] record.seq.alphabet = generic_dna gbk_out = solid_out_dir+nick+'.gbk' write_genbank(gbk_out, record) # generate graphical map ContigDraw(nick, gbk_out, map_file) sp_hit_list_handle.write("</ul>") all_hit_list_handle.write("</ul>") sp_hit_list_handle.close() all_hit_list_handle.close() print "\t", gene_count, "predicted genes in", ctg_count, "contigs"
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs, run_id, timestamp, prot_db_name, project_id): """Re-annotate contig and extract reference segments using coordinates.""" # set inputs and outputs run_root = r_root_dir + run_id + "/" ref_name = ref['name'] in_file = fixed_dirs['ori_g_dir'] + ref['file'] seg_out_root = run_root + run_dirs['ref_seg_dir'] + ref_name + "/" gen_fas_root = fixed_dirs['fas_contigs_dir'] + ref_name + "/" if ref_annot_flag: ref_gbk = run_root + run_dirs[ 'ref_gbk_dir'] + ref_name + "_re-annot.gbk" else: ## bypass re-annotated ONLY IF ORIGINAL INPUT IS GBK #todo: fix ref_gbk = in_file ref_fas = run_root + run_dirs['ref_fas_dir'] + ref_name + ".fas" genome_fas = gen_fas_root + ref_name + "_1.fas" report_root = run_root + run_dirs['reports'] + ref_name + "/" ref_log = report_root + run_id + "_" + ref_name + "_log.txt" ensure_dir([seg_out_root, report_root, gen_fas_root]) print " ", ref_name, "...", # initialize run_ref object run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'], ref['capture'], ref_fas, ref_gbk, seg_out_root, ref_log) # initialize reference log cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"] open(ref_log, 'w').write(" ".join(cl_header)) # open record and ensure we have a fasta in the right place if not path.exists(ref_fas): if run_ref.input == 'fas': copyfile(in_file, ref_fas) elif run_ref.input == 'gbk': record = load_genbank(in_file) record.id = ref_name write_fasta(ref_fas, record) else: msg = "ERROR: Input not recognized for " + ref_name run_ref.log(msg) raise Exception(msg) # make a BLAST DB make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs) copyfile(ref_fas, genome_fas) # re-annotate ref contig if ref_annot_flag: record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs, project_id) else: ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix record = load_genbank(in_file) # load or generate segment definitions if run_ref.seg_mode == 
'chop': run_ref.get_segs_from_chop(len(record.seq), ref['chop_size']) elif run_ref.seg_mode == 'list': run_ref.get_segs_from_list(ref['segs']) elif run_ref.seg_mode == 'feats': run_ref.get_segs_from_feats(ref['feat_type']) # extract segment sequences rec_annot = run_ref.extract_segs_seqs(record, seg_out_root) # write re-annotated reference sequence to file write_genbank(ref_gbk, rec_annot) # report results logstring = " ".join([str(len(run_ref.segs)), "segments"]) print logstring run_ref.log(logstring) return run_ref
def unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds): """Unpack genome files. Here, unpacking means extracting data and producing specific files to standardize how the information is made available to downstream analysis. Depending on the input file format, different unpacking methods are invoked. In all cases, this ensures that for each genome, there is a multifasta file of the contigs all together as well as a separate Genbank file for each contig. Supported input file formats are the following: - mfas: Basic whole genome sequence in multifasta file of contigs. This can be used to process a finished genome in a single Fasta file as well. - cgbk: All contigs concatenated in a single GenBank file (Genoscope, French WGS). This can be used to process a finished genome in a single GanBank file as well. # TODO: provide support for other possible input formats Unpacking 'cgbk' genomes involves an initial step to detect occurrences of the sequence separator and collect the start and stop coordinates of each contig. Each pair of coordinates can then be used to extract the contig sequence and create a SeqRecord for that contig, which SeqIO normally does when it unpacks multifasta files. 
""" # set up inputs infile = genome['file'] #TODO: make GUI input loader (upstream) inpath = fixed_dirs['ori_g_dir']+infile g_name = genome['name'] print " ", g_name, "...", # prep output destinations mfas_dir = fixed_dirs['mfas_contigs_dir'] fas_dir = fixed_dirs['fas_contigs_dir']+g_name+"/" ensure_dir([mfas_dir, fas_dir]) mfas_file = mfas_dir+g_name+"_contigs.fas" records = [] # select unpacking method if genome['input'] is 'fas': try: path.exists(inpath) is True except ValueError: raise Exception("Bad input file path") genome_recs = load_multifasta(inpath) # generate GenBank files counter = 0 for rec in genome_recs: counter +=1 ctg_num = str(counter) new_id = g_name+"_"+ctg_num # workaround for long ids new_seq = rec.seq new_seq.alphabet = generic_dna new_rec = SeqRecord(seq=new_seq, id=new_id) records.append(new_rec) # for multifasta output fas_file = fas_dir+new_id+".fas" write_fasta(fas_file, new_rec) elif genome['input'] is 'gbk': # load in genome data genome_rec = load_genbank(inpath) g_string = genome_rec.seq # find split coordinates coord_pairs = multisplit_finder(g_string, separator) # split record counter = 0 for (start, stop) in coord_pairs: counter +=1 ctg_num = str(counter) new_record = genome_rec[start:stop] new_record.id = g_name+"_"+ctg_num records.append(new_record) # for multifasta output fas_file = fas_dir+g_name+"_"+ctg_num+".fas" write_fasta(fas_file, new_record) else: xmsg = "Input file format "+genome['input']+" unspecified/unsupported" raise Exception(xmsg) print counter, "contigs" # write master file write_fasta(mfas_file, records) # pass records to stats logger ctg_stats(g_name, fixed_dirs, ctg_thresholds, records)
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs, run_id, timestamp, prot_db_name, project_id): """Re-annotate contig and extract reference segments using coordinates.""" # set inputs and outputs run_root = r_root_dir+run_id+"/" ref_name = ref['name'] in_file = fixed_dirs['ori_g_dir']+ref['file'] seg_out_root = run_root+run_dirs['ref_seg_dir']+ref_name+"/" gen_fas_root = fixed_dirs['fas_contigs_dir']+ref_name+"/" if ref_annot_flag: ref_gbk = run_root+run_dirs['ref_gbk_dir']+ref_name+"_re-annot.gbk" else: ## bypass re-annotated ONLY IF ORIGINAL INPUT IS GBK #todo: fix ref_gbk = in_file ref_fas = run_root+run_dirs['ref_fas_dir']+ref_name+".fas" genome_fas = gen_fas_root+ref_name+"_1.fas" report_root = run_root+run_dirs['reports']+ref_name+"/" ref_log = report_root+run_id+"_"+ref_name+"_log.txt" ensure_dir([seg_out_root, report_root, gen_fas_root]) print " ", ref_name, "...", # initialize run_ref object run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'], ref['capture'], ref_fas, ref_gbk, seg_out_root, ref_log) # initialize reference log cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"] open(ref_log, 'w').write(" ".join(cl_header)) # open record and ensure we have a fasta in the right place if not path.exists(ref_fas): if run_ref.input == 'fas': copyfile(in_file, ref_fas) elif run_ref.input == 'gbk': record = load_genbank(in_file) record.id = ref_name write_fasta(ref_fas, record) else: msg = "ERROR: Input not recognized for "+ref_name run_ref.log(msg) raise Exception(msg) # make a BLAST DB make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs) copyfile(ref_fas, genome_fas) # re-annotate ref contig if ref_annot_flag: record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs, project_id) else: ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix record = load_genbank(in_file) # load or generate segment definitions if run_ref.seg_mode == 'chop': run_ref.get_segs_from_chop(len(record.seq), 
ref['chop_size']) elif run_ref.seg_mode == 'list': run_ref.get_segs_from_list(ref['segs']) elif run_ref.seg_mode == 'feats': run_ref.get_segs_from_feats(ref['feat_type']) # extract segment sequences rec_annot = run_ref.extract_segs_seqs(record, seg_out_root) # write re-annotated reference sequence to file write_genbank(ref_gbk, rec_annot) # report results logstring = " ".join([str(len(run_ref.segs)), "segments"]) print logstring run_ref.log(logstring) return run_ref
if __name__ == "__main__":
    # Parse the BLAST report and both sequence collections.
    blast_results = parse_blast_output(BLAST_OUTPUT)
    id_to_sequence = parse_fasta(PDB_SEQS_STRUCTURE)
    ref_to_sequence = parse_fasta(REFERENCE_PROTEOME)

    # Order the hits, combine them, then keep the best hit for each query.
    ordered_blast_results = order_hits(blast_results)
    combined_blast_results = combine_all_hits(ordered_blast_results)
    best_hits = get_best_hits(combined_blast_results)

    for query, best_hit in best_hits.items():
        # The second item of a best hit is the id of the reference sequence.
        ref_id = best_hit[1]

        # Put the query and its reference together in one FASTA file.
        pair_sequences = {
            query: id_to_sequence[query],
            ref_id: ref_to_sequence[ref_id],
        }
        pair_path = f'{TEMP}{query}.fasta'
        write_fasta(pair_sequences, pair_path)

        # Align the pair; the alignment is recomputed on every run.
        aligned_path = f'{TEMP}{query}_aligned.fasta'
        run_muscle(pair_path, aligned_path)

        # Derive the mapping from the aligned pair and save it.
        mapping = make_mapping(parse_fasta(aligned_path))
        write_residue_mapping(mapping,
                              f'{MAPPING_DIR}{query}_{ref_id}.txt')
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator, genomes, run_id, timestamp, mtype, mode): """Build a scaffold of contigs based on the reference. This takes contigs that gave positive hits when blasted with reference segments. The contigs were aligned against the complete reference in a previous step for mapping purposes. Now the output of that step is re-used determine their position. A caveat is that if there are natural local rearrangements in the sequence relative to the reference, they may not be resolved appropriately. The problem is somewhat moderated by the fact that this function takes the best (usually the largest) hit region as "anchor" to position the contig within the scaffold. But if the rearranged region takes up a significant portion of the contig length, the anchoring will probably not be called correctly. Visual inspection of the finalized maps should help diagnose any such problems. The order can be fixed manually using the Mauve Contig Mover, which is part of Mauve 2. Note that not all hit contigs are "real" hits, so filtering should be applied before scaffolding to generate constructs. Model-based filtering produces a list of contigs that will be passed to the scaffolder. If filtering manually by looking at the maps, there are two options available: either select exclusively OR exclude a subset of contigs for the scaffolding process. This is done by listing their ID number in the genome dictionaries in the config file then resuming the pipeline from this step. 
""" # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/" mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/" scaffolds_dir = run_root+run_dirs['scaffolds_dir']+ref_n+"/" print " ", ref_n # log logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs g_name = genome['name'] ctgs_dir = ctgs_root+g_name+"/" print "\t", g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # set outputs mauve_dir = mauve_root+g_name+"/" ensure_dir([mauve_dir, scaffolds_dir]) scaff_fas = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.fas" scaff_gbk = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.gbk" # list genbank files in matches directory dir_contents = listdir(ctgs_dir) anchors_array = np.zeros(1, dtype=[('ctg', 'i4'), ('start', 'i4'), ('end', 'i4'), ('orient', 'i2')]) # identify contigs we want to select subset = [] for item in dir_contents: pattern = re.compile(r'.*_(\d*)\.gbk$') match = pattern.match(item) if match: ctg_num = match.group(1) if mode == "exclude": try: if int(ctg_num) in genome[mode]: msg = "("+ctg_num+")" print msg, run_ref.log(msg) else: subset.append(ctg_num) except KeyError: msg = "WARNING: no ignored segments list, including all" print msg msg = ctg_num print msg, subset.append(ctg_num) run_ref.log(msg) elif mode == "select": try: if int(ctg_num) in genome[mode]: msg = ctg_num print msg, run_ref.log(msg) subset.append(ctg_num) else: msg = "("+ctg_num+")" print msg, run_ref.log(msg) except KeyError: msg = "WARNING: no selected segments list, including all" print msg msg = ctg_num print msg, subset.append(ctg_num) run_ref.log(msg) # at this point we should have a subset of contigs selected for ctg_num in subset: logstring = "".join(["\t", ctg_num]) run_ref.log(logstring) # set inputs mauve_file = mauve_dir+ctg_num+".mauve" bb_file = 
mauve_file+".backbone" try: # parse Mauve output coords = mauver_load2_k0(bb_file, prox_D, mtype) # determine which segment to use as anchor anchor_seg = get_anchor_loc(coords) anchors_array = np.insert(anchors_array, 0, (ctg_num, anchor_seg['start'], anchor_seg['end'], anchor_seg['orient'])) except IOError: msg = "\tERROR: Mauve alignment not found\n\t" print msg run_ref.log(msg) except Exception: msg = "\tERROR: Iteration failure\n\t" print msg run_ref.log(msg) # abort if there is no valid contig to proceed with try: assert len(anchors_array) > 1 # always 1 left from stub except AssertionError: msg = "\tWARNING: Contig list empty\n\t" print msg run_ref.log(msg) else: # order contigs by anchor location anchors_array = np.sort(anchors_array, order='start') # load contig records from the genbank files in the matches directory ctg_list = [] for ctg_anchor in anchors_array: ctg_num = ctg_anchor['ctg'] if ctg_num > 0: contig_gbk = ctgs_dir+g_name+"_"+str(ctg_num)+".gbk" record = load_genbank(contig_gbk) if ctg_anchor['orient'] == -1: # flip record record = record.reverse_complement(id=True, name=True, annotations=True, description=True) ctg_list.append(record) else: # workaround for having 0 value leftover from stub pass # having it might come in handy in later dev # output scaffold files write_fasta(scaff_fas, ctg_list) scaff_record = SeqRecord('', id='temp') scaff_bumper = SeqRecord(separator, id='join') for record in ctg_list: feat_start = len(scaff_record.seq) scaff_record += record feat_stop = len(scaff_record.seq) scaff_record += scaff_bumper feat_loc = FeatureLocation(feat_start, feat_stop) pattern = re.compile(r'.*_(\d*)$') match = pattern.match(record.id) try: ctg_num = match.group(1) except Exception: ctg_num = 'N' feature = SeqFeature(location=feat_loc, type='contig', qualifiers={'id': ctg_num}) scaff_record.features.append(feature) scaff_record.description = g_name+" scaffold from "+ref_n try: scaff_record.id = g_name write_genbank(scaff_gbk, 
scaff_record[:-100]) # rm last bumper except ValueError: scaff_record.id = g_name[:10] write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper print ""
def glompX_blast_out(genomes, run_ref, blast_mode, r_root_dir, run_dirs, run_id, fixed_dirs, blast_dtypes, references, min_nt_match, min_nt_score, min_nt_idp, min_aa_match, min_aa_score, min_aa_idp, capture_span, timestamp): """Collect Blast results and extract match contigs.""" # load inputs ref_n = run_ref.name run_root = r_root_dir + run_id + "/" match_root = run_root + run_dirs['match_out_dir'] + ref_n + "/" capture_root = run_root + run_dirs['capture_dir'] + ref_n + "/" print " ", ref_n # log logstring = "".join(["\n\n# Collect Blast results @", timestamp, "\n\n"]) run_ref.log(logstring) # collect results ref_hits = {} control_scores = [] run_ref.log("Segs/Gs\t") run_ref.log("\t".join([genome['name'] for genome in genomes])) for seg in run_ref.segs: seg_n = seg['name'] print "\t", seg_n, "...", run_ref.log("".join(["\n", seg_n])) blast_dir = run_root + run_dirs[ 'blast_out_dir'] + ref_n + "/" + seg_n + "/" capture_dir = capture_root + "/" + seg_n + "/" ensure_dir([blast_dir, capture_dir]) ref_flag = True for genome in genomes: g_name = genome['name'] print "|", # process if g_name not in ref_hits.keys(): ref_hits[g_name] = {} matches_dir = match_root + g_name + "/" ensure_dir([matches_dir]) blast_infile = blast_dir + g_name + "_out.txt" genome_ctg_dir = fixed_dirs['fas_contigs_dir'] + g_name + "/" rec_array = read_array(blast_infile, blast_dtypes) if len(rec_array) > 0: # take qualified hits p_cnt = 0 n_cnt = 0 if g_name in [ref['name'] for ref in references]: copyfile(genome_ctg_dir + g_name + "_1.fas", matches_dir + g_name + ".fas") if ref_flag: # positive control TODO: better solution control_scores.append(rec_array[0][11]) ref_flag = False for line in rec_array: idp = line[2] q_start, q_stop = line[8], line[9] score = line[11] length = abs(q_stop - q_start) # check the blast mode to use the right thresholds if blast_mode == 'n' or blast_mode == 'tx': min_match = min_nt_match min_score = min_nt_score min_idp = min_nt_idp elif blast_mode == 'tn': min_match = 
min_aa_match min_score = min_aa_score min_idp = min_aa_idp else: # default to nucleotide mode min_match = min_nt_match min_score = min_nt_score min_idp = min_nt_idp if length > min_match and score > min_score and idp > min_idp: print "+", p_cnt += 1 contig_id = line[1] if contig_id not in ref_hits[g_name].keys(): ref_hits[g_name][contig_id] = {seg_n: score} else: ref_hits[g_name][contig_id][seg_n] = score pattern = re.compile(r'(' + contig_id + ')\.fas') for item in listdir(genome_ctg_dir): match = re.match(pattern, item) if match: fas_file = matches_dir + match.group( 1) + ".fas" if not path.exists(fas_file): copyfile(genome_ctg_dir + item, fas_file) # context capture capture_flag = False while True: try: if int(seg_n) in run_ref.capture: capture_flag = True else: break except ValueError: if seg_n in run_ref.capture: capture_flag = True else: break else: break if capture_flag: # load the sequence contig_file = matches_dir + contig_id + ".fas" contig_rec = load_fasta(contig_file) # check orientation if q_start < q_stop: c_start = q_start - capture_span c_stop = q_stop + capture_span else: c_start = q_stop - capture_span c_stop = q_start + capture_span print c_start, c_stop # check limits if c_start < 0: c_start = 1 if c_stop > len(contig_rec.seq): c_stop = len(contig_rec.seq) # proceed cxt_file = capture_dir + g_name + "_" + contig_id + ".fas" cxt_rec = SeqRecord( id=contig_id + "_" + str(c_start) + "_" + str(c_stop), seq=contig_rec.seq[c_start:c_stop]) write_fasta(cxt_file, cxt_rec) else: print "-", n_cnt += 1 if n_cnt > 0: logstring = "".join( ["\t", str(p_cnt), " (", str(n_cnt), ")"]) else: logstring = "".join(["\t", str(p_cnt)]) run_ref.log(logstring) else: print "-", run_ref.log("".join(["\t", "0"])) print "" return ref_hits, control_scores
def glompX_blast_out(genomes, run_ref, blast_mode, r_root_dir, run_dirs, run_id, fixed_dirs, blast_dtypes, references, min_nt_match, min_nt_score, min_nt_idp, min_aa_match, min_aa_score, min_aa_idp, capture_span, timestamp): """Collect Blast results and extract match contigs.""" # load inputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" match_root = run_root+run_dirs['match_out_dir']+ref_n+"/" capture_root = run_root+run_dirs['capture_dir']+ref_n+"/" print " ", ref_n # log logstring = "".join(["\n\n# Collect Blast results @", timestamp, "\n\n"]) run_ref.log(logstring) # collect results ref_hits = {} control_scores = [] run_ref.log("Segs/Gs\t") run_ref.log("\t".join([genome['name'] for genome in genomes])) for seg in run_ref.segs: seg_n = seg['name'] print "\t", seg_n, "...", run_ref.log("".join(["\n", seg_n])) blast_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg_n+"/" capture_dir = capture_root+"/"+seg_n+"/" ensure_dir([blast_dir, capture_dir]) ref_flag = True for genome in genomes: g_name = genome['name'] print "|", # process if g_name not in ref_hits.keys(): ref_hits[g_name] = {} matches_dir = match_root+g_name+"/" ensure_dir([matches_dir]) blast_infile = blast_dir+g_name+"_out.txt" genome_ctg_dir = fixed_dirs['fas_contigs_dir']+g_name+"/" rec_array = read_array(blast_infile, blast_dtypes) if len(rec_array) > 0: # take qualified hits p_cnt = 0 n_cnt = 0 if g_name in [ref['name'] for ref in references]: copyfile(genome_ctg_dir+g_name+"_1.fas", matches_dir+g_name+".fas") if ref_flag: # positive control TODO: better solution control_scores.append(rec_array[0][11]) ref_flag = False for line in rec_array: idp = line[2] q_start, q_stop = line[8], line[9] score = line[11] length = abs(q_stop-q_start) # check the blast mode to use the right thresholds if blast_mode == 'n' or blast_mode == 'tx': min_match = min_nt_match min_score = min_nt_score min_idp = min_nt_idp elif blast_mode == 'tn': min_match = min_aa_match min_score = min_aa_score min_idp = 
min_aa_idp else: # default to nucleotide mode min_match = min_nt_match min_score = min_nt_score min_idp = min_nt_idp if length>min_match and score>min_score and idp>min_idp: print "+", p_cnt +=1 contig_id = line[1] if contig_id not in ref_hits[g_name].keys(): ref_hits[g_name][contig_id] = {seg_n: score} else: ref_hits[g_name][contig_id][seg_n] = score pattern = re.compile(r'('+contig_id+')\.fas') for item in listdir(genome_ctg_dir): match = re.match(pattern, item) if match: fas_file = matches_dir+match.group(1)+".fas" if not path.exists(fas_file): copyfile(genome_ctg_dir+item, fas_file) # context capture capture_flag = False while True: try: if int(seg_n) in run_ref.capture: capture_flag = True else: break except ValueError: if seg_n in run_ref.capture: capture_flag = True else: break else: break if capture_flag: # load the sequence contig_file = matches_dir+contig_id+".fas" contig_rec = load_fasta(contig_file) # check orientation if q_start < q_stop: c_start = q_start-capture_span c_stop = q_stop+capture_span else: c_start = q_stop-capture_span c_stop = q_start+capture_span print c_start, c_stop # check limits if c_start < 0: c_start = 1 if c_stop > len(contig_rec.seq): c_stop = len(contig_rec.seq) # proceed cxt_file = capture_dir+g_name+"_"+contig_id+".fas" cxt_rec = SeqRecord(id=contig_id+"_" +str(c_start)+"_" +str(c_stop), seq=contig_rec.seq [c_start:c_stop]) write_fasta(cxt_file, cxt_rec) else: print "-", n_cnt +=1 if n_cnt > 0: logstring = "".join(["\t", str(p_cnt), " (", str(n_cnt), ")"]) else: logstring = "".join(["\t", str(p_cnt)]) run_ref.log(logstring) else: print "-", run_ref.log("".join(["\t", "0"])) print "" return ref_hits, control_scores
#!/usr/bin/env python
# Re-key the records of a FASTA file by the protein ID found in each header.
# Usage: <script> <input fasta> <output fasta>
from sys import argv
from parsers import parse_fasta, LongFastaID
from writers import write_fasta

if __name__ == "__main__":
    fasta = argv[1]
    out_file = argv[2]
    fasta_dict = parse_fasta(fasta)
    prot_id_to_seq = {}
    for fasta_id, sequence in fasta_dict.items():
        prot_id = LongFastaID(fasta_id).protein_id
        # Explicit check instead of assert: asserts are removed under
        # `python -O`, which would let a duplicate silently overwrite an
        # earlier sequence.
        if prot_id in prot_id_to_seq:
            raise ValueError(
                "duplicate protein ID %r in %s" % (prot_id, fasta))
        prot_id_to_seq[prot_id] = sequence
    write_fasta(prot_id_to_seq, out_file)
from parsers import parse_fasta, LongFastaID
from writers import write_fasta
from sys import argv

# genome accession whose records are treated as the reference set
REFACC = "NC_045512.2"


def separate_ref_from_nonref(fasta_dir):
    """Split parsed FASTA records into reference and non-reference dicts.

    fasta_dir is handed to parse_fasta as is. Each record whose header
    carries the genome accession REFACC lands in the first returned dict,
    every other record in the second; both are keyed by protein ID.
    """
    reference, others = {}, {}
    for raw_id, seq in parse_fasta(fasta_dir).items():
        parsed_id = LongFastaID(raw_id)
        bucket = reference if parsed_id.genome_acc == REFACC else others
        bucket[parsed_id.protein_id] = seq
    return reference, others


if __name__ == "__main__":
    # usage: <script> <fasta>
    fasta_dir = argv[1]
    ref_fasta_dict, nonref_fasta_dict = separate_ref_from_nonref(fasta_dir)
    write_fasta(ref_fasta_dict, 'reference_proteome.fasta')
    write_fasta(nonref_fasta_dict, 'non-reference_covid19_proteins.fasta')
if __name__ == "__main__":
    # usage: <script> <blast output> <covid19 fasta> <orf mapping>
    blast_output, covid19_fasta, orf_mapping = argv[1], argv[2], argv[3]

    # load the three inputs
    covid19_fasta_dict = parse_fasta(covid19_fasta)
    orf_mapping = parse_mapping(orf_mapping)
    queryid_to_hits = parse_blast_output(blast_output)

    # order the hits, then combine them separately for every query
    sorted_hit_dict = order_hits(queryid_to_hits)
    combined_hit_dicts = {
        query: combine_hits(subject_to_hits)
        for query, subject_to_hits in sorted_hit_dict.items()
    }

    filtered_hits, identical_hits, rejected_hits = filter_hits(
        combined_hit_dicts, covid19_fasta_dict)

    subject_to_queries = map_subject_to_queries(filtered_hits)
    fasta_dicts = make_fasta_dicts(subject_to_queries, covid19_fasta_dict,
                                   orf_mapping)

    # one output FASTA file per protein name
    for protein_name, fasta_dict in fasta_dicts.items():
        file_name = protein_name + '.fasta'
        seq_to_id = make_seq_to_id_dict(fasta_dict, identical_hits,
                                        covid19_fasta_dict)
        write_fasta(make_new_fasta_dict(seq_to_id), file_name)
from writers import write_fasta
from sys import argv


def get_refseqs(refseq_to_uniprot):
    """Return the set of entries of refseq_to_uniprot cut at the first '.'.

    Every item obtained by iterating refseq_to_uniprot is split on '.'
    and only the leading part is kept.
    """
    return {entry.split('.')[0] for entry in refseq_to_uniprot}


if __name__ == "__main__":
    # usage: <script> <fasta> <refseq mapping>
    fasta = argv[1]
    refseqs = argv[2]
    refseq_to_uniprot = parse_mapping(refseqs)
    refseqs = get_refseqs(refseq_to_uniprot)
    fasta_dict = parse_fasta(fasta)

    # keep only the records whose leading header field is a known refseq
    refseq_to_seq = {}
    for header, sequence in fasta_dict.items():
        accession = header.split('|')[0]
        print(accession)
        accession = accession.strip()
        if accession in refseqs:
            refseq_to_seq[accession] = sequence

    write_fasta(refseq_to_seq, 'reference_proteome_complete.fasta')
sequence_to_id = {} for fasta_id, sequence in fasta_dict.items(): fasta_id = parse_fasta_id(fasta_id) if not sequence in sequence_to_id: sequence_to_id[sequence] = [] sequence_to_id[sequence].append(fasta_id) return sequence_to_id def assign_code(sequence_to_id): code_to_sequence = {} code_to_accession = {} for i, (sequence, accessions) in enumerate(sequence_to_id.items()): code = 'seq_%.4d' % i code_to_sequence[code] = sequence code_to_accession[code] = accessions return code_to_sequence, code_to_accession if __name__ == "__main__": fasta = argv[1] id_to_sequence = parse_fasta(fasta) sequence_to_id = reverse_fasta_dict(id_to_sequence) code_to_sequence, code_to_accession = assign_code(sequence_to_id) write_fasta(code_to_sequence, UNIQUE_SEQ_DIR) write_code_to_accession(code_to_accession, CODE_DIR)