def check_index(self, filename, format, **kwargs):
    # check if the Python 3 installation has sqlite3
    try:
        import sqlite3
    except ImportError:
        sqlite3 = None
    parsed = list(SearchIO.parse(filename, format, **kwargs))
    # compare values by index
    indexed = SearchIO.index(filename, format, **kwargs)
    self.assertEqual(len(parsed), len(indexed))
    # compare values by index_db, only if sqlite3 is present
    if sqlite3 is not None:
        db_indexed = SearchIO.index_db(':memory:', [filename], format, **kwargs)
        self.assertEqual(len(parsed), len(db_indexed))
    for qres in parsed:
        idx_qres = indexed[qres.id]
        # parsed and indexed qresults are different objects!
        self.assertNotEqual(id(qres), id(idx_qres))
        # but they should have the same attribute values
        self.assertTrue(compare_search_obj(qres, idx_qres))
        # sqlite3 comparison, only if it's present
        if sqlite3 is not None:
            dbidx_qres = db_indexed[qres.id]
            self.assertNotEqual(id(qres), id(dbidx_qres))
            self.assertTrue(compare_search_obj(qres, dbidx_qres))
    indexed._proxy._handle.close()  # TODO - Better solution
    if sqlite3 is not None:
        db_indexed.close()
        db_indexed._con.close()
def check_raw(self, filename, id, raw, **kwargs):
    """Index filename using keyword arguments, check get_raw(id)==raw."""
    idx = SearchIO.index(filename, self.fmt, **kwargs)
    raw = _as_bytes(raw)
    # Anticipate cases where the raw string and/or file uses different
    # newline characters ~ we set everything to \n.
    new = idx.get_raw(id)
    self.assertTrue(isinstance(new, bytes),
                    "Didn't get bytes from %s get_raw" % self.fmt)
    self.assertEqual(raw.replace(b'\r\n', b'\n'), new.replace(b'\r\n', b'\n'))
    idx.close()
    # Now again, but using the SQLite backend
    if sqlite3:
        idx = SearchIO.index_db(":memory:", filename, self.fmt, **kwargs)
        new = idx.get_raw(id)
        self.assertTrue(isinstance(new, bytes),
                        "Didn't get bytes from %s get_raw" % self.fmt)
        self.assertEqual(raw.replace(b'\r\n', b'\n'), new.replace(b'\r\n', b'\n'))
        idx.close()
    if os.path.isfile(filename + ".bgz"):
        # Do the tests again with the BGZF compressed file
        print("[BONUS %s.bgz]" % filename)
        self.check_raw(filename + ".bgz", id, raw, **kwargs)
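# Minimal sketch of the get_raw() behaviour tested above, assuming a
# hypothetical tabular BLAST file "example.tab" containing a query "Query_1".
# get_raw() returns one record's bytes exactly as stored on disk, unparsed.
from Bio import SearchIO

idx = SearchIO.index("example.tab", "blast-tab")
raw_bytes = idx.get_raw("Query_1")
assert isinstance(raw_bytes, bytes)
print(raw_bytes.decode().splitlines()[0])
idx.close()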
def parse_search_file(input_file, mode, format="hmmer3-text", index_file=None):
    # index_db is the only backend that accepts a list of input files, so fall
    # back to it whenever more than one file is given, regardless of mode.
    if mode == "index_db" or ((not isinstance(input_file, str)) and (len(input_file) > 1)):
        index = index_file if index_file else "tmp.idx"
        seq_dict = SearchIO.index_db(index,
                                     [input_file] if isinstance(input_file, str) else input_file,
                                     format=format)
    elif mode == "index":
        seq_dict = SearchIO.index(input_file if isinstance(input_file, str) else input_file[0],
                                  format=format)
    elif mode == "parse":
        seq_dict = OrderedDict()
        for record in SearchIO.parse(input_file if isinstance(input_file, str) else input_file[0],
                                     format=format):
            seq_dict[record.id] = record
        # seq_dict = SeqIO.to_dict(SeqIO.parse(input_file if isinstance(input_file, str)
        #                                      else input_file[0], format=format))
    return seq_dict
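# Hedged usage sketch for parse_search_file above; the HMMER output file names
# and the index file name are placeholders. All three modes return a dict-like
# object keyed by query ID.
results = parse_search_file("scan1.txt", mode="parse")     # eager, all in memory
results = parse_search_file("scan1.txt", mode="index")     # lazy, parsed on access
results = parse_search_file(["scan1.txt", "scan2.txt"],    # lazy, SQLite-backed
                            mode="index_db", index_file="scan.idx")
print(list(results)[:5])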
def check_index(self, filename, format, **kwargs):
    # check if the Python 3 installation has sqlite3
    try:
        import sqlite3
    except ImportError:
        sqlite3 = None
    if filename.endswith(".bgz"):
        handle = gzip.open(filename)
        parsed = list(SearchIO.parse(handle, format, **kwargs))
        handle.close()
    else:
        parsed = list(SearchIO.parse(filename, format, **kwargs))
    # compare values by index
    indexed = SearchIO.index(filename, format, **kwargs)
    self.assertEqual(len(parsed), len(indexed),
                     "Should be %i records in %s, index says %i"
                     % (len(parsed), filename, len(indexed)))
    # compare values by index_db, only if sqlite3 is present
    if sqlite3 is not None:
        db_indexed = SearchIO.index_db(':memory:', [filename], format, **kwargs)
        self.assertEqual(len(parsed), len(db_indexed),
                         "Should be %i records in %s, index_db says %i"
                         % (len(parsed), filename, len(db_indexed)))
    for qres in parsed:
        idx_qres = indexed[qres.id]
        # parsed and indexed qresults are different objects!
        self.assertNotEqual(id(qres), id(idx_qres))
        # but they should have the same attribute values
        self.assertTrue(compare_search_obj(qres, idx_qres))
        # sqlite3 comparison, only if it's present
        if sqlite3 is not None:
            dbidx_qres = db_indexed[qres.id]
            self.assertNotEqual(id(qres), id(dbidx_qres))
            self.assertTrue(compare_search_obj(qres, dbidx_qres))
    indexed.close()
    if sqlite3 is not None:
        db_indexed.close()
        db_indexed._con.close()
    if os.path.isfile(filename + ".bgz"):
        # Do the tests again with the BGZF compressed file
        print("[BONUS %s.bgz]" % filename)
        self.check_index(filename + ".bgz", format, **kwargs)
def check_raw(self, filename, id, raw, **kwargs):
    """Index filename using **kwargs, check get_raw(id)==raw."""
    idx = SearchIO.index(filename, self.fmt, **kwargs)
    raw = _as_bytes(raw)
    self.assertEqual(raw, idx.get_raw(id))
    idx.close()
    # Now again, but using the SQLite backend
    if sqlite3:
        idx = SearchIO.index_db(":memory:", filename, self.fmt, **kwargs)
        self.assertEqual(raw, idx.get_raw(id))
        idx.close()
    if os.path.isfile(filename + ".bgz"):
        # Do the tests again with the BGZF compressed file
        print("[BONUS %s.bgz]" % filename)
        self.check_raw(filename + ".bgz", id, raw, **kwargs)
def check_index(self, filename, format, **kwargs):
    if filename.endswith(".bgz"):
        with gzip.open(filename) as handle:
            parsed = list(SearchIO.parse(handle, format, **kwargs))
    else:
        parsed = list(SearchIO.parse(filename, format, **kwargs))
    # compare values by index
    indexed = SearchIO.index(filename, format, **kwargs)
    self.assertEqual(
        len(parsed),
        len(indexed),
        "Should be %i records in %s, index says %i"
        % (len(parsed), filename, len(indexed)),
    )
    # compare values by index_db, only if sqlite3 is present
    if sqlite3 is not None:
        db_indexed = SearchIO.index_db(":memory:", [filename], format, **kwargs)
        self.assertEqual(
            len(parsed),
            len(db_indexed),
            "Should be %i records in %s, index_db says %i"
            % (len(parsed), filename, len(db_indexed)),
        )
    for qres in parsed:
        idx_qres = indexed[qres.id]
        # parsed and indexed qresults are different objects!
        self.assertNotEqual(id(qres), id(idx_qres))
        # but they should have the same attribute values
        self.compare_search_obj(qres, idx_qres)
        # sqlite3 comparison, only if it's present
        if sqlite3 is not None:
            dbidx_qres = db_indexed[qres.id]
            self.assertNotEqual(id(qres), id(dbidx_qres))
            self.compare_search_obj(qres, dbidx_qres)
    indexed.close()
    if sqlite3 is not None:
        db_indexed.close()
        db_indexed._con.close()
    if os.path.isfile(filename + ".bgz"):
        # Do the tests again with the BGZF compressed file
        print(f"[BONUS {filename}.bgz]")
        self.check_index(filename + ".bgz", format, **kwargs)
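# A minimal sketch of the invariant these check_index variants assert, assuming
# a hypothetical BLAST XML file "example.xml" with a query "Query_1": parse(),
# index() and index_db() should expose the same records under the same IDs.
from Bio import SearchIO

parsed = {q.id: q for q in SearchIO.parse("example.xml", "blast-xml")}
indexed = SearchIO.index("example.xml", "blast-xml")
assert sorted(parsed) == sorted(indexed)
assert parsed["Query_1"].id == indexed["Query_1"].id  # same values, distinct objects
indexed.close()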
def do_blast(query_fasta, db_fasta, blast_file):
    if not (os.path.isfile(db_fasta + ".nhr") and
            os.path.isfile(db_fasta + ".nin") and
            os.path.isfile(db_fasta + ".nsq")):
        stop_err("Missing BLAST database for %s" % db_fasta)
    cmd = NcbiblastnCommandline(query=query_fasta, db=db_fasta,
                                out=blast_file, outfmt=6, evalue=1e-5)
    print(cmd)
    stdout, stderr = cmd()
    return


if not os.path.isfile(blast_file):
    do_blast(assembly_fasta, reference_fasta, blast_file)

contigs = SeqIO.index(assembly_fasta, "fasta")
blast_results = SearchIO.index(blast_file, "blast-tab")

reference_parser = SeqIO.parse(reference_fasta, "fasta")
fasta_handle = open(output_fasta, "w")
fasta_saved_count = 0
fasta_short_dropped = 0

offset = 0
ref_offsets = dict()
for record in reference_parser:
    ref_offsets[hack_ncbi_fasta_name(record.id)] = offset
    offset += len(record)


def reverse_complement_hsp_fragment(frag, query_length):
    rev = SearchIO.HSPFragment(hit_id=frag.hit_id, query_id=frag.query_id)
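# Minimal sketch of the BLAST-then-index pattern used here, assuming an
# existing nucleotide database "ref_db" and query file "contigs.fasta" (both
# placeholders), with blastn available in $PATH. NcbiblastnCommandline builds
# the command line; calling the object runs blastn.
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SearchIO

cline = NcbiblastnCommandline(query="contigs.fasta", db="ref_db",
                              out="contigs_vs_ref.tab", outfmt=6, evalue=1e-5)
stdout, stderr = cline()
hits = SearchIO.index("contigs_vs_ref.tab", "blast-tab")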
    # (the enclosing `if args.mode == ...:` branch header is truncated in the original)
    def iterator(blast_dict):
        for entry in blast_dict:
            entry_hits = []
            for hit in blast_dict[entry].hits:
                if hit.id not in black_list:  # filter hits
                    entry_hits.append(hit)
            if entry_hits:
                yield QueryResult(hits=entry_hits, id=entry)
elif args.mode == "both":
    def iterator(blast_dict):
        for entry in blast_dict:
            if entry not in black_list:  # filter queries
                entry_hits = []
                for hit in blast_dict[entry].hits:
                    if hit.id not in black_list:  # filter hits
                        entry_hits.append(hit)
                if entry_hits:
                    yield QueryResult(hits=entry_hits, id=entry)

blast_results = SearchIO.index(args.input, args.format)
SearchIO.write(iterator(blast_results), args.output, args.format)
if args.output != "output":
    out_fd.close()
if args.input != "stdin":
    in_fd.close()
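# A self-contained sketch of the same filter-and-rewrite pattern, assuming a
# hypothetical tabular BLAST file "hits.tab" and blacklist IDs;
# QueryResult.hit_filter() is the SearchIO-native way to drop unwanted hits.
from Bio import SearchIO

black_list = {"contaminant_1", "contaminant_2"}  # assumed IDs

def filtered(results):
    for qresult in results:
        kept = qresult.hit_filter(lambda hit: hit.id not in black_list)
        if len(kept):
            yield kept

SearchIO.write(filtered(SearchIO.parse("hits.tab", "blast-tab")),
               "hits.filtered.tab", "blast-tab")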
def do_blast(query_fasta, db_fasta, blast_file):
    if not (os.path.isfile(db_fasta + ".nhr") and
            os.path.isfile(db_fasta + ".nin") and
            os.path.isfile(db_fasta + ".nsq")):
        sys_exit("Missing BLAST database for %s" % db_fasta)
    cmd = NcbiblastnCommandline(query=query_fasta, db=db_fasta,
                                out=blast_file, outfmt=6, evalue=1e-5)
    print(cmd)
    stdout, stderr = cmd()
    return


if not os.path.isfile(blast_file):
    do_blast(assembly_fasta, reference_fasta, blast_file)

contigs = SeqIO.index(assembly_fasta, "fasta")
blast_results = SearchIO.index(blast_file, "blast-tab")

max_len = 0
for record in SeqIO.parse(reference_fasta, "fasta"):
    max_len += SPACER + len(record)
max_len -= SPACER

if os.path.isfile(reference_genbank):
    reference_parser = SeqIO.parse(reference_genbank, "genbank")
else:
    reference_parser = SeqIO.parse(reference_fasta, "fasta")

if output_fasta:
    sys.stderr.write("WARNING - Consider using order_assembly.py instead for FASTA output\n")
    fasta_handle = open(output_fasta, "w")
filename = "substrates.txt"
with open(filename) as src:
    p = [line.split('\r') for line in src][0]

legdict = {}
for line in p:
    pp = line.split()
    lid = pp[0].lower()
    alias = pp[1]
    legdict[lid] = {"lid": lid, "alias": alias}

from Bio import SearchIO

result_handle = open("results/blastp_resultAD.txt")  # comments=True)
idx = SearchIO.index('results/blastp_resultAD.txt', 'blast-tab')
idss = idx.keys()
homs = {}
rec_list_all = []
for ids in idss:
    rec_list = []
    for rec in idx[ids].hsps:
        rec_list.append((rec.hit_id, rec.evalue, rec.ident_pct))
        rec_list_all.append((rec.query_id, rec.hit_id, rec.evalue, rec.ident_pct))
    homs[ids] = {ids: rec_list}

leg_uniq = []
ids = []
for p in sorted(rec_list_all, key=lambda x: x[2]):  # best (lowest) e-value first
    if p[2] < 10E-3:
        if p[1] not in ids:
            ids.append(p[1])
            leg_uniq.append(p)
def parallel_BLAST(list_of_genes, seqindex, split_by, out):
    """
    Run a BLASTp all-vs.-all search on a subset of genes from a database.

    Splits queries into separate files by a divisor split_by, BLASTs them
    simultaneously and then concatenates them using cat. Currently uses
    subprocess over Biopython's BLASTp wrapper for easy parallelization
    using mp.Pool and subprocess_BLAST. Returns a SearchIO BLAST tabular
    object. Requires cat, makeblastdb and blastp in your $PATH!

    Arguments:
        list_of_genes = List of (remaining) noncore protein IDs.
        seqindex = SeqIO.index of all proteins.
        split_by = Divisor to split files into.
        out = File prefix for results files given current cluster size
              being investigated.
    """
    ##### Generate FASTA database of (remaining) noncore proteins. #####
    outfast = open(out, "w")
    for seq in list_of_genes:
        outfast.write(">{0}\n{1}\n".format(seqindex[seq].id, seqindex[seq].seq))
    outfast.close()  # Close file here, makeblastdb has problems otherwise.
    subblastlog.write("Wrote {0} sequences to {1}...\n".format(len(list_of_genes), out))

    ##### Run makeblastdb on FASTA database. #####
    sp.call(["makeblastdb", "-in", out, "-dbtype", "prot",
             "-out", "{0}.db".format(out)], stdout=subblastlog)

    ##### Split FASTA database by split_by and generate list of BLASTp commands. #####
    count = 0
    query_cmds = []  # Handle for simultaneous BLASTp commands.
    to_split = SeqIO.parse("{0}".format(out), "fasta")
    for part in grouper(to_split, int(round(len(list_of_genes) / split_by))):
        count = count + 1
        seqs = filter(lambda x: x is not None, part)  # Remove fill values.
        if glob("{0}.part{1}.faa".format(out, count)):
            # Remove previous versions of file if present.
            os.remove("{0}.part{1}.faa".format(out, count))
        SeqIO.write(seqs, "{0}.part{1}.faa".format(out, count), "fasta")
        query_cmds.append(["blastp", "-query", "{0}.part{1}.faa".format(out, count),
                           "-db", "{0}.db".format(out), "-outfmt", "6 std qlen slen",
                           "-evalue", "0.0001",
                           "-out", "{0}.part{1}.subblast".format(out, count),
                           "-num_threads", "1"])
    subblastlog.write("Split original query file {0} ({1} sequences) into {2} files.\n".format(
        out, len(list_of_genes), str(split_by)))

    ##### Run BLASTp processes simultaneously using mp.Pool. #####
    farm = mp.Pool(processes=split_by)
    farm.map(subprocess_BLAST, query_cmds)
    farm.close()
    subblastlog.write("Finished BLAST+ searches for {0} ({1} sequences), split into {2} files.\n".format(
        out, len(list_of_genes), str(split_by)))

    ##### Concatenate parallel_BLAST results together in the shell and remove other files. #####
    sp.call(["cat"] + glob("*subblast"), stdout=open("{0}.results".format(out), "wb"))
    for bin_file in glob("%s.db*" % out):
        os.remove(bin_file)
    for part_file in glob("%s.part*" % out):
        os.remove(part_file)
    os.remove("%s" % out)
    subblastlog.write("Concatenated BLAST+ output for {0} ({1} sequences).\n".format(
        out, len(list_of_genes)))

    ##### Parse parallel_BLAST results and return them as a SearchIO.index instance to cluster_clean. #####
    blast = SearchIO.index("{0}.results".format(out), "blast-tab", fields=blast_fields)
    subblastlog.write("Loaded {0} as SearchIO.index for cluster_clean.\n".format(out))
    return blast
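# Hedged usage sketch for parallel_BLAST above; it assumes grouper(),
# subprocess_BLAST(), blast_fields and the subblastlog handle are defined
# elsewhere in this script, and the FASTA file name is a placeholder.
db = SeqIO.index("all_proteins.fasta", "fasta")
noncore_ids = list(db)[:100]  # some subset of protein IDs
hits = parallel_BLAST(noncore_ids, db, split_by=4, out="ClusterBLAST_demo.fasta")
query_id = next(iter(hits))
print(query_id, len(hits[query_id].hits))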
def cluster_clean(panoct_clusters, fasta_handle, split_by=4, min_id_cutoff=30,
                  strain_cutoff=1.0, iterations=1):
    """
    Tidy up non-core clusters found by PanOCT.

    Feeds into parallel_BLAST, which expects you to have BLAST+ installed.
    Also feeds into gap_finder, which doesn't require anything else.
    """
    ##### Load in FASTA database and PanOCT results. #####
    db = SeqIO.index(fasta_handle, "fasta")
    full_blast = SearchIO.index("blast_results.txt", "blast-tab")
    matchtable = reader(open(panoct_clusters), delimiter="\t")
    ##### Initialize empty dictionaries for cluster types. #####
    core = {}
    noncore = {}
    softcore = {}
    ##### Initialize variables for total/starting number of genomes. #####
    total = 0
    start = 0
    ##### Populate core & noncore dictionaries by reading PanOCT results. #####
    for row in matchtable:
        if "----------" in row:
            noncore[row[0]] = row[1:]  # Populating our initial noncore dict.
        else:
            core[row[0]] = row[1:]  # Populating our core dict.
        if total == 0:
            total = len(row) - 1  # Total number of genomes.
            print(total)
            start = total - 1  # Max noncore cluster size.
    mainlogfile.write("{0} core clusters and {1} noncore clusters identified...\n".format(
        len(core), len(noncore)))
    ##### Run parallel_BLAST and gap finding for n iterations. #####
    for iteration in range(0, iterations, 1):
        mainlogfile.write("Running iteration {0}...\n".format(iteration + 1))
        ##### Loop through noncore clusters from size (total - 1) to 2. #####
        for size in range(start, 0, -1):
            filled_count = 0
            merged_count = 0
            ##### Get list of (remaining) noncore protein IDs. #####
            to_blast = [x for x in flatten([noncore[key] for key in noncore])
                        if x != "----------"]
            mainlogfile.write("All-vs.-all BLAST of {0} proteins...\n".format(len(to_blast)))
            ##### Run parallel_BLAST. #####
            results = parallel_BLAST(to_blast, db, split_by,
                                     "ClusterBLAST_{0}.fasta".format(str(size)))
            outfast = open("ClusterBLAST_{0}.fasta".format(str(size)), "w")
            for seq in to_blast:
                outfast.write(">{0}\n{1}\n".format(db[seq].id, db[seq].seq))
            # results = SearchIO.index("ClusterBLAST_{0}.fasta.results".format(str(size)),
            #                          "blast-tab", fields=blast_fields)
            mainlogfile.write("Finding potential homology gaps in clusters of size {0}...\n".format(
                str(size)))
            ##### Run gap_finder. #####
            gaps = gap_finder(results, db, noncore, total, size, min_id_cutoff, strain_cutoff)
            ##### Identify clusters that need to be merged and move merged clusters to appropriate dictionary. #####
            for cluster in gaps:
                if cluster in noncore:
                    cluster_strains = [i.split("|")[0] for i in noncore[cluster]
                                       if i != "----------"]
                    for candidate in gaps[cluster]:
                        if candidate in noncore:
                            candidate_strains = [i.split("|")[0] for i in noncore[candidate]
                                                 if i != "----------"]
                            if len(set(candidate_strains) & set(cluster_strains)) == 0:
                                cluster_size = len([x for x in noncore[cluster]
                                                    if x != "----------"])
                                candidate_size = len([x for x in noncore[candidate]
                                                      if x != "----------"])
                                merge_size = cluster_size + candidate_size
                                if merge_size == total:
                                    mainlogfile.write(
                                        "{0} (size: {1}) has a homologous cluster: {2} (size: {3})\n".format(
                                            cluster, cluster_size, candidate, candidate_size))
                                    mainlogfile.write(
                                        "Merging smaller cluster {0} into larger cluster {1}...\n".format(
                                            candidate, cluster))
                                    mainlogfile.write(
                                        "Merged cluster {0} has size {1}.\n".format(cluster, merge_size))
                                    filled = merge_clusters(noncore[cluster], noncore[candidate])
                                    softcore[cluster] = filled
                                    del noncore[cluster], noncore[candidate]
                                    filled_count = filled_count + 2
                                elif merge_size < total:
                                    mainlogfile.write(
                                        "{0} (size: {1}) has a homologous cluster: {2} (size: {3})\n".format(
                                            cluster, cluster_size, candidate, candidate_size))
                                    mainlogfile.write(
                                        "Merging smaller cluster {0} into larger cluster {1}...\n".format(
                                            candidate, cluster))
                                    mainlogfile.write(
                                        "Merged cluster {0} has size {1}.\n".format(cluster, merge_size))
                                    merged = merge_clusters(noncore[cluster], noncore[candidate])
                                    noncore[cluster] = merged
                                    del noncore[candidate]
                                    merged_count = merged_count + 2
            mainlogfile.write(
                "At cluster size (n = {0}): merged {1} homologous clusters into {2} softcore clusters.\n".format(
                    size, filled_count, filled_count // 2))
            mainlogfile.write(
                "At cluster size (n = {0}): merged {1} homologous clusters into {2} noncore clusters.\n".format(
                    size, merged_count, merged_count // 2))
    if not os.path.isdir("{0}/sub_BLASTs".format(os.getcwd())):
        os.makedirs("{0}/sub_BLASTs/faa".format(os.getcwd()))
        os.makedirs("{0}/sub_BLASTs/results".format(os.getcwd()))
    for sub_faa in glob("ClusterBLAST_*.fasta"):
        os.rename(sub_faa, "{0}/sub_BLASTs/faa/{1}".format(os.getcwd(), sub_faa))
    for sub_results in glob("*.results"):
        os.rename(sub_results, "{0}/sub_BLASTs/results/{1}".format(os.getcwd(), sub_results))
    with open("new_matchtable.txt", "w") as outmatch:
        for cluster in core:
            outmatch.write("{0}\t{1}\n".format(cluster, "\t".join(core[cluster])))
        for cluster in softcore:
            outmatch.write("{0}\t{1}\n".format(cluster, "\t".join(softcore[cluster])))
        for cluster in noncore:
            outmatch.write("{0}\t{1}\n".format(cluster, "\t".join(noncore[cluster])))
    with open("new_softtable.txt", "w") as outsofmatch:
        for cluster in softcore:
            outsofmatch.write("{0}\t{1}\n".format(cluster, "\t".join(softcore[cluster])))
    with open("new_nontable.txt", "w") as outnonmatch:
        for cluster in noncore:
            outnonmatch.write("{0}\t{1}\n".format(cluster, "\t".join(noncore[cluster])))
    with open("softcore_pam.txt", "w") as outsof:
        for cluster in softcore:
            pa = []
            for el in softcore[cluster]:
                if el == "----------":
                    pa.append("0")
                else:
                    pa.append("1")
            outsof.write("{0}\n".format("\t".join(pa)))
    with open("noncore_pam.txt", "w") as outnon:
        for cluster in noncore:
            pa = []
            for el in noncore[cluster]:
                if el == "----------":
                    pa.append("0")
                else:
                    pa.append("1")
            outnon.write("{0}\n".format("\t".join(pa)))
    sizes_arg = []
    counts_arg = []
    n_sizes = Counter([len([x for x in noncore[cluster] if x != "----------"])
                       for cluster in noncore])
    for n_size in n_sizes:
        if int(n_size) < 10:
            sizes_arg.append("n0" + str(n_size))
        else:
            sizes_arg.append("n" + str(n_size))
        counts_arg.append(str(n_sizes[n_size] * int(n_size)))
    core_count = (len(flatten(core.values()))
                  + len([x for x in flatten(softcore.values()) if x != "----------"]))
    sizes_arg.append("n" + str(total))
    counts_arg.append(str(core_count))
    core_proteome = len(flatten(core.values()))
    softcore_proteome = len([x for x in flatten(softcore.values()) if x != "----------"])
    noncore_proteome = len([x for x in flatten(noncore.values()) if x != "----------"])
    mainlogfile.write(
        "==== Core: {0} clusters, {1} proteins. "
        "Softcore: {2} clusters, {3} proteins. "
        "Accessory: {4} clusters, {5} proteins. ====\n".format(
            len(core), core_proteome, len(softcore), softcore_proteome,
            len(noncore), noncore_proteome))
    ring_plot = ["Rscript", "{0}/PlotRingChart.R".format(dirname),
                 str(core_proteome), str(softcore_proteome), str(noncore_proteome),
                 ",".join(sizes_arg), ",".join(counts_arg)]
    try:
        sp.check_call(ring_plot)
        mainlogfile.write("Creating ring chart in R...\n")
    except sp.CalledProcessError as r_exec:
        if r_exec.returncode != 0:
            mainlogfile.write("Unable to run R script PlotRingChart.R, attempted command below:\n")
            mainlogfile.write(" ".join(ring_plot) + "\n")
    upset_plot = ["Rscript", "PlotUsingUpSet.R", "softcore_pam.txt", "softcore_upset.eps"]
    try:
        sp.check_call(upset_plot)
        mainlogfile.write("Creating upset plot of softcore clusters in R...\n")
        upset_plot = ["Rscript", "PlotUsingUpSet.R", "noncore_pam.txt", "noncore_upset.eps"]
        try:
            sp.check_call(upset_plot)
        except sp.CalledProcessError as r_exec:
            if r_exec.returncode != 0:
                mainlogfile.write("Unable to run R script PlotUsingUpSet.R. Run command manually:\n")
                mainlogfile.write(" ".join(upset_plot) + "\n")
    except sp.CalledProcessError as r_exec:
        if r_exec.returncode != 0:
            mainlogfile.write("Unable to run R script PlotUsingUpSet.R. Run command manually:\n")
            mainlogfile.write(" ".join(upset_plot) + "\n")
    mainlogfile.write("Remaining noncore clusters after prediction analysis: {0}\n".format(
        len(noncore)))
    mainlogfile.write(
        "prediction analysis finished in {0} seconds. Thank you for choosing prediction, "
        "the friendly pangenome software.\n".format(time.time() - start_time))
    mainlogfile.write("=== Finished prediction job at {0}. ===\n".format(
        str(datetime.datetime.now())))
from Bio import SearchIO

result_handle = open("results/blastp_resultAD.txt")  # comments=True)
idx = SearchIO.index('results/blastp_resultAD.txt', 'blast-tab')
idss = idx.keys()
homs = {}
rec_list_all = []
for ids in idss:
    rec_list = []
    for rec in idx[ids].hsps:
        rec_list.append((rec.hit_id, rec.evalue, rec.ident_pct))
        rec_list_all.append((rec.query_id, rec.hit_id, rec.evalue, rec.ident_pct))
    homs[ids] = {ids: rec_list}

leg_uniq = []
ids = []
for p in sorted(rec_list_all, key=lambda x: x[2]):  # best (lowest) e-value first
    if p[2] < 10E-3 and p[1] not in p[0]:  # skip self-hits
        if p[1] not in ids:
            ids.append(p[1])
            leg_uniq.append(p)

output = open('results/leg_blastp_hits.txt', 'w')
output.write('id' + '\t' + 'blastp_hit' + '\t' + 'e-value' + '\t' + 'ident_pct' + '\n')
for p in leg_uniq:
    qid = '%s' % p[0]
    hit = '%s' % p[1]
    ev = '%s' % p[2]
    per = '%s' % p[3]
    output.write(qid + "\t" + hit + "\t" + ev + '\t' + per + "\n")
output.close()
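# Quick sketch of the blast-tab HSP attributes used above, assuming the same
# results file exists; each HSP exposes the standard 12 tabular columns as
# attributes such as query_id, hit_id, evalue, ident_pct and bitscore.
qresult = next(SearchIO.parse("results/blastp_resultAD.txt", "blast-tab"))
hsp = qresult.hsps[0]
print(hsp.query_id, hsp.hit_id, hsp.evalue, hsp.ident_pct, hsp.bitscore)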
def FillGaps(blast, matchtable, seqs, tags):
    """
    Try to fill in gaps in syntenic clusters that might have arisen via
    genomic events and/or assembly artefacts.
    """
    # Load core and accessory cluster sets, BLAST+ data and sequence data.
    core, acc = ParseMatchtable(matchtable)
    new_clusters = {}
    if acc:
        searches = SearchIO.index(blast, "blast-tab")
        tags = [line.strip("\n") for line in open(tags)]
        # Loop over every accessory cluster.
        og_acc = list(acc)
        ignore = []
        for q_cluster_id in og_acc:
            print("{0} out of {1} clusters searched".format(
                og_acc.index(q_cluster_id), len(og_acc)))
            if q_cluster_id not in ignore:
                current_acc = [key for key in acc if key not in ignore]
                if q_cluster_id in current_acc:
                    q_cluster = acc[q_cluster_id]
                    q_pos = [pos for pos, gene in enumerate(q_cluster) if gene]
                    q_present = set([tags[pos] for pos in q_pos])
                    q_members = set(sorted(filter(lambda x: x is not None, q_cluster)))
                    q_missing = set(filter(lambda tag: tag not in q_present, tags))
                    q_blasts = QueryClusterFirstHits(q_cluster, searches, 30, q_missing)
                    q_first_hits = set(filter(lambda x: x is not None,
                                              Flatten(q_blasts.values())))
                    q_query = MultipleInsert(list(q_first_hits), tags)
                    if q_query in acc.values():
                        s_cluster_id = list(acc)[list(acc.values()).index(q_query)]
                        if s_cluster_id not in ignore:
                            s_cluster = acc[s_cluster_id]
                            s_members = set(sorted(filter(lambda x: x is not None, s_cluster)))
                            if s_members == q_first_hits:
                                s_present = set([gene.split("|")[0] for gene in s_members])
                                s_missing = set(filter(lambda tag: tag not in s_present, tags))
                                s_blasts = QueryClusterFirstHits(s_cluster, searches, 30, s_missing)
                                s_first_hits = set(filter(lambda x: x is not None,
                                                          Flatten(s_blasts.values())))
                                reciprocal = Reciprocal(q_members, q_first_hits,
                                                        s_members, s_first_hits)
                                if reciprocal:
                                    new_cluster = ClusterMerge(q_cluster, s_cluster)
                                    new_clusters[q_cluster_id] = new_cluster
                                    acc.pop(q_cluster_id, "None")
                                    acc.pop(s_cluster_id, "None")
                                    print("clusters merged: {0} {1}\n".format(
                                        str(q_cluster_id), str(s_cluster_id)))
                                    print("size of clusters merged: {0} {1}\n".format(
                                        len(q_members), len(s_members)))
                                    ignore = ignore + [q_cluster_id, s_cluster_id]
    # Write new matchtable to file.
    with open("refined_matchtable.txt", "w") as out:
        if acc:
            dataset = [core, acc, new_clusters]
        else:
            dataset = [core]
        for comp in dataset:
            for cluster in comp:
                line = ["----------" if not a else str(a) for a in comp[cluster]]
                out.write("\t".join(line) + "\n")
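# Hedged usage sketch for FillGaps above; ParseMatchtable, QueryClusterFirstHits,
# MultipleInsert, Reciprocal, ClusterMerge and Flatten are assumed to be defined
# elsewhere in this module, and the file names are placeholders.
FillGaps("blast_results.txt", "matchtable.txt", "all_proteins.fasta", "genome_tags.txt")
# Writes refined_matchtable.txt containing core, remaining accessory and merged clusters.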