import argparse
import csv

import screed
import sourmash
from sourmash.lca import lca_utils
from sourmash.lca.command_summarize import summarize


def main():
    p = argparse.ArgumentParser()
    p.add_argument('lca_db')
    p.add_argument('genome', nargs='+')
    p.add_argument('output')
    p.add_argument('--fragment', default=100000, type=int)
    args = p.parse_args()

    db, ksize, scaled = lca_utils.load_single_database(args.lca_db)
    mh_factory = sourmash.MinHash(n=0, ksize=ksize, scaled=scaled)
    print('**', ksize, scaled)

    n = 0
    m = 0
    sum_bp = 0
    sum_missed_bp = 0

    outfp = open(args.output, 'wt')
    w = csv.writer(outfp)
    w.writerow(['filename', 'contig', 'begin', 'end', 'lca', 'lca_rank'])

    #
    # iterate over all contigs in genome file
    #
    for genome in args.genome:
        for record in screed.open(genome):
            # fragment longer contigs into smaller regions?
            for start in range(0, len(record.sequence), args.fragment):
                seq = record.sequence[start:start + args.fragment]
                n += 1
                sum_bp += len(seq)

                # build a fresh sketch for this fragment
                mh = mh_factory.copy_and_clear()
                mh.add_sequence(seq, force=True)

                # fragment yielded no hashes? track the missed bp & move on.
                if not mh:
                    sum_missed_bp += len(seq)
                    continue

                # get LCA lineage counts for this fragment's hashes
                lineage_counts = summarize(mh.get_mins(), [db], 1)
                for k in lineage_counts:
                    lca = lca_utils.display_lineage(k, truncate_empty=False)
                    try:
                        lca_rank = k[-1].rank
                    except IndexError:
                        lca_rank = "none"

                    # use len(seq), not args.fragment, so the last (short)
                    # fragment of a contig gets the right end coordinate.
                    w.writerow((genome, record.name, start, start + len(seq),
                                lca, lca_rank))
                    m += 1

    # report the totals tracked above
    print('{} fragments, {} rows written; {} bp total, {} bp missed'.format(
        n, m, sum_bp, sum_missed_bp))

    return 0
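# A minimal standalone illustration (not part of the script above) of the
# fragment-and-sketch pattern: each fragment gets its own scaled MinHash, so
# short or low-complexity fragments can legitimately yield zero hashes, which
# is what the 'if not mh' branch above catches. The sequence and parameters
# here are invented for demonstration.
import sourmash

mh_factory = sourmash.MinHash(n=0, ksize=21, scaled=10)
contig = 'GATTACA' * 300   # toy ~2.1 kb "contig"
fragment_size = 500

for start in range(0, len(contig), fragment_size):
    frag = contig[start:start + fragment_size]
    mh = mh_factory.copy_and_clear()    # new empty sketch, same parameters
    mh.add_sequence(frag, force=True)   # force=True tolerates non-ACGT
    print(start, start + len(frag), len(mh))  # hashes retained per fragment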
def write_result(self, result, name, length, result_type="search"):
    # write a single result row; 'result' is a namedtuple from search/gather.
    d = dict(result._asdict())
    d["name"] = name
    d["length"] = length
    d["lineage"] = lca_utils.display_lineage(result.lineage)

    if self.search and result_type == "search":
        # save the matched signature separately; it doesn't belong in the CSV.
        self.search_sigs.append(d['match'])
        del d['match']
        self.search_w.writerow(d)
    elif self.search and result_type == "ranksearch":
        self.ranksearch_sigs.append(d['match'])
        del d['match']
        d["match_rank"] = result.lineage[-1].rank
        self.rank_w.writerow(d)
    elif self.gather and result_type == "rankgather":
        d["match_rank"] = result.lineage[-1].rank
        #d["major_bp"] = get_match_bp(float(gr.f_major))
        self.gather_rank_w.writerow(d)
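# Illustration of the namedtuple -> dict -> CSV pattern used by write_result().
# The field names below are hypothetical; the real result namedtuples and
# CSV writers are defined elsewhere in this codebase.
import csv
import sys
from collections import namedtuple

SearchResult = namedtuple('SearchResult', ['similarity', 'lineage', 'match'])

result = SearchResult(similarity=0.83,
                      lineage='d__Bacteria;p__Firmicutes',
                      match='<signature object>')
d = dict(result._asdict())
del d['match']          # matched signatures are saved separately, not as CSV
d['name'] = 'query1'

w = csv.DictWriter(sys.stdout, fieldnames=['name', 'similarity', 'lineage'])
w.writeheader()
w.writerow(d)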
def main(args):
    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.ksize,
                                                     args.scaled)

    # count all the shared k-mers across these databases
    counts, total_kmer_count = count_shared_kmers(dblist)

    # write out
    with open(args.csv, 'wt') as fp:
        hashes_by_lineage = csv.writer(fp)
        hashes_by_lineage.writerow(
            ['rank', 'lineage', 'num_shared_kmers', 'percent_shared_kmers'])

        for lineage, shared_kmer_count in counts.items():
            rank = lineage[-1].rank
            # note: this is a fraction of total_kmer_count, written under
            # the 'percent_shared_kmers' header.
            percent_shared_kmers = float(shared_kmer_count) / total_kmer_count
            hashes_by_lineage.writerow([
                rank,
                lca_utils.display_lineage(lineage),
                str(shared_kmer_count),
                str(round(percent_shared_kmers, 2))
            ])
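# count_shared_kmers() is defined elsewhere; below is one plausible and
# explicitly hypothetical sketch, consistent with how it is used above: for
# every hash value assigned to more than one lineage, compute the LCA of
# those lineages and tally it. Returns (counts keyed by LCA lineage, total
# number of hashvals seen). Attribute names follow the LCA database internals
# used elsewhere in this repo (hashval_to_idx, idx_to_lid, lid_to_lineage).
from collections import defaultdict
from sourmash.lca import lca_utils

def count_shared_kmers_sketch(dblist):
    assignments = defaultdict(set)
    total_kmer_count = 0
    for db in dblist:
        for hashval, idx_list in db.hashval_to_idx.items():
            total_kmer_count += 1
            for idx in idx_list:
                lid = db.idx_to_lid.get(idx)
                if lid is not None:
                    assignments[hashval].add(db.lid_to_lineage[lid])

    counts = defaultdict(int)
    for hashval, lineages in assignments.items():
        if len(lineages) < 2:     # not shared across lineages; skip
            continue
        tree = lca_utils.build_tree(lineages)
        lca, _ = lca_utils.find_lca(tree)
        if lca:                   # skip hashes whose LCA is the root
            counts[lca] += 1

    return counts, total_kmer_count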
def make_lca_counts(dblist, lowest_rank='phylum', min_num=0, min_hashes=5,
                    prefix='oddities'):
    """
    Collect counts of all the LCAs in the list of databases.
    """
    assert len(dblist) == 1

    keep_ranks = ['root']
    for rank in lca_utils.taxlist():
        keep_ranks.append(rank)
        if rank == lowest_rank:
            break
    print('keeping hashvals at following ranks:', keep_ranks)
    print('min number of lineages:', min_num)
    print('min number of shared hashes:', min_hashes)
    print('---')

    # gather all hashvalue assignments from across all the databases
    assignments = defaultdict(set)
    for lca_db in dblist:
        for hashval, idx_list in lca_db.hashval_to_idx.items():
            if min_num and len(idx_list) < min_num:
                continue

            for idx in idx_list:
                lid = lca_db.idx_to_lid.get(idx)
                if lid is not None:
                    lineage = lca_db.lid_to_lineage[lid]
                    assignments[hashval].add(lineage)

    # now convert to trees -> do LCA & counts
    counts = defaultdict(int)
    mixdict = defaultdict(set)
    for hashval, lineages in assignments.items():
        # for each list of tuple_info [(rank, name), ...] build
        # a tree that lets us discover lowest-common-ancestor.
        debug("{}", lineages)
        tree = lca_utils.build_tree(lineages)

        # now find either a leaf or the first node with multiple
        # children; that's our lowest-common-ancestor node.
        lca, reason = lca_utils.find_lca(tree)

        # find cross-superkingdom hashes, and record combinations of lineages
        # that have them.
        rank = 'root'
        if lca:
            rank = lca[-1].rank

        if rank in keep_ranks:
            xx = []
            for lineage in lineages:
                xx.append(tuple(lineage))
            xx = tuple(xx)

            mixdict[xx].add(hashval)

        counts[lca] += 1

    # sort on number of confused hash vals by combination of lineages.
    mixdict_items = list(mixdict.items())
    mixdict_items.sort(key=lambda x: -len(x[1]))

    confused_hashvals = set()

    fp = open(prefix + '.csv', 'wt')
    w = csv.writer(fp)
    w.writerow(['cluster', 'num_lineages', 'shared_kmers', 'ksize', 'rank',
                'lca', 'ident1', 'lineage1', 'ident2', 'lineage2'])

    #
    # find candidate lineages, then evaluate pairwise intersections.
    #
    for cluster_n, (lineages, hashvals) in enumerate(mixdict_items):
        # insist on more than N hash vals
        if len(hashvals) < min_hashes:
            continue

        # display summary:
        print('cluster {} has {} assignments for {} hashvals / {} bp'.format(
            cluster_n, len(lineages), len(hashvals),
            dblist[0].scaled * len(hashvals)))
        confused_hashvals.update(hashvals)

        tree = lca_utils.build_tree(lineages)
        lca, reason = lca_utils.find_lca(tree)
        if lca:
            rank = lca[-1].rank
        else:
            rank = 'root'
        print(' rank & lca:', rank, lca_utils.display_lineage(lca))

        # for lineage_n, lineage in enumerate(lineages):
        #     print('* ', lca_utils.display_lineage(lineage))

        # now, identify all members of these lineages by their index:
        all_idxs = []
        for lineage_n, lineage in enumerate(lineages):
            lids = dblist[0].lineage_to_lids[lineage]
            for lid in lids:
                idxs = dblist[0].lid_to_idx[lid]
                all_idxs.extend(idxs)

        # run through and look at all pairs of genomes in these lineages;
        # filter so that we're comparing across lineages with the right
        # LCA, and with significant intersection.
        pair_n = 0
        candidates = []
        for i in range(len(all_idxs)):
            idx1 = all_idxs[i]
            lid1 = dblist[0].idx_to_lid[idx1]
            lin1 = dblist[0].lid_to_lineage[lid1]
            for j in range(i):
                idx2 = all_idxs[j]
                lid2 = dblist[0].idx_to_lid[idx2]
                lin2 = dblist[0].lid_to_lineage[lid2]

                ident1 = dblist[0].idx_to_ident[idx1]
                ident2 = dblist[0].idx_to_ident[idx2]

                debug("{} x {}", ident1, ident2)

                this_tree = lca_utils.build_tree([lin1, lin2])
                this_lca, this_reason = lca_utils.find_lca(this_tree)

                # weed out pairs that don't have the desired LCA
                if lca != this_lca:
                    continue

                mh1 = dblist[0]._signatures[idx1]
                mh2 = dblist[0]._signatures[idx2]

                mins1 = set(mh1.get_mins())
                mins2 = set(mh2.get_mins())
                intersect_size = len(mins1.intersection(mins2))

                # weed out pairs that don't have enough k-mer intersection
                if intersect_size < min_hashes:
                    continue

                candidates.append((pair_n, ident1, lin1, ident2, lin2,
                                   intersect_size))

                # write summary to CSV for find-oddities-examine.py to use
                w.writerow(['cluster{}.{}'.format(cluster_n, pair_n),
                            len(lineages),
                            intersect_size * dblist[0].scaled,
                            dblist[0].ksize,
                            rank,
                            lca_utils.display_lineage(lca),
                            ident1,
                            lca_utils.display_lineage(lin1),
                            ident2,
                            lca_utils.display_lineage(lin2)])

                pair_n += 1

        print(' Candidate genome pairs for these lineages:')
        for pair_n, ident1, lin1, ident2, lin2, intersection_size in candidates:
            print(' cluster.pair {}.{} share {} bases'.format(
                cluster_n, pair_n, intersection_size * dblist[0].scaled))
            print(' - {} ({})'.format(ident1, lca_utils.display_lineage(lin1)))
            print(' - {} ({})'.format(ident2, lca_utils.display_lineage(lin2)))
        print('')

    print('')

    return counts, confused_hashvals
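# Self-contained illustration of the build_tree()/find_lca() pattern used
# throughout make_lca_counts(), assuming sourmash's lca_utils API: two
# lineages that agree through phylum but split at class resolve to a
# phylum-level LCA. Taxon names are invented for the example.
from sourmash.lca import lca_utils
from sourmash.lca.lca_utils import LineagePair

lin1 = (LineagePair('superkingdom', 'd__Bacteria'),
        LineagePair('phylum', 'p__Firmicutes'),
        LineagePair('class', 'c__Bacilli'))
lin2 = (LineagePair('superkingdom', 'd__Bacteria'),
        LineagePair('phylum', 'p__Firmicutes'),
        LineagePair('class', 'c__Clostridia'))

tree = lca_utils.build_tree([lin1, lin2])
lca, reason = lca_utils.find_lca(tree)

print(lca_utils.display_lineage(lca))   # expect: d__Bacteria;p__Firmicutes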
from sourmash.lca.lca_utils import LineagePair, display_lineage


def test_display_lineage_2():
    # a None rank displays as an empty string, preserving lineage position.
    x = [LineagePair('superkingdom', 'a'),
         LineagePair(None, ''),
         LineagePair('class', 'c')]

    assert display_lineage(x) == "a;;c", display_lineage(x)
def test_display_lineage_1():
    x = [LineagePair('superkingdom', 'a'),
         LineagePair('phylum', 'b')]

    assert display_lineage(x) == "a;b", display_lineage(x)
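# For reference: LineagePair in sourmash's lca_utils is (or behaves like) a
# two-field rank/name namedtuple, which is why the tests above can index a
# lineage and read .rank/.name directly. A stand-in definition for clarity:
from collections import namedtuple

LineagePairExample = namedtuple('LineagePair', ['rank', 'name'])

lin = (LineagePairExample('superkingdom', 'a'),
       LineagePairExample('phylum', 'b'))
assert lin[-1].rank == 'phylum' and lin[-1].name == 'b'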