def test_count_lca_for_assignments_abund_3():
    """Abundance-weighted LCA counts for two hashvals over two sibling lineages.

    One hashval (abundance 2) is assigned to both 'a;b;c' and 'a;b;d'; the
    other (abundance 5) is assigned to 'a;b;c' only.  The expected counts are
    5 on 'a;b;c', 2 on the shared prefix 'a;b', and nothing on 'a;b;d'.
    """
    ambiguous_hash = 12345678
    specific_hash = 87654321

    # per-hashval abundances used to weight the LCA counts
    hashval_counts = {ambiguous_hash: 2, specific_hash: 5}

    lineage_c = lca_utils.make_lineage('a;b;c')
    lineage_d = lca_utils.make_lineage('a;b;d')

    fake_db = FakeLCA_Database()
    fake_db._set_lineage_assignment(ambiguous_hash, {lineage_c, lineage_d})
    fake_db._set_lineage_assignment(specific_hash, {lineage_c})

    assignments = lca_utils.gather_assignments(hashval_counts, [fake_db])
    counts = count_lca_for_assignments(assignments, hashval_counts)
    print(counts)

    assert len(counts) == 2
    # the unambiguous hashval contributes its full abundance to 'a;b;c'
    assert counts[lineage_c] == 5
    # 'a;b;d' only ever appears alongside 'a;b;c', so it gets no count itself
    assert counts[lineage_d] == 0

    # the ambiguous hashval is credited to the shared prefix 'a;b'
    shared_prefix = lca_utils.make_lineage('a;b')
    assert counts[shared_prefix] == 2
def test_count_lca_for_assignments_abund_4():
    """Abundance-weighted LCA counts for three hashvals over three lineages.

    Assignments and abundances:
      * first hashval  (2): {'a;b;c', 'a;b;d'}   -> expected on 'a;b'
      * second hashval (5): {'a;b;c'}            -> expected on 'a;b;c'
      * third hashval  (3): {'a;b;d', 'a;b;d;e'} -> expected on 'a;b;d;e'
    """
    split_hash = 12345678
    single_hash = 87654321
    nested_hash = 34567891

    # per-hashval abundances used to weight the LCA counts
    hashval_counts = {split_hash: 2, single_hash: 5, nested_hash: 3}

    lineage_c = lca_utils.make_lineage('a;b;c')
    lineage_d = lca_utils.make_lineage('a;b;d')
    lineage_de = lca_utils.make_lineage('a;b;d;e')

    fake_db = FakeLCA_Database()
    fake_db._set_lineage_assignment(split_hash, {lineage_c, lineage_d})
    fake_db._set_lineage_assignment(single_hash, {lineage_c})
    fake_db._set_lineage_assignment(nested_hash, {lineage_d, lineage_de})

    assignments = lca_utils.gather_assignments(hashval_counts, [fake_db])
    counts = count_lca_for_assignments(assignments, hashval_counts)
    print(counts)

    assert len(counts) == 3
    # abundance of the hashval assigned to 'a;b;c' alone
    assert counts[lineage_c] == 5
    # 'a;b;d' paired with its own descendant 'a;b;d;e' is credited to the
    # descendant, so 'a;b;d' itself receives nothing
    assert counts[lineage_d] == 0
    assert counts[lineage_de] == 3

    # the hashval split between 'a;b;c' and 'a;b;d' lands on 'a;b'
    shared_prefix = lca_utils.make_lineage('a;b')
    assert counts[shared_prefix] == 2
def summarize(hashvals, dblist, threshold):
    """
    Classify 'hashvals' using the given list of databases.

    Insist on at least 'threshold' counts of a given lineage before taking
    it seriously.

    Parameters:
      hashvals  -- hash values to classify; passed straight through to
                   lca_utils.gather_assignments.
      dblist    -- list of LCA databases to query.
      threshold -- minimum count an LCA must have to be included.

    Returns a defaultdict(int) mapping lineage -> aggregated count, where
    'lineage' is a tuple of LineagePairs.  Each LCA's count is credited to
    the LCA itself and to every ancestor (prefix) of it, so a lineage's
    value is the total over everything at or below it.  Hashvals whose LCA
    is the empty lineage are recorded once under the empty tuple.
    """
    # gather assignments from across all the databases
    assignments = lca_utils.gather_assignments(hashvals, dblist)

    # now convert to trees -> do LCA & counts
    counts = lca_utils.count_lca_for_assignments(assignments)

    # ok, we now have the LCAs for each hashval, and their number
    # of counts. Now aggregate counts across the tree, going up from
    # the leaves.
    aggregated_counts = defaultdict(int)

    # most_common() yields entries in descending count order, so the first
    # entry below the threshold means all remaining ones are below it too.
    for lca, count in counts.most_common():
        if count < threshold:
            break

        if not lca:
            # Root-level (empty) LCA: there is nothing to climb, so record
            # it exactly once.
            aggregated_counts[lca] += count
            continue

        # climb from the lca to the root, crediting every ancestor.
        while lca:
            aggregated_counts[lca] += count
            lca = lca[:-1]

    return aggregated_counts
def test_gather_assignments_1():
    """gather_assignments returns the single lineage stored for one hashval."""
    query_hash = 12345678
    lineage = lca_utils.make_lineage('a;b;c')

    fake_db = FakeLCA_Database()
    fake_db._set_lineage_assignment(query_hash, {lineage})

    assignments = lca_utils.gather_assignments([query_hash], [fake_db])
    print(assignments)

    # the gathered assignment is exactly the set that was stored
    assert assignments[query_hash] == {lineage}
def test_count_lca_for_assignments_abund_1():
    """Abundance-weighted LCA count for one hashval with one lineage.

    A single hashval with abundance 3 assigned to 'a;b;c' should yield a
    single entry counting 3 for that lineage.
    """
    query_hash = 12345678

    # per-hashval abundance used to weight the LCA count
    hashval_counts = {query_hash: 3}

    lineage = lca_utils.make_lineage('a;b;c')

    fake_db = FakeLCA_Database()
    fake_db._set_lineage_assignment(query_hash, {lineage})

    assignments = lca_utils.gather_assignments(hashval_counts.keys(), [fake_db])
    counts = count_lca_for_assignments(assignments, hashval_counts)
    print(counts)

    assert len(counts) == 1
    assert counts[lineage] == 3
def test_gather_assignments_3():
    """gather_assignments with two hashvals and two lineages.

    The first hashval is stored with both lineages, the second with only
    one; each must come back with exactly the set it was stored with.
    """
    both_hash = 12345678
    single_hash = 87654321

    lineage_c = lca_utils.make_lineage('a;b;c')
    lineage_d = lca_utils.make_lineage('a;b;d')

    fake_db = FakeLCA_Database()
    fake_db._set_lineage_assignment(both_hash, {lineage_c, lineage_d})
    fake_db._set_lineage_assignment(single_hash, {lineage_c})

    queried = [both_hash, single_hash]
    assignments = lca_utils.gather_assignments(queried, [fake_db])
    print(assignments)

    assert assignments[both_hash] == {lineage_c, lineage_d}
    assert assignments[single_hash] == {lineage_c}
def test_count_lca_for_assignments_2():
    """Unweighted LCA count for one hashval assigned to two sibling lineages.

    A hashval assigned to both 'a;b;c' and 'a;b;d' should be counted once,
    on the shared prefix 'a;b', and not on either full lineage.
    """
    query_hash = 12345678

    lineage_c = lca_utils.make_lineage('a;b;c')
    lineage_d = lca_utils.make_lineage('a;b;d')

    fake_db = FakeLCA_Database()
    fake_db._set_lineage_assignment(query_hash, {lineage_c, lineage_d})

    assignments = lca_utils.gather_assignments([query_hash], [fake_db])
    counts = count_lca_for_assignments(assignments)
    print(counts)

    # neither of the two full lineages is credited directly
    assert counts[lineage_c] == 0
    assert counts[lineage_d] == 0

    assert len(counts) == 1

    # the single count sits on the shared prefix
    shared_prefix = lca_utils.make_lineage('a;b')
    assert counts[shared_prefix] == 1
def test_count_lca_for_assignments_abund_5():
    """Abundance-weighted LCA counts when one lineage extends the other.

    'a;b;d;e' is a more detailed version of 'a;b;d'.  The hashval assigned
    to both (abundance 2) is expected to be credited to the more detailed
    lineage, while the hashval assigned to 'a;b;d' alone (abundance 5) is
    credited to 'a;b;d'.
    """
    nested_hash = 12345678
    single_hash = 87654321

    # per-hashval abundances used to weight the LCA counts
    hashval_counts = {nested_hash: 2, single_hash: 5}

    lineage_d = lca_utils.make_lineage('a;b;d')
    lineage_de = lca_utils.make_lineage('a;b;d;e')

    fake_db = FakeLCA_Database()
    fake_db._set_lineage_assignment(nested_hash, {lineage_d, lineage_de})
    fake_db._set_lineage_assignment(single_hash, {lineage_d})

    assignments = lca_utils.gather_assignments(hashval_counts, [fake_db])
    counts = count_lca_for_assignments(assignments, hashval_counts)
    print(counts)

    assert len(counts) == 2
    # abundance of the hashval assigned to 'a;b;d' alone
    assert counts[lineage_d] == 5
    # the pair {'a;b;d', 'a;b;d;e'} collapses onto the deeper lineage
    assert counts[lineage_de] == 2