Пример #1
0
def test_searchfiles_contigs_just_search(location):
    """Write search and rank-search results through SearchFiles and compare
    the produced CSVs with the saved copies under test-data."""
    prefix = os.path.join(location, "pref")
    suffixes = (
        "ranksearch.csv",
        "ranksearch.matches.sig",
        "search.csv",
        "search.matches.sig",
        "unmatched.fq",
    )
    filelist = [f"{prefix}.contigs.{suffix}" for suffix in suffixes]

    sf = SearchFiles(prefix, search=True, gather=True)

    # two single-hash signatures whose lineages agree on 'a;b' and differ
    # in the last position
    first_hash, second_hash = 12345678, 87654321
    first_ident, second_ident = 'first', 'second'
    _, first_sig, first_lin = make_sig_and_lin([first_hash], first_ident,
                                               'a;b;c')
    _, second_sig, second_lin = make_sig_and_lin([second_hash], second_ident,
                                                 'a;b;f')

    # LCA database holding both signatures
    lca_db = LCA_Database(scaled=1, ksize=3)
    for sig, ident in ((first_sig, first_ident), (second_sig, second_ident)):
        lca_db.insert(sig, ident=ident)

    # lineage database holding both lineages
    lin_db = LineageDB()
    for ident, lin in ((first_ident, first_lin), (second_ident, second_lin)):
        lin_db.insert(ident, lin)

    # query made from both hash values
    query_mh = make_mh([first_hash, second_hash])
    results, rank_results = search_containment_at_rank(
        query_mh, lca_db, lin_db, "class")
    # the gather output is not inspected here; the call is kept so gather
    # still runs against the same query
    list(gather_at_rank(query_mh, lca_db, lin_db, "class"))

    # write both kinds of search result
    name, seq_len = 'name', 6
    for hit in results:
        sf.write_result(hit, name, seq_len, result_type="search")
    for hit in rank_results:
        sf.write_result(hit, name, seq_len, result_type="ranksearch")

    sf.close()

    # every expected output file must exist once the files are closed
    for path in filelist:
        assert os.path.exists(path)

    def read_csv_set(path):
        # parse one CSV file with the shared get_csv_set helper
        with open(path, "r") as handle:
            return get_csv_set(handle)

    # produced CSVs must match the saved reference CSVs
    for kind in ("search", "ranksearch"):
        produced = read_csv_set(f"{prefix}.contigs.{kind}.csv")
        saved = read_csv_set(
            utils.get_testfile(f"test-data/test.contigs.{kind}.csv"))
        assert saved == produced
Пример #2
0
def test_gather_guess_tax_at_rank_1():
    """One signature with one lineage: summarizing the class-level gather at
    phylum should give the 'a;b' lineage followed by 1.0."""
    ident = 'uniq'
    query_mh, sig, lineage = make_sig_and_lin([12345678], ident, 'a;b;c')

    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(sig, ident=ident)

    lin_db = LineageDB()
    lin_db.insert(ident, lineage)

    total_hashes = 1
    expected_lineage = lca_utils.make_lineage('a;b')

    class_hits = list(gather_at_rank(query_mh, lca_db, lin_db, "class"))
    phylum_results = gather_guess_tax_at_rank(
        class_hits, total_hashes, "phylum", minimum_matches=1)

    assert len(phylum_results) == 3

    # only the first two fields are checked
    assert phylum_results[0] == expected_lineage
    assert phylum_results[1] == 1.0
Пример #3
0
def test_gather_at_rank_3():
    """Two signatures with entirely different lineages: gather at class
    should return two results, one per lineage, each with 1 in field 1."""
    # first signature
    first_hash, first_ident = 12345678, 'first'
    _, first_sig, first_lin = make_sig_and_lin([first_hash], first_ident,
                                               'a;b;c')

    # second signature
    second_hash, second_ident = 87654321, 'second'
    _, second_sig, second_lin = make_sig_and_lin([second_hash], second_ident,
                                                 'd;e;f')

    # LCA database containing both signatures
    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(first_sig, ident=first_ident)
    lca_db.insert(second_sig, ident=second_ident)

    # lineage database containing both lineages
    lin_db = LineageDB()
    lin_db.insert(first_ident, first_lin)
    lin_db.insert(second_ident, second_lin)

    # query made from both hash values
    query_mh = make_mh([first_hash, second_hash])
    gather_results = list(gather_at_rank(query_mh, lca_db, lin_db, "class"))

    assert len(gather_results) == 2
    hit_a, hit_b = gather_results
    # result order is not fixed, so compare as sets
    assert {hit_a[0], hit_b[0]} == {first_lin, second_lin}
    assert {hit_a[1], hit_b[1]} == {1}
Пример #4
0
def test_gather_at_rank_2():
    """Two signatures sharing the full 'a;b;c' lineage: gather at class
    should return a single result for that lineage with 2 in field 1."""
    shared_lineage = 'a;b;c'

    # first signature
    first_hash, first_ident = 12345678, 'first'
    _, first_sig, first_lin = make_sig_and_lin([first_hash], first_ident,
                                               shared_lineage)

    # second signature
    second_hash, second_ident = 87654321, 'second'
    _, second_sig, second_lin = make_sig_and_lin([second_hash], second_ident,
                                                 shared_lineage)

    # LCA database containing both signatures
    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(first_sig, ident=first_ident)
    lca_db.insert(second_sig, ident=second_ident)

    # lineage database containing both lineages
    lin_db = LineageDB()
    lin_db.insert(first_ident, first_lin)
    lin_db.insert(second_ident, second_lin)

    # query made from both hash values
    query_mh = make_mh([first_hash, second_hash])
    gather_results = list(gather_at_rank(query_mh, lca_db, lin_db, "class"))

    assert len(gather_results) == 1
    only_hit = gather_results[0]
    assert only_hit[0] == first_lin
    assert only_hit[1] == 2
Пример #5
0
def test_gather_at_rank_1():
    """One signature with one lineage: gather at class should return exactly
    that lineage with 1 in field 1."""
    ident = 'uniq'
    query_mh, sig, lineage = make_sig_and_lin([12345678], ident, 'a;b;c')

    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(sig, ident=ident)

    lin_db = LineageDB()
    lin_db.insert(ident, lineage)

    gather_results = list(gather_at_rank(query_mh, lca_db, lin_db, "class"))

    assert len(gather_results) == 1
    only_hit = gather_results[0]
    assert only_hit[0] == lineage
    assert only_hit[1] == 1
Пример #6
0
def test_gather_guess_tax_at_each_rank_1():
    """Two signatures sharing the full 'a;b;c' lineage: the per-rank summary
    should have three entries, each with f_ident and f_major of 1.0."""
    shared_lineage = 'a;b;c'

    # first signature
    first_hash, first_ident = 12345678, 'first'
    _, first_sig, first_lin = make_sig_and_lin([first_hash], first_ident,
                                               shared_lineage)

    # second signature
    second_hash, second_ident = 87654321, 'second'
    _, second_sig, second_lin = make_sig_and_lin([second_hash], second_ident,
                                                 shared_lineage)

    # LCA database containing both signatures
    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(first_sig, ident=first_ident)
    lca_db.insert(second_sig, ident=second_ident)

    # lineage database containing both lineages
    lin_db = LineageDB()
    lin_db.insert(first_ident, first_lin)
    lin_db.insert(second_ident, second_lin)

    total_hashes = 2
    superk_lin = lca_utils.make_lineage('a')
    phylum_lin = lca_utils.make_lineage('a;b')

    # query made from both hash values
    query_mh = make_mh([first_hash, second_hash])
    gather_results = list(gather_at_rank(query_mh, lca_db, lin_db, "class"))
    rank_results = gather_guess_tax_at_each_rank(
        gather_results,
        total_hashes,
        minimum_matches=1,
        lowest_rank="class",
        taxlist=lca_utils.taxlist(include_strain=False))

    assert len(rank_results) == 3

    # one summary per rank, in the order checked here
    expected_lineages = (superk_lin, phylum_lin, first_lin)
    for position, expected_lin in enumerate(expected_lineages):
        assert rank_results[position] == RankSumGatherResult(
            lineage=expected_lin, f_ident=1.0, f_major=1.0)
Пример #7
0
def test_gather_guess_tax_at_each_rank_3():
    """Two signatures with entirely different lineages: the per-rank summary
    should have three entries following the 'd;e;f' lineage, each with
    f_ident of 1.0 and f_major of 0.5."""
    # first signature
    first_hash, first_ident = 12345678, 'first'
    _, first_sig, first_lin = make_sig_and_lin([first_hash], first_ident,
                                               'a;b;c')

    # second signature
    second_hash, second_ident = 87654321, 'second'
    _, second_sig, second_lin = make_sig_and_lin([second_hash], second_ident,
                                                 'd;e;f')

    # LCA database containing both signatures
    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(first_sig, ident=first_ident)
    lca_db.insert(second_sig, ident=second_ident)

    # lineage database containing both lineages
    lin_db = LineageDB()
    lin_db.insert(first_ident, first_lin)
    lin_db.insert(second_ident, second_lin)

    total_hashes = 2
    # NOTE: the 'd;e;f' lineage is the observed winner of the tie; it is an
    # open question whether that will always hold
    superk_lin = lca_utils.make_lineage('d')
    phylum_lin = lca_utils.make_lineage('d;e')

    # query made from both hash values
    query_mh = make_mh([first_hash, second_hash])
    gather_results = list(gather_at_rank(query_mh, lca_db, lin_db, "class"))
    rank_results = gather_guess_tax_at_each_rank(
        gather_results,
        total_hashes,
        minimum_matches=1,
        lowest_rank="class",
        taxlist=lca_utils.taxlist(include_strain=False))

    assert len(rank_results) == 3

    # one summary per rank, in the order checked here
    expected_lineages = (superk_lin, phylum_lin, second_lin)
    for position, expected_lin in enumerate(expected_lineages):
        assert rank_results[position] == RankSumGatherResult(
            lineage=expected_lin, f_ident=1.0, f_major=0.5)
Пример #8
0
def main(args):
    """Main entry point for scripting. Use cmdline for command line entry.

    Loads the taxonomy assignments, the genome signature and the matching
    signatures, builds an LCA database / lineage database from the
    non-identical matches, and then writes search and/or gather results
    through SearchFiles, per contig and/or for the whole genome.

    Attributes read from ``args``: genome, genome_sig, matches_sig,
    lineages_csv, alphabet, ksize, output_prefix, no_search,
    no_search_contigs, search_genome, gather, gather_min_matches.

    Returns 0 in all non-raising paths.
    """
    genomebase = os.path.basename(args.genome)
    match_rank = 'genus'

    # load taxonomy CSV
    tax_assign, _ = load_taxonomy_assignments(args.lineages_csv,
                                              start_column=2)
    print(f'loaded {len(tax_assign)} tax assignments.')

    # load the genome signature
    genome_sig = sourmash.load_one_signature(args.genome_sig,
                                             select_moltype=args.alphabet,
                                             ksize=args.ksize)

    # load all of the matches from search --containment in the database;
    # a signature file that fails to load is treated as "no matches"
    with open(args.matches_sig, 'rt') as fp:
        try:
            siglist = list(
                sourmash.load_signatures(fp, do_raise=True, quiet=False))
        except sourmash.exceptions.SourmashError:
            siglist = []
    print(f"loaded {len(siglist)} matches from '{args.matches_sig}'")

    # Hack for examining members of our search database: remove exact matches.
    new_siglist = []
    for ss in siglist:
        if genome_sig.similarity(ss) == 1.0:
            print(f'removing an identical match: {ss.name()}')
        else:
            new_siglist.append(ss)
    siglist = new_siglist

    if not siglist:
        # write empty files so snakemake workflows don't complain; exit.
        print('no non-identical matches for this genome, exiting.')
        if not args.no_search_contigs:
            sf = SearchFiles(args.output_prefix,
                             not args.no_search,
                             args.gather,
                             contigs=True)
            sf.close()
        if args.search_genome:
            gf = SearchFiles(args.output_prefix,
                             not args.no_search,
                             args.gather,
                             contigs=False)
            gf.close()
        return 0

    # construct a template minhash object that is copied (emptied) for each
    # contig below
    empty_mh = siglist[0].minhash.copy_and_clear()
    ksize = empty_mh.ksize
    scaled = empty_mh.scaled
    moltype = empty_mh.moltype

    # create empty LCA database to populate...
    lca_db = LCA_Database(ksize=ksize, scaled=scaled, moltype=moltype)
    lin_db = LineageDB()

    # ...with specific matches.
    for ss in siglist:
        ident = get_ident(ss)
        lineage = tax_assign[ident]

        lca_db.insert(ss, ident=ident)
        lin_db.insert(ident, lineage)

    print(f'loaded {len(siglist)} signatures & created LCA Database')
    print('')
    print(f'reading contigs from {genomebase}')

    screed_iter = screed.open(args.genome)
    genome_len = 0

    if not args.no_search_contigs:
        sf = SearchFiles(args.output_prefix,
                         not args.no_search,
                         args.gather,
                         contigs=True)

        # Pre-initialise the counter: a genome file with no records never
        # enters the loop, and the summary print below must still work.
        n_contigs = 0
        for n_contigs, record in enumerate(screed_iter, start=1):
            # look at each contig individually
            mh = empty_mh.copy_and_clear()
            mh.add_sequence(record.sequence, force=True)
            # search, optionally aggregate matched hashes to get containment at rank

            seq_len = len(record.sequence)
            genome_len += seq_len
            num_hashes = len(mh.hashes)

            if not args.no_search:
                search_results, search_rank_results = search_containment_at_rank(
                    mh, lca_db, lin_db, match_rank)

                if not search_results:
                    # write to unclassified
                    sf.unmatched.write(">" + record.name + "\n" +
                                       record.sequence + "\n")
                    continue  # if no search results, don't bother with gather

                # first, print normal search --containment results
                for sr in search_results:
                    sf.write_result(sr,
                                    record.name,
                                    seq_len,
                                    result_type="search")
                # now, print containment at rank results
                for sr in search_rank_results:
                    sf.write_result(sr,
                                    record.name,
                                    seq_len,
                                    result_type="ranksearch")

            if args.gather:
                # first, gather at match rank (default genus)
                gather_results = list(
                    gather_at_rank(mh, lca_db, lin_db, match_rank))
                # write standard gather_results?

                if not gather_results:
                    # write to unclassified. should only get here if no search OR gather results
                    sf.unmatched.write(">" + record.name + "\n" +
                                       record.sequence + "\n")
                else:
                    # next, summarize at higher ranks.
                    # taxlist() is called afresh each time rather than
                    # hoisted, in case it returns a one-shot iterator.
                    gather_taxonomy_per_rank = gather_guess_tax_at_each_rank(
                        gather_results,
                        num_hashes,
                        minimum_matches=args.gather_min_matches,
                        lowest_rank=match_rank,
                        taxlist=lca_utils.taxlist(include_strain=False))
                    #results = list of RankSumGatherResult = namedtuple('RankSumGatherResult', 'lineage, f_ident, f_major')

                    # write taxonomy out
                    for gr in gather_taxonomy_per_rank:
                        sf.write_result(gr,
                                        record.name,
                                        seq_len,
                                        result_type="rankgather")

        print(f"Processed {n_contigs} contigs.")
        # close contig files
        sf.close()

    if args.search_genome:
        gf = SearchFiles(args.output_prefix,
                         not args.no_search,
                         args.gather,
                         contigs=False)
        # MAG workflow
        entire_mh = genome_sig.minhash
        genome_name = genome_sig.name()
        num_hashes = len(entire_mh.hashes)
        if not genome_len:
            # contigs were not read above, so total the genome length here
            for record in screed_iter:
                genome_len += len(record.sequence)
        if not args.no_search:
            #results are guaranteed, otherwise would have exited before searching
            search_results, search_rank_results = search_containment_at_rank(
                entire_mh, lca_db, lin_db, match_rank)
            for sr in search_results:
                gf.write_result(sr,
                                genome_name,
                                genome_len,
                                result_type="search")
            for sr in search_rank_results:
                gf.write_result(sr,
                                genome_name,
                                genome_len,
                                result_type="ranksearch")
        if args.gather:
            gather_results = list(
                gather_at_rank(entire_mh, lca_db, lin_db, match_rank))
            # next, summarize at higher ranks
            gather_taxonomy_per_rank = gather_guess_tax_at_each_rank(
                gather_results,
                num_hashes,
                minimum_matches=args.gather_min_matches,
                lowest_rank=match_rank,
                taxlist=lca_utils.taxlist(include_strain=False))
            for gather_res in gather_taxonomy_per_rank:
                gf.write_result(gather_res,
                                genome_name,
                                genome_len,
                                result_type="rankgather")
        # close genome files
        gf.close()

    return 0