Example #1
def test_remove_taxa_aln_tre():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)

    len_aln_before = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_before = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_before = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_before = len(filteredScrape.data.tre.taxon_namespace)

    for tax in filteredScrape.data.aln.taxon_namespace:
        filteredScrape.data.remove_taxa_aln_tre(tax.label)
        break

    len_aln_after = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_after = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_after = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_after = len(filteredScrape.data.tre.taxon_namespace)

    assert len_aln_before != len_aln_after
    assert len_tre_before != len_tre_after
    assert namespace_before != namespace_after
    assert namespace_tre_before != namespace_tre_after
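The test excerpts in this collection reference module-level fixtures (paths, thresholds, a blacklist) and precooked pickle files that are defined outside the snippets shown here. A minimal sketch of that setup, with hypothetical values and the imports the snippets appear to rely on, could look like:

import json
import os
import pickle
import shutil
import sys

# Assumed physcraper imports; adjust if the package layout differs.
from physcraper import ConfigObj, IdDicts, FilterBlast

# Hypothetical module-level fixtures referenced by the test excerpts.
workdir = "tests/output/tiny_filter_test"   # scratch directory for the run
configfi = "tests/data/test.config"         # physcraper config file
absworkdir = os.path.abspath(workdir)
threshold = 2                               # max sequences to keep per taxon
treshold = threshold                        # alternate spelling used in some excerpts
downtorank = None                           # filter at species rank
blacklist = None                            # no accessions excluded by default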
Example #2
def test_sp_d():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)

    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
    gi_data_otu_dict_added = []
    for v in filteredScrape.data.otu_dict.values():
        if '^ncbi:gi' in v:
            if (v['^physcraper:status'].split(' ')[0]
                    not in filteredScrape.seq_filter):
                gi_data_otu_dict_added.append(v['^ncbi:gi'])
    gi_sp_d = []
    for key in filteredScrape.sp_d:
        v = filteredScrape.sp_d[key]
        for v2 in v:
            if '^ncbi:gi' in v2:
                gi_sp_d.append(v2['^ncbi:gi'])
    user_data_otu_dict = []
    for v in filteredScrape.data.otu_dict.values():
        if '^user:TaxonName' in v:
            user_data_otu_dict.append(v['^user:TaxonName'])
    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            if '^user:TaxonName' in v2:
                user_sp_d.append(v2['^user:TaxonName'])
    assert sorted(gi_data_otu_dict_added) == sorted(gi_sp_d)
    assert sorted(user_data_otu_dict) == sorted(user_sp_d)
Example #3
def test_remove_id_seq():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1

    #############################

    id_seq = ["TCGAAACCTGCATAGCAGAACGACCT-GTGAACATGTAAAAACAATTGGG-TGTTCTAAGTATCGGGCTCTTGTTCGATTTCTA-GGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGT-CTAAGGACGTCACGTCGACG-CAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGC--TT-GTTCCATGCATT--GCCGTT--CGCGGTGATTGCATTGAAACTTGCTTCTTTATAA-TTCATAAACGACTCTCGG-CAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCC-GAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCC-CCATCAC---ACCTCTT-GACGGGGATGTTTGAATGGGGA-CGGAGATTGGTCTCCCGTTCCT---AAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCT--------------TATCGAGTTGTGTG--TTCCAAGAAGTAA-GGAATATCTCTTTAACGACCC-TAAAGTGTTGTCTCATG-ACGATGCTTCGACTGC",
                "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC",
                "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC",
                "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC"
                ]

    # print("start test")
    tmp_dict = dict((taxon.label, filteredScrape.data.aln[taxon].symbols_as_string()) for taxon in filteredScrape.data.aln)
    old_seqs = tmp_dict.keys()
    avg_seqlen = sum(filteredScrape.data.orig_seqlen)/len(filteredScrape.data.orig_seqlen)
    assert filteredScrape.config.seq_len_perc <= 1
    seq_len_cutoff = avg_seqlen*filteredScrape.config.seq_len_perc
    count = 1

    for item in id_seq:
        if len(item.replace("-", "").replace("N", "")) > seq_len_cutoff:
            ott = "OTT_{}".format(count)
            count += 1
            otu_id = ott
            filteredScrape.data.otu_dict[otu_id] = {}
            filteredScrape.data.otu_dict[otu_id]['^ncbi:gi'] = 1061375300
            filteredScrape.data.otu_dict[otu_id]['^ncbi:accession'] = "KX494441"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:title'] = "some random title"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:taxon'] = 101010101  # placeholder taxon id (a leading-zero int literal is invalid in Python 3)
            filteredScrape.data.otu_dict[otu_id]['^ot:ottId'] = ott
            filteredScrape.data.otu_dict[otu_id]['^physcraper:status'] = "query"
            filteredScrape.data.otu_dict[otu_id]['^ot:ottTaxonName'] = "Senecio vulgaris"
            filteredScrape.data.otu_dict[otu_id]['^physcraper:last_blasted'] = "1800/01/01"
            filteredScrape.seq_dict_build(item, otu_id, tmp_dict)
    for tax in old_seqs:
        try:
            del tmp_dict[tax]
        except KeyError:
            pass
    filteredScrape.new_seqs_otu_id = tmp_dict
    expected_add = 1
    assert expected_add == len(filteredScrape.new_seqs_otu_id)
    sys.stdout.write("todo: add check that newly added seq are checked. they are, but there is no test")
Example #4
def test_sp_seq_d():

    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]

    gi_sp_d = []
    for key in filteredScrape.sp_d:
        v = filteredScrape.sp_d[key]
        for v2 in v:
            if '^physcraper:status' in v2:
                not_added = ['deleted', 'subsequence,', 'not']
                if v2['^physcraper:status'].split(' ')[0] not in not_added:
                    if '^ncbi:gi' in v2:
                        gi_sp_d.append(v2['^ncbi:accession'])
    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            if '^physcraper:status' in v2:
                if v2['^physcraper:status'].split(
                        ' ')[0] not in filteredScrape.seq_filter:
                    if v2['^physcraper:last_blasted'] != '1800/01/01':
                        if '^user:TaxonName' in v2:
                            user_sp_d.append(v2['^user:TaxonName'])
                        elif '^ot:ottTaxonName' in v2:
                            user_sp_d.append(v2['^ot:ottTaxonName'])
    filteredScrape.make_sp_seq_dict()
    gi_sp_seq_d = []
    ott_sp_seq_d = []
    for v in filteredScrape.sp_seq_d.values():
        for k in v.keys():
            # print(k)
            if len(k.split('.')) >= 2:
                # if type(k) == int:
                gi_sp_seq_d.append(k)
            else:
                # if type(k) == str or type(k) == unicode:
                ott_sp_seq_d.append(k)
    # print(len(ott_sp_seq_d), len(user_sp_d), len(gi_sp_seq_d), len(gi_sp_d))
    assert len(ott_sp_seq_d) == len(user_sp_d)
    assert len(gi_sp_seq_d) == len(gi_sp_d)
Example #5
def test_blacklist():

    workdir = "tests/output/test_blacklist"
    configfi = "tests/data/test.config"

    # make one run without blacklist
    debug("run without blacklist")
    blacklist = None
    noblack = os.path.join(workdir, "noblacklist")
    absworkdir = os.path.abspath(noblack)
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))

    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    noblackScrape = FilterBlast(data_obj, ids)
    noblackScrape._blasted = 1
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        # print(dest)
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)
    noblackScrape.read_blast_wrapper()
    noblackScrape.remove_identical_seqs()
    noblackScrape.generate_streamed_alignment()

    # one run with blacklist
    debug("run with blacklist")

    blacklist = ['JX895340.1']
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.blacklist = blacklist
    filteredScrape._blasted = 1
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()
    filteredScrape.generate_streamed_alignment()

    print("RUN TESTS!")
    gi_l = []
    gi_l_2 = []
    for tax in filteredScrape.data.tre.taxon_namespace:
        gi_id = filteredScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l.append(gi_id)
    print(gi_l)
    for tax in noblackScrape.data.tre.taxon_namespace:
        # print(filteredScrape.data.otu_dict[tax.label])
        gi_id = noblackScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l_2.append(gi_id)
    print(gi_l_2)
    for item in blacklist:
        assert item not in gi_l
        print("RUN TESTS2!")
        assert item in gi_l_2

        #     # print("seq was not added in blacklist run")
        #     print("inbetween step works")
# test if it removes blacklist gi from already added aln:
    print("run with later blacklist")

    # else:
    #     print("blacklist gi was added in previous run")
    # print("now we want to remove it.")
    len_before = (len(noblackScrape.data.tre.taxon_namespace))
    noblackScrape.blacklist = blacklist
    noblackScrape.generate_streamed_alignment()
    assert len_before - 1 == len(noblackScrape.data.tre.taxon_namespace)
Example #6
def test_add_local():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    if os.path.exists(otu_jsonfi_local):
        otu_json_local = json.load(open(otu_jsonfi_local))
    else:
        otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
        json.dump(otu_json_local, open(otu_jsonfi_local, "w"))

    sys.stdout.write("\ntest addLocal\n")

    # Prune sequences below a certain length threshold
    data_obj.prune_short()
    data_obj.write_files()
    data_obj.write_labelled(label='^ot:ottTaxonName', add_gb_id=True)
    data_obj.write_otus("otu_info", schema='table')
    data_obj.dump()

    sys.stdout.write("setting up id dictionaries\n")
    sys.stdout.flush()

    ids = IdDicts(conf, workdir=workdir)

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.blacklist = blacklist

    if add_local_seq is not None:
        filteredScrape.unpublished = True
    if filteredScrape.unpublished is True:  # use unpublished data
        # filteredScrape.unpublished = True
        filteredScrape.data.unpubl_otu_json = otu_json_local
        filteredScrape.write_unpubl_blastdb(add_local_seq)

        # filteredScrape.make_otu_dict_entry_unpubl()
        filteredScrape.run_blast_wrapper()
        filteredScrape.read_blast_wrapper()
        filteredScrape.remove_identical_seqs()

    test = False
    for key in filteredScrape.data.otu_dict.keys():
        if '^ncbi:title' in filteredScrape.data.otu_dict[key].keys():
            if filteredScrape.data.otu_dict[key][
                    '^ncbi:title'] == "unpublished":
                test = True
                break
    assert test
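test_add_local additionally assumes fixtures describing the local (unpublished) sequences; hypothetical values could be:

# Hypothetical fixtures for the local/unpublished-sequence test.
add_local_seq = "tests/data/local_seqs"                    # folder with unpublished fasta sequences
id_to_spn_addseq = "tests/data/local_seqs_id_to_spn.csv"   # maps local sequence ids to species names
otu_jsonfi_local = "tests/output/otu_dict_local.json"      # cached OtuJsonDict output for the local seqs
blacklist = None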
Example #7
def run_with_settings(settings):
    """looks for pickeled file to continue run, or builds and runs
    new analysis for as long as new seqs are found. 
    This uses the FilterBlast subclass to be able to filter the blast output."""
    debug("Debugging mode is on")
    if os.path.isfile("{}/scrape_checkpoint.p".format(settings.workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(
            open("{}/scrape_checkpoint.p".format(settings.workdir), "rb")
        )
        filteredScrape.repeat = 1
    else:
        conf = ConfigObj(settings.configfi)
        # print("config")
        debug(dir(conf))
        debug(conf.email)

        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=settings.seqaln, 
                                           mattype=settings.mattype,
                                           workdir=settings.workdir,
                                           treefile=settings.trfn,
                                           schema_trf=settings.schema_trf,
                                           otu_json=settings.spInfoDict,
                                           ingroup_mrca=None)

        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()

        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

        ids = IdDicts(conf, workdir=settings.workdir)

        filteredScrape = FilterBlast(data_obj, ids, settings)
        filteredScrape.add_setting_to_self(settings.downtorank, settings.threshold)

        filteredScrape.write_otu_info(settings.downtorank)

        if settings.add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.write_unpubl_blastdb(settings.add_unpubl_seq)
            filteredScrape.run_blast_wrapper(settings.delay)
            filteredScrape.local_otu_json = settings.id_to_spn_addseq_json
            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False

        # run the analyses
        if filteredScrape.unpublished is not True:
            filteredScrape.run_blast_wrapper(settings.delay)
            filteredScrape.read_blast_wrapper(blast_dir=settings.shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            if settings.threshold is not None:
                filteredScrape.sp_dict(settings.downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=settings.threshold, selectby=settings.selectby)
                filteredScrape.replace_new_seq()
            debug("from replace to streamed aln")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.dump()
    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        filteredScrape.run_blast_wrapper(settings.delay)
        filteredScrape.read_blast_wrapper(blast_dir=settings.shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        if settings.threshold is not None:
            filteredScrape.sp_dict(settings.downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=settings.threshold, selectby=settings.selectby)
            filteredScrape.replace_new_seq()
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
        filteredScrape.write_otu_info(settings.downtorank)
    return filteredScrape
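run_with_settings reads everything from a single settings object. A minimal stand-in with hypothetical attribute values, e.g. built with types.SimpleNamespace, might look like:

from types import SimpleNamespace

# Hypothetical settings object covering the attributes run_with_settings reads.
settings = SimpleNamespace(
    workdir="runs/senecio_its",
    configfi="tests/data/test.config",
    seqaln="tests/data/tiny_test_example/test.fas",
    mattype="fasta",
    trfn="tests/data/tiny_test_example/test.tre",
    schema_trf="newick",
    spInfoDict="otu_info.json",        # otu info produced earlier (e.g. with OtuJsonDict)
    downtorank=None,
    threshold=2,
    selectby="blast",
    delay=14,
    shared_blast_folder=None,
    add_unpubl_seq=None,
    id_to_spn_addseq_json=None,
)
# filteredScrape = run_with_settings(settings)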
Example #8
def filter_data_run(seqaln,
                    mattype,
                    trfn,
                    schema_trf,
                    workdir,
                    threshold,
                    spInfoDict,
                    configfi,
                    selectby="blast",
                    downtorank=None,
                    blacklist=None,
                    add_unpubl_seq=None,
                    id_to_spn_addseq_json=None,
                    ingroup_mrca=None,
                    shared_blast_folder=None):
    """looks for pickeled file to continue run, or builds and runs 
    new analysis for as long as new seqs are found. 
    This uses the FilterBlast subclass to be able to filter the blast output.
    """
    debug("Debugging mode is on")

    # debug(shared_blast_folder)
    # debug(some)
    # if _DEBUG_MK == 1:
    #     random.seed(3269235691)
    print(workdir)
    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), 'rb'))
        filteredScrape.repeat = 1   
    else:   
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=True)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln, 
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=spInfoDict,
                                           ingroup_mrca=ingroup_mrca)

        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir, mrca=ingroup_mrca)

        # Now combine the data, the ids, and the configuration into a single physcraper scrape object
        filteredScrape = FilterBlast(data_obj, ids)
        filteredScrape.add_setting_to_self(downtorank, threshold)
        filteredScrape.blacklist = blacklist
        if add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.unpublished = True
            filteredScrape.write_unpubl_blastdb(add_unpubl_seq)
            filteredScrape.run_blast_wrapper(delay=14)
            print("add unpubl otu json")
            filteredScrape.data.unpubl_otu_json = id_to_spn_addseq_json
            print(filteredScrape.data.unpubl_otu_json)

            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False
        else:
            # run the analysis
            sys.stdout.write("BLASTing input sequences\n")
            if shared_blast_folder:
                filteredScrape.blast_subdir = shared_blast_folder
            else:
                shared_blast_folder = None
            filteredScrape.run_blast_wrapper(delay=14)
            filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            sys.stdout.write("Filter the sequences\n")
            if threshold is not None:

                filteredScrape.sp_dict(downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
                filteredScrape.replace_new_seq()
            sys.stdout.write("Calculate the phylogeny\n")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.data.write_otus("otu_info", schema="table")
            filteredScrape.write_otu_info(downtorank)

            filteredScrape.dump()
    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        sys.stdout.write("BLASTing input sequences\n")
        if shared_blast_folder:
            filteredScrape.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        filteredScrape.run_blast_wrapper(delay=14)
        filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        sys.stdout.write("Filter the sequences\n")
        if threshold is not None:
            filteredScrape.sp_dict(downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
            filteredScrape.replace_new_seq()
        filteredScrape.data.prune_short(0.75)
        sys.stdout.write("calculate the phylogeny\n")
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
        filteredScrape.write_otu_info(downtorank)
        # print(some)
    filteredScrape.write_otu_info(downtorank)
    return filteredScrape
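A usage sketch for filter_data_run, with hypothetical input files and parameters:

import json

# Hypothetical call; all paths, the otu info dict, and the threshold are placeholders.
otu_json = json.load(open("tests/data/tiny_test_example/otu_dict.json"))
scrape = filter_data_run(seqaln="tests/data/tiny_test_example/test.fas",
                         mattype="fasta",
                         trfn="tests/data/tiny_test_example/test.tre",
                         schema_trf="newick",
                         workdir="runs/senecio_filtered",
                         threshold=2,
                         spInfoDict=otu_json,
                         configfi="tests/data/test.config",
                         selectby="blast",
                         downtorank=None)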
Example #9
def test_loop_for_write_blast_files():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.add_setting_to_self(downtorank, threshold)
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    # this is the code of the first part of how_many_sp_to_keep: if the threshold is bigger than the number of seqs for a species, just add all of them
    # print("run loop which we want to test")
    for key in filteredScrape.sp_d:
        if len(filteredScrape.sp_d[key]) > threshold:
            count_dict = filteredScrape.count_num_seq(key)
            if key in filteredScrape.sp_seq_d.keys():
                seq_present = count_dict["seq_present"]
                query_count = count_dict["query_count"]
                if seq_present >= 1 and seq_present < threshold and count_dict[
                        "new_taxon"] is False and query_count != 0:
                    if query_count + seq_present > threshold:
                        taxonfn = filteredScrape.loop_for_write_blast_files(
                            key)

# MAKE TEST FOR loop_for_write_blast_files

    for key in filteredScrape.sp_d:
        count = 0
        count_int = 0
        count_gi_file = 0
        count_str_file = 0
        db = False
        blasted = False
        if len(filteredScrape.sp_d[key]) > threshold:
            for sp_keys in filteredScrape.sp_seq_d[key].keys():
                if isinstance(sp_keys, str):
                    count += 1
                else:
                    count_int += 1
            folder = '{}/blast/'.format(filteredScrape.workdir)
            for the_file in os.listdir(folder):
                spn = the_file.split("_")[0]
                spn = "_".join(the_file.split("_")[0])
                file_type = the_file.split("_")[1]
                if spn == key and file_type == "db":  #
                    db = True
                    with open('{}/blast/{}'.format(filteredScrape.workdir,
                                                   the_file)) as f:
                        for line in f:
                            if line[0] == ">":
                                count_gi_file += 1
                if spn == key and file_type == "tobeblasted":
                    blasted = True
                    count_str_file += 1
            if blasted:
                if count + count_int != threshold:
                    assert count_str_file == count
            if db:
                if count + count_int != threshold:
                    assert count_gi_file == count_int
Example #10
def test_write_blast():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    for taxonID in filteredScrape.sp_d:
        if len(filteredScrape.sp_seq_d[taxonID]) > treshold:
            blast_seq = list(filteredScrape.sp_seq_d[taxonID].keys())[0]
            seq = filteredScrape.sp_seq_d[taxonID][blast_seq]
            local_blast.write_filterblast_files(workdir, taxonID, seq)
            blast_db = list(filteredScrape.sp_seq_d[taxonID].keys())[1:]
            for blast_key in blast_db:
                seq = filteredScrape.sp_seq_d[taxonID][blast_key]
                local_blast.write_filterblast_files(workdir,
                                                    blast_key,
                                                    seq,
                                                    db=True,
                                                    fn=str(taxonID))
            break

    blast_file_blast = "{}/blast/{}_tobeblasted".format(workdir, taxonID)
    # print(blast_file_blast)
    blast_file_db = "{}/blast/{}_db".format(workdir, taxonID)
    # print(blast_file_db, blast_file_blast)
    if os.path.exists(blast_file_blast):
        with open(blast_file_blast) as f:
            first_line = f.readline()
            assert len(first_line.strip()) != 0
    if os.path.exists(blast_file_db):
        with open(blast_file_db) as f:
            first_line = f.readline()
            assert len(first_line.strip()) != 0
Example #11
def test_add_all():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    filteredScrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
    for key in filteredScrape.sp_d:
        if len(filteredScrape.sp_d[key]) <= treshold:
            filteredScrape.add_all(key)
    treshold_undermin = 0
    for key in filteredScrape.sp_d:
        for key2 in filteredScrape.sp_d[key]:
            if len(filteredScrape.sp_d[key]) <= treshold:
                if '^physcraper:status' in key2:
                    if key2['^physcraper:status'].split(
                            ' ')[0] not in filteredScrape.seq_filter:
                        if key2['^physcraper:last_blasted'] == '1800/01/01':
                            treshold_undermin += 1
    add_all_thresholdmin = filteredScrape.filtered_seq
    assert treshold_undermin == len(add_all_thresholdmin)