def test_remove_identical_seqs():
    # `absworkdir` and `conf` were module-level names in the original test
    # file; the setup below mirrors the other tests here (the workdir path
    # is an assumption).
    workdir = "tests/output/test_remove_identical_seqs"
    configfi = "tests/data/test.config"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)

    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir

    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    # print("start")
    scraper = PhyscraperScrape(data_obj, ids)
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False
    scraper._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    #scraper.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    # print(scraper.ncbi_mrca)

    assert len(scraper.new_seqs) == 0
    assert len(scraper.data.aln) == 5
    assert len(scraper.new_seqs_otu_id) == 17
    # Now that we are pulling the full remote sequences, we don't have any
    # identical sequences left in this test.

    # TODO: find an example where we do get identical sequences and need to
    # discard them; a sketch of that filtering logic follows the commented
    # checks below.

    # seqset = set()
    # for otu in scraper.new_seqs_otu_id:
    #     seq = scraper.new_seqs_otu_id[otu]
    #     if seq in seqset:
    #         print(otu)
    #     seqset.add(seq)

    # Check that every new sequence is unique in the new seqs set and is not
    # a substring of another sequence.
    # for otu in scraper.new_seqs_otu_id:
    #     qseq = scraper.new_seqs_otu_id[otu]
    #     count = 0
    #     for seq in seqset:
    #         if qseq in seq:
    #             count += 1
    #     assert count == 1

    # for taxon in scraper.data.tre.taxon_namespace:
    #     assert taxon.label in scraper.data.otu_dict
    #     status = scraper.data.otu_dict[taxon.label].get("^physcraper:status")
    #     assert status in ("original", "query")
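
    # A minimal sketch of the filtering described above, assuming
    # new_seqs_otu_id maps OTU ids to plain sequence strings: keep only the
    # first OTU seen for each distinct sequence. With this precooked data no
    # candidate should be dropped, matching the assertion on new_seqs_otu_id
    # above. (Extending the membership test to substring containment would
    # also catch subsumed sequences.)
    kept = {}
    seen = set()
    for otu, seq in scraper.new_seqs_otu_id.items():
        if seq in seen:
            continue  # exact duplicate of an already-kept sequence
        seen.add(seq)
        kept[otu] = seq
    assert len(kept) == len(scraper.new_seqs_otu_id)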

    aln_path1 = scraper.data.write_aln()
    aln_path = scraper.write_all_unaligned('test.fas')
    scraper.align_query_seqs()
    assert len(scraper.data.aln) == 22


# Example #2
def test_write_outputinfo():
    workdir = "tests/output/test_write_output_files"
    configfi = "tests/data/test.config"
    downtorank = None
    absworkdir = os.path.abspath(workdir)

    fn_otu = os.path.join(absworkdir, "otu_seq_info.csv")
    fn_sampling = os.path.join(absworkdir, "taxon_sampling.csv")

    conf = physcraper.ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    # filteredScrape.sp_dict(downtorank)
    # filteredScrape.make_sp_seq_dict()

    filteredScrape.align_query_seqs()

    wrappers.write_out_files(filteredScrape, downtorank)

    with open(fn_otu) as fn:
        line = fn.readline()  # skip the header row
        cnt = 1
        while cnt <= 5:
            line = fn.readline()
            cnt += 1
            assert isinstance(line, str)
            assert len(line.split(",")) >= 2

    with open(fn_sampling) as fn:
        line = fn.readline()  # skip the header row
        cnt = 1
        while cnt <= 5:
            line = fn.readline()
            cnt += 1
            assert isinstance(line, str)
            assert len(line.split(",")) >= 2
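
    # A tighter variant of the same check using the standard csv module
    # (assumes both files are plain comma-separated text with a header row).
    import csv
    with open(fn_sampling) as fn:
        rows = list(csv.reader(fn))
    assert all(len(row) >= 2 for row in rows[1:6])  # first five data rows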


# Example #3
def test_internal_mpi():
    import pickle
    import sys
    import os
    import subprocess
    from physcraper import ConfigObj, PhyscraperScrape, IdDicts
    # `cd` (used below) is physcraper's change-directory context manager;
    # the import location is an assumption.
    from physcraper import cd
    from mpi4py import MPI

    # set up until test
    workdir = "tests/output/test_mpi_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)

    #load data
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper._blasted = 1

    # run needed functions
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    scraper.remove_identical_seqs()

    scraper.data.write_papara_files()
    scraper.align_query_seqs()
    scraper.place_query_seqs()
    scraper.est_full_tree()

    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(
        scraper.workdir, scraper.date))
    if not os.path.exists("{}/previous_run".format(scraper.workdir)):
        os.mkdir("{}/previous_run".format(scraper.workdir))
    os.system(
        "mv {}/papara_alignment.extended {}/previous_run/papara_alignment.extended"
        .format(scraper.workdir, scraper.workdir))

    # Size the MPI run from the SLURM environment. The fallback values are
    # an assumption so the test can also run outside a SLURM allocation.
    ntasks = int(os.environ.get("SLURM_NTASKS_PER_NODE", 2))
    nnodes = int(os.environ.get("SLURM_JOB_NUM_NODES", 1))
    env_var = nnodes * ntasks
    # env_var = os.environ.get('SLURM_JOB_CPUS_PER_NODE', 7)
    print(nnodes, ntasks, env_var)

    assert os.path.exists("{}/previous_run/papara_alignment.extended".format(
        scraper.workdir))
    with cd(scraper.workdir):
        print("run with mpi")
        subprocess.call([
            "mpiexec", "-n", "{}".format(env_var), "raxmlHPC-MPI-AVX2", "-m",
            "GTRCAT", "-s", "{}/previous_run/papara_alignment.extended".format(
                scraper.workdir), "-p", "1", "-f", "a", "-x", "1", "-#",
            "autoMRE", "-n", "all{}".format(scraper.date)
        ])
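        # Follow-up check; the output name is an assumption based on RAxML's
        # convention of writing RAxML_bestTree.<run name> (and, with "-f a",
        # RAxML_bipartitions.<run name>) into the current directory.
        assert os.path.exists("RAxML_bestTree.all{}".format(scraper.date))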