def test_run_raxml():
    """est_full_tree() should leave a RAxML best-tree file in the working dir.

    Uses the precooked data object and blast output so no remote calls happen.
    """
    work_dir = "tests/output/test_run_raxml"
    abs_work_dir = os.path.abspath(work_dir)
    run_conf = ConfigObj("tests/data/test.config", interactive=False)

    # Load the pickled data object and accession map fixtures.
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = abs_work_dir
    ids = IdDicts(run_conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    scraper._blasted = 1  # mark blasting as already done; we read precooked files
    scraper.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    scraper.est_full_tree()

    expected = "{}/RAxML_bestTree.{}".format(scraper.workdir, scraper.date)
    assert os.path.exists(expected)
def test_no_mrca():
    """With ingroup_mrca=None the scrape should infer mrca_ncbi 18794 and
    still recover the expected number of new sequences after dedup."""
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/test_mrcalist_local"
    configfi = "tests/data/test.config"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    ingroup_mrca = None

    # Set up the run directory.
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))

    scrape = PhyscraperScrape(data_obj, ids, ingroup_mrca)
    scrape.threshold = 5
    assert scrape.mrca_ncbi == 18794

    scrape._blasted = 1
    scrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    scrape.remove_identical_seqs()
    # Local vs. remote searches return different numbers of sequences.
    assert len(scrape.new_seqs_otu_id) in [23, 17]
def test_remove_identical_seqs():
    """remove_identical_seqs() keeps 38 of 40 new seqs at the default length
    cutoff, and only 36 when seq_len_perc is raised to 0.998.

    NOTE(review): a second function with this exact name is defined later in
    this file; pytest collects only the last definition, so one of the two
    should be renamed for both to run.
    """
    # NOTE(review): `absworkdir` and `conf` are not defined locally — presumably
    # module-level globals; confirm they exist at import time.
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    print("start")
    scraper = PhyscraperScrape(data_obj, ids)
    scraper.config.blast_loc = 'remote'
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False
    scraper._blasted = 1
    scraper.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")

    # BUG FIX: the original accumulated booleans into a*b*...*m and ended in a
    # malformed `assert` split across source lines; individual asserts report
    # exactly which condition failed.
    assert len(scraper.new_seqs) == 40
    assert len(scraper.data.aln) == 5
    assert len(scraper.new_seqs_otu_id) == 0
    scraper.remove_identical_seqs()
    assert len(scraper.new_seqs) == 40
    assert len(scraper.data.aln) == 5
    assert len(scraper.new_seqs_otu_id) == 38
    for taxon in scraper.data.tre.taxon_namespace:
        assert taxon.label in scraper.data.otu_dict
        status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
        assert status in ('original', 'query')

    # Second pass: a stricter seq-length percentage should drop more sequences.
    # Reload the data object because it is mutable and was changed above.
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    scraper2 = PhyscraperScrape(data_obj, ids)
    scraper2.config.blast_loc = 'remote'
    scraper2.ids.otu_rank = {}
    scraper2.config.gifilename = False
    assert len(scraper2.data.aln) == 5
    scraper2.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    scraper2.config.seq_len_perc = 0.998  # raise from the 75% default
    assert len(scraper2.new_seqs) == 40
    assert len(scraper2.new_seqs_otu_id) == 0
    scraper2.remove_identical_seqs()
    assert len(scraper2.new_seqs_otu_id) == 36
def test_sp_seq_d():
    """Cross-check make_sp_dict()/make_sp_seq_dict(): accession-keyed entries in
    sp_seq_d must match the GI-backed otus, and the remaining keys must match
    the user/OTT-named otus."""
    # NOTE(review): `workdir`, `configfi` and `downtorank` are not defined in
    # this function — presumably module-level globals; confirm.
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
    # Collect accessions of GI-backed otus whose status is not filtered out.
    gi_sp_d = []
    sp_d = filteredScrape.make_sp_dict()
    for key in sp_d:
        v = sp_d[key]
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2:
                not_added = ['deleted', 'subsequence,', 'not']
                if v2['^physcraper:status'].split(' ')[0] not in not_added:
                    if '^ncbi:gi' in v2:
                        gi_sp_d.append(v2['^ncbi:accession'])
    # Collect names of user-supplied / OTT-named otus that have been blasted
    # (last_blasted != the sentinel date) and pass the seq_filter.
    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2 or u'^physcraper:status' in v2:
                if v2['^physcraper:status'].split(
                        ' ')[0] not in filteredScrape.seq_filter:
                    if v2['^physcraper:last_blasted'] != '1800/01/01':
                        if '^user:TaxonName' in v2:
                            user_sp_d.append(v2['^user:TaxonName'])
                        elif '^ot:ottTaxonName' in v2:
                            user_sp_d.append(v2['^ot:ottTaxonName'])
    filteredScrape.make_sp_seq_dict()
    # Split sp_seq_d keys: accession-style keys contain a "." (e.g. "JX123.1"),
    # everything else is treated as a user/OTT label.
    gi_sp_seq_d = []
    ott_sp_seq_d = []
    for v in filteredScrape.sp_seq_d.values():
        for k in v.keys():
            # print(k)
            if len(k.split('.')) >= 2:
                # if type(k) == int:
                gi_sp_seq_d.append(k)
            else:
                # if type(k) == str or type(k) == unicode:
                ott_sp_seq_d.append(k)
    # print(len(ott_sp_seq_d), len(user_sp_d), len(gi_sp_seq_d), len(gi_sp_d))
    assert len(ott_sp_seq_d) == len(user_sp_d)
    assert len(gi_sp_seq_d) == len(gi_sp_d)
def test_read_local_blast():
    """Round-trip a local filter-blast for the first taxon whose seq dict is
    large enough, then check the produced xml output file is non-empty."""
    # NOTE(review): `configfi`, `absworkdir`, `workdir`, `downtorank` and
    # `treshold` (sic — likely a misspelling of "threshold") are not defined
    # locally; presumably module-level globals — confirm.
    conf = physcraper.ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    for taxonID in filteredScrape.sp_d:
        if len(filteredScrape.sp_seq_d[taxonID]) > treshold:
            # BUG FIX: dict.keys() is a view on Python 3 and cannot be indexed
            # or sliced (`.keys()[0]` raised TypeError); materialize it once.
            seq_keys = list(filteredScrape.sp_seq_d[taxonID].keys())
            blast_seq = seq_keys[0]
            seq = filteredScrape.sp_seq_d[taxonID][blast_seq]
            local_blast.write_filterblast_query(filteredScrape.workdir,
                                                taxonID,
                                                seq,
                                                fn=str(taxonID))
            # Accession-style keys (containing a ".") form the blast db,
            # skipping the query sequence itself.
            blast_db = [
                item for item in seq_keys[1:] if len(item.split(".")) >= 2
            ]
            for blast_key in blast_db:
                seq = filteredScrape.sp_seq_d[taxonID][blast_key]
                local_blast.write_filterblast_db(filteredScrape.workdir,
                                                 blast_key,
                                                 seq,
                                                 fn=str(taxonID))
            break  # only the first qualifying taxon is exercised

    # `taxonID` is the taxon the loop stopped on; reuse it for the blast run.
    blast_db = taxonID
    blast_seq = taxonID
    key = taxonID
    local_blast.run_filter_blast(filteredScrape.workdir, blast_seq, blast_db)
    local_blast.read_filter_blast(filteredScrape.workdir,
                                  filteredScrape.sp_seq_d[key], blast_db)
    blast_out = "{}/blast/output_{}_tobeblasted.xml".format(workdir, key)
    if os.path.exists(blast_out):
        with open(blast_out) as f:
            first_line = f.readline()
        assert len(first_line.strip()) != 0
def test_remove_identical_seqs():
    """Read precooked blast output, verify new-seq counts, then align.

    NOTE: this is the second definition of this name in the file; under pytest
    it shadows the earlier one of the same name.
    """
    # `absworkdir` and `conf` are assumed to be module-level globals.
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    id_dicts = IdDicts(conf, workdir=data_obj.workdir)
    id_dicts.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, id_dicts)
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False
    scraper._blasted = 1
    scraper.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")

    assert len(scraper.new_seqs) == 0
    assert len(scraper.data.aln) == 5
    assert len(scraper.new_seqs_otu_id) == 17
    # Full remote sequences currently contain no identical sequences in this
    # fixture. TODO: find an example where identical sequences must be
    # discarded, and assert that they are.

    aln_path1 = scraper.data.write_aln()
    aln_path = scraper.write_all_unaligned('test.fas')
    scraper.align_query_seqs()
    assert len(scraper.data.aln) == 22
def _assert_csv_has_data_rows(path, skip=5):
    """Read `skip` lines past the first and assert the next is a CSV row with
    at least two comma-separated fields."""
    with open(path) as fn:
        line = fn.readline()
        cnt = 1
        while cnt <= skip:
            line = fn.readline()
            cnt += 1
    assert isinstance(line, str)
    # BUG FIX: the original compared the list itself to 2
    # (`line.split(",") >= 2`), a TypeError on Python 3; compare its length.
    assert len(line.split(",")) >= 2


def test_write_outputinfo():
    """wrappers.write_out_files() should emit otu_seq_info.csv and
    taxon_sampling.csv containing real CSV data rows."""
    workdir = "tests/output/test_write_output_files"
    configfi = "tests/data/test.config"
    downtorank = None
    absworkdir = os.path.abspath(workdir)
    fn_otu = os.path.join(absworkdir, "otu_seq_info.csv")
    fn_sampling = os.path.join(absworkdir, "taxon_sampling.csv")

    conf = physcraper.ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.align_query_seqs()
    wrappers.write_out_files(filteredScrape, downtorank)

    _assert_csv_has_data_rows(fn_otu)
    _assert_csv_has_data_rows(fn_sampling)
def test_sp_d():
    """Set up a scrape from precooked fixtures and configure its status
    filter list (no assertions; smoke-tests the setup path)."""
    # `configfi` and `absworkdir` are assumed to be module-level globals.
    run_conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    id_dicts = IdDicts(run_conf, workdir=data_obj.workdir)
    id_dicts.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scrape = PhyscraperScrape(data_obj, id_dicts)
    scrape._blasted = 1
    scrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    scrape.remove_identical_seqs()
    scrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
def test_add_all():
    """make_sp_dict() should yield 5 taxa, none exceeding the threshold."""
    # `configfi`, `absworkdir` and `threshold` are assumed module-level globals.
    run_conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    id_dicts = IdDicts(run_conf, workdir=data_obj.workdir)
    id_dicts.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scrape = PhyscraperScrape(data_obj, id_dicts)
    scrape._blasted = 1
    scrape.threshold = threshold
    scrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    scrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
    scrape.remove_identical_seqs()

    sp_d = scrape.make_sp_dict(scrape.new_seqs_otu_id)
    assert len(sp_d) == 5
    for taxon in sp_d:
        assert len(sp_d[taxon]) <= threshold
# NOTE(review): this block appears to be a fragment — it references names
# (`conf`, `workdir`, `id_to_spn`, `otu_jsonfi`, `seqaln`, `mattype`, `trfn`,
# `schema_trf`) that are locals of test functions elsewhere in this file, and
# no enclosing `def` is visible here. It looks like the body of a helper that
# regenerates the precooked fixtures (tiny_dataobj.p / tiny_acc_map.p);
# confirm its intended scope before running.
ids = IdDicts(conf, workdir=workdir)
otu_json = OtuJsonDict(id_to_spn, ids)
with open(otu_jsonfi, "w") as outfile:
    json.dump(otu_json, outfile)
# Find the MRCA of all input otus via Open Tree of Life.
ottids = [otu_json[ite]['^ot:ottId'] for ite in otu_json]
mrca = opentree_helpers.get_mrca_ott(ottids)
data_obj = generate_ATT_from_files(seqaln=seqaln,
                                   mattype=mattype,
                                   workdir=workdir,
                                   config_obj=conf,
                                   treefile=trfn,
                                   schema_trf=schema_trf,
                                   otu_json=otu_jsonfi,
                                   ingroup_mrca=mrca)
data_obj.prune_short()
# Persist the fixtures consumed by the other tests in this file.
data_obj.dump(filename="tests/data/precooked/tiny_dataobj.p")
scraper = PhyscraperScrape(data_obj, ids)
scraper._blasted = 1
scraper.read_blast_wrapper(
    blast_dir="tests/data/precooked/fixed/tte_blast_files")
pickle.dump(ids.acc_ncbi_dict,
            open("tests/data/precooked/tiny_acc_map.p", "wb"))
# pickle.dump(scraper.acc_list_mrca, open("tests/data/precooked/acc_list_mrca.p", "wb"))
def test_internal_mpi():
    """End-to-end pipeline run followed by an MPI RAxML invocation.

    Requires a SLURM environment (SLURM_NTASKS_PER_NODE / SLURM_JOB_NUM_NODES)
    and the raxmlHPC-MPI-AVX2 binary on PATH; statement order matters because
    later steps consume files written by earlier ones.
    """
    import pickle
    import sys
    import os
    import subprocess
    from physcraper import ConfigObj, PhyscraperScrape, IdDicts
    from mpi4py import MPI
    # set up until test
    workdir = "tests/output/test_mpi_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)
    # load precooked data so no remote blast is needed
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper._blasted = 1
    # run needed functions: read blast, dedupe, align, place, estimate tree
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    scraper.remove_identical_seqs()
    scraper.data.write_papara_files()
    scraper.align_query_seqs()
    scraper.place_query_seqs()
    scraper.est_full_tree()
    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(
        scraper.workdir, scraper.date))
    # scraper.generate_streamed_alignment()
    # Move the papara alignment aside, as generate_streamed_alignment would.
    if not os.path.exists("{}/previous_run".format(scraper.workdir)):
        os.mkdir("{}/previous_run".format(scraper.workdir))
    os.system(
        "mv {}/papara_alignment.extended {}/previous_run/papara_alignment.extended"
        .format(scraper.workdir, scraper.workdir))
    cwd = os.getcwd()
    # os.chdir(scraper.workdir)
    # Derive the MPI rank count from the SLURM allocation; this raises
    # TypeError if the SLURM variables are unset (i.e. outside a SLURM job).
    ntasks = os.environ.get('SLURM_NTASKS_PER_NODE')
    nnodes = os.environ.get("SLURM_JOB_NUM_NODES")
    print(nnodes, ntasks)
    env_var = int(nnodes) * int(ntasks)
    #env_var = os.environ.get('SLURM_JOB_CPUS_PER_NODE', 7)
    print(env_var)
    assert os.path.exists("{}/previous_run/papara_alignment.extended".format(
        scraper.workdir))
    # Run RAxML under MPI from inside the work dir (cd is a project helper
    # context manager — presumably restores cwd on exit; confirm).
    with cd(scraper.workdir):
        print("run with mpi")
        subprocess.call([
            "mpiexec", "-n", "{}".format(env_var), "raxmlHPC-MPI-AVX2", "-m",
            "GTRCAT", "-s", "{}/previous_run/papara_alignment.extended".format(
                scraper.workdir), "-p", "1", "-f", "a", "-x", "1", "-#",
            "autoMRE", "-n", "all{}".format(scraper.date)
        ])
def test_remove_id_seq():
    """seq_dict_build() should keep exactly one new sequence out of four
    candidates (a gapped copy, its ungapped duplicate, and two identical
    longer super-sequences)."""
    # NOTE(review): `configfi` and `absworkdir` are not defined locally —
    # presumably module-level globals; confirm.
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    #############################
    id_seq = [
        "TCGAAACCTGCATAGCAGAACGACCT-GTGAACATGTAAAAACAATTGGG-TGTTCTAAGTATCGGGCTCTTGTTCGATTTCTA-GGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGT-CTAAGGACGTCACGTCGACG-CAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGC--TT-GTTCCATGCATT--GCCGTT--CGCGGTGATTGCATTGAAACTTGCTTCTTTATAA-TTCATAAACGACTCTCGG-CAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCC-GAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCC-CCATCAC---ACCTCTT-GACGGGGATGTTTGAATGGGGA-CGGAGATTGGTCTCCCGTTCCT---AAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCT--------------TATCGAGTTGTGTG--TTCCAAGAAGTAA-GGAATATCTCTTTAACGACCC-TAAAGTGTTGTCTCATG-ACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC"
    ]
    tmp_dict = dict(
        (taxon.label, filteredScrape.data.aln[taxon].symbols_as_string())
        for taxon in filteredScrape.data.aln)
    # BUG FIX: on Python 3, dict.keys() is a live view — deleting from
    # tmp_dict below while iterating it raises RuntimeError. Snapshot first.
    old_seqs = list(tmp_dict.keys())
    avg_seqlen = sum(filteredScrape.data.orig_seqlen) / len(
        filteredScrape.data.orig_seqlen)
    assert filteredScrape.config.seq_len_perc <= 1
    seq_len_cutoff = avg_seqlen * filteredScrape.config.seq_len_perc
    count = 1
    for item in id_seq:
        # Only candidates longer than the length cutoff (gaps/Ns stripped)
        # are registered and offered to seq_dict_build().
        if len(item.replace("-", "").replace("N", "")) > seq_len_cutoff:
            ott = "OTT_{}".format(count)
            count += 1
            otu_id = ott
            # BUG FIX: in the source this otu_dict entry was broken — the
            # '^ncbi:accession' assignment was split mid-statement, and
            # '^ncbi:taxon' used the Python-2 octal literal 0101010101, which
            # is a SyntaxError on Python 3 (0o101010101 preserves the value).
            filteredScrape.data.otu_dict[otu_id] = {
                '^ncbi:gi': 1061375300,
                '^ncbi:accession': "KX494441",
                '^ncbi:title': "some random title",
                '^ncbi:taxon': 0o101010101,
                '^ot:ottId': ott,
                '^physcraper:status': "query",
                '^ot:ottTaxonName': "Senecio vulgaris",
                '^physcraper:last_blasted': None,
            }
            filteredScrape.del_superseq = set()
            filteredScrape.seq_dict_build(item, otu_id, tmp_dict)
    # Drop the original alignment sequences; what remains is the new seqs.
    for tax in old_seqs:
        try:
            del tmp_dict[tax]
        except KeyError:
            pass
    filteredScrape.new_seqs_otu_id = tmp_dict
    expected_add = 1
    assert expected_add == len(filteredScrape.new_seqs_otu_id)
    sys.stdout.write(
        "todo: add check that newly added seq are checked. they are, but there is no test"
    )
def _stage_precooked_blast_files(abs_dir):
    """Copy the precooked blast files into abs_dir/current_blast_run/,
    creating the directory if needed."""
    # BUG FIX (cleanliness): this copy loop appeared twice verbatim in the
    # original test, with the loop-invariant destination recomputed on every
    # iteration; it is now computed once and shared.
    dest = os.path.join(abs_dir, "current_blast_run/")
    if not os.path.exists(dest):
        os.makedirs(dest)
    src = "tests/data/precooked/fixed/tte_blast_files"
    for file_name in os.listdir(src):
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)


def test_blacklist():
    """Run the pipeline twice — without and with a blacklist — and assert the
    blacklisted accession never appears in the final tree's taxa."""
    workdir = "tests/output/test_blacklist"
    configfi = "tests/data/test.config"

    # First run: no blacklist.
    blacklist = None
    noblack = os.path.join(workdir, "noblacklist")
    absworkdir = os.path.abspath(noblack)
    _stage_precooked_blast_files(absworkdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    noblackScrape = PhyscraperScrape(data_obj, ids)
    noblackScrape._blasted = 1
    noblackScrape.read_blast_wrapper()
    noblackScrape.remove_identical_seqs()
    new_test_generate_streamed_aln(noblackScrape)

    # Second run: blacklist one accession.
    blacklist = ['JX895340.1']
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist
    filteredScrape._blasted = 1
    _stage_precooked_blast_files(absworkdir)
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()
    new_test_generate_streamed_aln(filteredScrape)

    print("RUN TESTS!")
    gi_l = []
    gi_l_2 = []
    for tax in filteredScrape.data.tre.taxon_namespace:
        gi_id = filteredScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l.append(gi_id)
    print(gi_l)
    for tax in noblackScrape.data.tre.taxon_namespace:
        gi_id = noblackScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l_2.append(gi_id)
    print(gi_l_2)
    # The blacklisted accession must be absent from the blacklist run's tree.
    for item in blacklist:
        assert item not in gi_l