def test_no_mrca():
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/test_mrcalist_local"
    configfi = "tests/data/test.config"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    ingroup_mrca = None

    # set up the run
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    filteredScrape = PhyscraperScrape(data_obj, ids, ingroup_mrca)
    filteredScrape.threshold = 5
    assert filteredScrape.mrca_ncbi == 18794

    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape._blasted = 1
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    # local and remote searches return different numbers of sequences
    assert len(filteredScrape.new_seqs_otu_id) in [23, 17]
def test_remove_taxa_aln_tre():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)

    len_aln_before = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_before = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_before = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_before = len(filteredScrape.data.tre.taxon_namespace)

    # remove a single taxon from both the alignment and the tree
    for tax in filteredScrape.data.aln.taxon_namespace:
        filteredScrape.data.remove_taxa_aln_tre(tax.label)
        break

    len_aln_after = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_after = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_after = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_after = len(filteredScrape.data.tre.taxon_namespace)

    assert len_aln_before != len_aln_after
    assert len_tre_before != len_tre_after
    assert namespace_before != namespace_after
    assert namespace_tre_before != namespace_tre_after
def test_run_raxml():
    workdir = "tests/output/test_run_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)

    # load data
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper._blasted = 1

    # run needed functions
    # scraper.run_blast_wrapper()
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    # scraper.align_query_seqs()
    # scraper.place_query_seqs()
    scraper.est_full_tree()
    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(scraper.workdir, scraper.date))
def test_filter_length():
    workdir = "tests/output/test_selectbylength"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)
    threshold = 2
    selectby = "length"
    downtorank = "species"
    add_unpubl_seq = None
    blacklist = None
    id_to_spn_addseq_json = None
    ingroup_mrca = None
    shared_blast_folder = None

    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist

    sys.stdout.write("BLASTing input sequences\n")
    if shared_blast_folder:
        filteredScrape.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    # filteredScrape.run_blast_wrapper()
    filteredScrape.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.remove_identical_seqs()
    filteredScrape.dump()

    sys.stdout.write("Filter the sequences\n")
    length_unfiltered = len(filteredScrape.new_seqs_otu_id)
    # if threshold is not None:
    #     filteredScrape.filter_seqs()
    length_filtered = len(filteredScrape.new_seqs)
def test_add_all():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    filteredScrape.threshold = threshold
    filteredScrape.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]
    filteredScrape.remove_identical_seqs()

    sp_d = filteredScrape.make_sp_dict(filteredScrape.new_seqs_otu_id)
    assert len(sp_d) == 5
    for taxon in sp_d:
        assert len(sp_d[taxon]) <= threshold
def test_write_outputinfo():
    workdir = "tests/output/test_write_output_files"
    configfi = "tests/data/test.config"
    downtorank = None
    absworkdir = os.path.abspath(workdir)
    fn_otu = os.path.join(absworkdir, "otu_seq_info.csv")
    fn_sampling = os.path.join(absworkdir, "taxon_sampling.csv")

    conf = physcraper.ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    # filteredScrape.sp_dict(downtorank)
    # filteredScrape.make_sp_seq_dict()
    filteredScrape.align_query_seqs()
    wrappers.write_out_files(filteredScrape, downtorank)

    # both output tables should contain comma-separated rows
    with open(fn_otu) as fn:
        line = fn.readline()
        cnt = 1
        while cnt <= 5:
            line = fn.readline()
            cnt += 1
            assert type(line) == str
            assert len(line.split(",")) >= 2

    with open(fn_sampling) as fn:
        line = fn.readline()
        cnt = 1
        while cnt <= 5:
            line = fn.readline()
            cnt += 1
            assert type(line) == str
            assert len(line.split(",")) >= 2
def test_remove_identical_seqs():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False
    scraper._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # scraper.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    assert len(scraper.new_seqs) == 0
    assert len(scraper.data.aln) == 5
    assert len(scraper.new_seqs_otu_id) == 17
    # Now that we are pulling the full remote sequences, there are no identical sequences in this test.
    # TODO: find an example where we do get identical sequences and need to discard them.
    # seqset = set()
    # for otu in scraper.new_seqs_otu_id:
    #     seq = scraper.new_seqs_otu_id[otu]
    #     if seq in seqset:
    #         print(otu)
    #     seqset.add(seq)

    # Check that every new sequence is unique in the new seqs set and is not a substring of another sequence.
    # for otu in scraper.new_seqs_otu_id:
    #     qseq = scraper.new_seqs_otu_id[otu]
    #     count = 0
    #     for seq in seqset:
    #         if qseq in seq:
    #             count += 1
    #     assert count == 1

    # for taxon in scraper.data.tre.taxon_namespace:
    #     assert taxon.label in scraper.data.otu_dict
    #     status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
    #     assert status in ('original', 'query')

    aln_path1 = scraper.data.write_aln()
    aln_path = scraper.write_all_unaligned('test.fas')
    scraper.align_query_seqs()
    assert len(scraper.data.aln) == 22
def test_sp_d():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]
def test_internal_mpi():
    import pickle
    import sys
    import os
    import subprocess
    from physcraper import ConfigObj, PhyscraperScrape, IdDicts
    from mpi4py import MPI

    # set up the run
    workdir = "tests/output/test_mpi_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)

    # load data
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper._blasted = 1

    # run needed functions
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    scraper.remove_identical_seqs()
    scraper.data.write_papara_files()
    scraper.align_query_seqs()
    scraper.place_query_seqs()
    scraper.est_full_tree()
    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(scraper.workdir, scraper.date))

    if not os.path.exists("{}/previous_run".format(scraper.workdir)):
        os.mkdir("{}/previous_run".format(scraper.workdir))
    os.system("mv {}/papara_alignment.extended {}/previous_run/papara_alignment.extended".format(
        scraper.workdir, scraper.workdir))

    cwd = os.getcwd()
    # os.chdir(scraper.workdir)

    # determine how many MPI ranks are available from the SLURM environment
    ntasks = os.environ.get('SLURM_NTASKS_PER_NODE')
    nnodes = os.environ.get("SLURM_JOB_NUM_NODES")
    print(nnodes, ntasks)
    env_var = int(nnodes) * int(ntasks)
    # env_var = os.environ.get('SLURM_JOB_CPUS_PER_NODE', 7)
    print(env_var)

    assert os.path.exists("{}/previous_run/papara_alignment.extended".format(scraper.workdir))
    with cd(scraper.workdir):
        print("run with mpi")
        subprocess.call(["mpiexec", "-n", "{}".format(env_var), "raxmlHPC-MPI-AVX2",
                         "-m", "GTRCAT",
                         "-s", "{}/previous_run/papara_alignment.extended".format(scraper.workdir),
                         "-p", "1", "-f", "a", "-x", "1", "-#", "autoMRE",
                         "-n", "all{}".format(scraper.date)])
def test_read_local_blast():
    conf = physcraper.ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    # prepare the test: write a query file and a local blast database for the first
    # taxon that has more sequences than the threshold
    for taxonID in filteredScrape.sp_d:
        if len(filteredScrape.sp_seq_d[taxonID]) > treshold:
            blast_seq = list(filteredScrape.sp_seq_d[taxonID].keys())[0]
            seq = filteredScrape.sp_seq_d[taxonID][blast_seq]
            local_blast.write_filterblast_query(filteredScrape.workdir, taxonID, seq, fn=str(taxonID))
            blast_db = [item for item in list(filteredScrape.sp_seq_d[taxonID].keys())[1:]
                        if len(item.split(".")) >= 2]
            for blast_key in blast_db:
                seq = filteredScrape.sp_seq_d[taxonID][blast_key]
                local_blast.write_filterblast_db(filteredScrape.workdir, blast_key, seq, fn=str(taxonID))
            break

    blast_db = taxonID
    blast_seq = taxonID
    key = taxonID
    local_blast.run_filter_blast(filteredScrape.workdir, blast_seq, blast_db)
    local_blast.read_filter_blast(filteredScrape.workdir, filteredScrape.sp_seq_d[key], blast_db)
    blast_out = "{}/blast/output_{}_tobeblasted.xml".format(workdir, key)
    if os.path.exists(blast_out):
        with open(blast_out) as f:
            first_line = f.readline()
        assert len(first_line.strip()) != 0
data_obj = generate_ATT_from_phylesystem(aln,
                                         "tmp",
                                         study_id=study_id,
                                         tree_id=tree_id,
                                         phylesystem_loc=conf.phylesystem_loc)
data_obj.prune_short()
data_obj.write_files()
data_obj.write_labelled()

ids = IdDicts(conf, "tmp")

scraper = PhyscraperScrape(data_obj, ids, conf)
scraper.run_blast()
scraper.read_blast()
scraper.remove_identical_seqs()
scraper.generate_streamed_alignment()

scraper.run_blast()
scraper.read_blast()
scraper.remove_identical_seqs()
scraper.generate_streamed_alignment()

# otu_json = "tests/minitest_otu.json"
# treefile = "tests/minitest.tre"
# info_obj2 = StudyInfo(seqaln,
#                       mattype,
def PS_standard_run(data_obj, ids, shared_blast_folder):
    """This is the standard mode for a Physcraper run:
    update aln and tre as long as new seqs are found, no filtering.

    :param data_obj: ATT object
    :param ids: IdDict object
    :param shared_blast_folder: path to folder for shared blast runs
    :return: PhyscraperScrape object after the run
    """
    if os.path.isfile("{}/scrape_checkpoint.p".format(data_obj.workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        scraper = pickle.load(open("{}/scrape_checkpoint.p".format(data_obj.workdir), 'rb'))
        scraper.repeat = 1
    else:
        scraper = PhyscraperScrape(data_obj, ids)
    # run the analyses
    if shared_blast_folder:
        scraper.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    scraper.run_blast_wrapper()
    scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()
    scraper.dump("scrape_checkpoint.p")
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper()
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
        scraper.dump()
    write_out_files(scraper)
    writeinfofiles.get_additional_GB_info(scraper)
    return scraper
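def _example_PS_standard_run():
    """Hedged usage sketch for PS_standard_run, not part of the test suite.

    A minimal sketch assuming the precooked test data and config shipped in
    tests/data; the output directory name is made up for illustration.
    """
    conf = ConfigObj("tests/data/test.config", interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", "rb"))
    data_obj.workdir = os.path.abspath("tests/output/example_ps_standard_run")
    ids = IdDicts(conf, workdir=data_obj.workdir)
    # blast, add new sequences, and rebuild the tree until no new sequences are found
    return PS_standard_run(data_obj, ids, shared_blast_folder=None)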
def standard_run(study_id, tree_id, seqaln, mattype, workdir, configfi):
    if os.path.isfile("{}/scrape.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile\n")
        scraper = pickle.load(open("{}/scrape.p".format(workdir), 'rb'))
        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi)
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc)
        # Prune sequences below a certain length threshold.
        # This is particularly important when using loci that have been de-concatenated,
        # as some are zero length, which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled()
        # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
        ids = IdDicts(conf, workdir="example")
        # Now combine the data, the ids, and the configuration into a single physcraper scrape object
        scraper = PhyscraperScrape(data_obj, ids, conf)
        # run the analyses
        scraper.run_blast()
        scraper.read_blast()
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.run_blast()
        scraper.read_blast()
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
def test_remove_id_seq():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1

    # four candidate sequences: one gapped sequence, its degapped equivalent, and two
    # identical longer versions of it; only one should survive the identity/subsequence filtering
    id_seq = [
        "TCGAAACCTGCATAGCAGAACGACCT-GTGAACATGTAAAAACAATTGGG-TGTTCTAAGTATCGGGCTCTTGTTCGATTTCTA-GGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGT-CTAAGGACGTCACGTCGACG-CAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGC--TT-GTTCCATGCATT--GCCGTT--CGCGGTGATTGCATTGAAACTTGCTTCTTTATAA-TTCATAAACGACTCTCGG-CAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCC-GAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCC-CCATCAC---ACCTCTT-GACGGGGATGTTTGAATGGGGA-CGGAGATTGGTCTCCCGTTCCT---AAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCT--------------TATCGAGTTGTGTG--TTCCAAGAAGTAA-GGAATATCTCTTTAACGACCC-TAAAGTGTTGTCTCATG-ACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC"
    ]

    tmp_dict = dict((taxon.label, filteredScrape.data.aln[taxon].symbols_as_string())
                    for taxon in filteredScrape.data.aln)
    old_seqs = list(tmp_dict.keys())
    avg_seqlen = sum(filteredScrape.data.orig_seqlen) / len(filteredScrape.data.orig_seqlen)
    assert filteredScrape.config.seq_len_perc <= 1
    seq_len_cutoff = avg_seqlen * filteredScrape.config.seq_len_perc

    count = 1
    for item in id_seq:
        if len(item.replace("-", "").replace("N", "")) > seq_len_cutoff:
            ott = "OTT_{}".format(count)
            count += 1
            otu_id = ott
            filteredScrape.data.otu_dict[otu_id] = {}
            filteredScrape.data.otu_dict[otu_id]['^ncbi:gi'] = 1061375300
            filteredScrape.data.otu_dict[otu_id]['^ncbi:accession'] = "KX494441"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:title'] = "some random title"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:taxon'] = 101010101  # dummy NCBI taxon id
            filteredScrape.data.otu_dict[otu_id]['^ot:ottId'] = ott
            filteredScrape.data.otu_dict[otu_id]['^physcraper:status'] = "query"
            filteredScrape.data.otu_dict[otu_id]['^ot:ottTaxonName'] = "Senecio vulgaris"
            filteredScrape.data.otu_dict[otu_id]['^physcraper:last_blasted'] = None
            filteredScrape.del_superseq = set()
            filteredScrape.seq_dict_build(item, otu_id, tmp_dict)

    # drop the original sequences so only the newly added ones remain
    for tax in old_seqs:
        try:
            del tmp_dict[tax]
        except KeyError:
            pass
    filteredScrape.new_seqs_otu_id = tmp_dict

    expected_add = 1
    assert expected_add == len(filteredScrape.new_seqs_otu_id)
    sys.stdout.write("TODO: add a check that newly added sequences are validated; "
                     "they are, but there is no test\n")
def test_blacklist():
    workdir = "tests/output/test_blacklist"
    configfi = "tests/data/test.config"

    # make one run without a blacklist
    blacklist = None
    noblack = os.path.join(workdir, "noblacklist")
    absworkdir = os.path.abspath(noblack)
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))

    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    noblackScrape = PhyscraperScrape(data_obj, ids)
    noblackScrape._blasted = 1
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)
    noblackScrape.read_blast_wrapper()
    noblackScrape.remove_identical_seqs()
    new_test_generate_streamed_aln(noblackScrape)

    # one run with a blacklisted accession
    blacklist = ['JX895340.1']
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist
    filteredScrape._blasted = 1
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()
    new_test_generate_streamed_aln(filteredScrape)

    # check that the blacklisted accession is absent from the blacklist run
    gi_l = []
    gi_l_2 = []
    for tax in filteredScrape.data.tre.taxon_namespace:
        gi_id = filteredScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l.append(gi_id)
    print(gi_l)
    for tax in noblackScrape.data.tre.taxon_namespace:
        gi_id = noblackScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l_2.append(gi_id)
    print(gi_l_2)
    for item in blacklist:
        assert item not in gi_l
ids = IdDicts(conf, workdir=workdir)

otu_json = OtuJsonDict(id_to_spn, ids)
with open(otu_jsonfi, "w") as outfile:
    json.dump(otu_json, outfile)

ottids = [otu_json[ite]['^ot:ottId'] for ite in otu_json]
mrca = opentree_helpers.get_mrca_ott(ottids)

data_obj = generate_ATT_from_files(seqaln=seqaln,
                                   mattype=mattype,
                                   workdir=workdir,
                                   config_obj=conf,
                                   treefile=trfn,
                                   schema_trf=schema_trf,
                                   otu_json=otu_jsonfi,
                                   ingroup_mrca=mrca)
data_obj.prune_short()
data_obj.dump(filename="tests/data/precooked/tiny_dataobj.p")

scraper = PhyscraperScrape(data_obj, ids)
scraper._blasted = 1
scraper.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
pickle.dump(ids.acc_ncbi_dict, open("tests/data/precooked/tiny_acc_map.p", "wb"))
# pickle.dump(scraper.acc_list_mrca, open("tests/data/precooked/acc_list_mrca.p", "wb"))
def test_remove_identical_seqs():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    print("start")
    scraper = PhyscraperScrape(data_obj, ids)
    scraper.config.blast_loc = 'remote'
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False
    scraper._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # scraper.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    a = len(scraper.new_seqs) == 40
    b = len(scraper.data.aln) == 5
    c = len(scraper.new_seqs_otu_id) == 0
    scraper.remove_identical_seqs()
    d = len(scraper.new_seqs) == 40
    e = len(scraper.data.aln) == 5
    f = len(scraper.new_seqs_otu_id) == 38
    g = 1
    for taxon in scraper.data.tre.taxon_namespace:
        h = taxon.label in scraper.data.otu_dict
        g = g * h
        status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
        i = status in ('original', 'query')
        g = g * i

    # Second test checks that the sequence length percentage affects the results
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))  # reload because the data object is mutable
    data_obj.workdir = absworkdir
    scraper2 = PhyscraperScrape(data_obj, ids)
    scraper2.config.blast_loc = 'remote'
    scraper2.ids.otu_rank = {}
    scraper2.config.gifilename = False
    j = len(scraper2.data.aln) == 5
    # scraper2.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper2.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    scraper2.config.seq_len_perc = 0.998  # change seq len percentage from the default of 75%
    k = len(scraper2.new_seqs) == 40
    l = len(scraper2.new_seqs_otu_id) == 0
    scraper2.remove_identical_seqs()
    # print(len(scraper.new_seqs_otu_id), 38)
    # print(len(scraper2.new_seqs_otu_id), 36)
    m = len(scraper2.new_seqs_otu_id) == 36
    assert a * b * c * d * e * f * g * h * i * j * k * l * m == True
def own_data_run(seqaln,
                 mattype,
                 trfn,
                 schema_trf,
                 workdir,
                 sp_info_jsonfi,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """This is the wrapper function to start a PhyScraper run with your own data.

    You need:
        seqaln = path to the sequence alignment file
        mattype = format name of your alignment
        trfn = path to the file with the phylogeny to update
        schema_trf = format type of your phylogeny
        workdir = directory where your analysis files will be stored
        sp_info_jsonfi = json file holding the otu_dict, generated by the OtuJsonDict
                         function (usually just leave it as in the example scripts)
        configfi = path to your config file
        ingroup_mrca = optional; to limit the run to a certain clade, give the OpenTree ID here;
                       it can be obtained by running: python scripts/get_ott.py ingroup_name
        shared_blast_folder = optional; to share blast searches across runs (see documentation),
                              give the path to the folder with the shared runs
    """
    debug("Debugging mode is on")
    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: ATT\n")
        scraper = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), "rb"))
        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=False)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln,
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=sp_info_jsonfi,
                                           ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        sys.stdout.write("setting up ID dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        scraper = PhyscraperScrape(data_obj, ids)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        # run the analyses
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.run_blast_wrapper(delay=14)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    return 1
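def _example_own_data_run():
    """Hedged usage sketch for own_data_run, not part of the test suite.

    A minimal sketch: the input paths reuse the tiny test example shipped with the
    repository; the otu json path and output directory are assumptions for illustration.
    """
    return own_data_run(seqaln="tests/data/tiny_test_example/test.fas",
                        mattype="fasta",
                        trfn="tests/data/tiny_test_example/test.tre",
                        schema_trf="newick",
                        workdir="tests/output/example_own_data_run",
                        sp_info_jsonfi="tests/output/example_own_data_run/otu_dict.json",
                        configfi="tests/data/test.config",
                        ingroup_mrca=None,
                        shared_blast_folder=None)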
def test_sp_seq_d():
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]

    # collect the accessions of all GenBank sequences that were not filtered out
    gi_sp_d = []
    sp_d = filteredScrape.make_sp_dict()
    for key in sp_d:
        v = sp_d[key]
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2:
                not_added = ['deleted', 'subsequence,', 'not']
                if v2['^physcraper:status'].split(' ')[0] not in not_added:
                    if '^ncbi:gi' in v2:
                        gi_sp_d.append(v2['^ncbi:accession'])

    # collect the taxon names of the original, user-supplied sequences
    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2 or u'^physcraper:status' in v2:
                if v2['^physcraper:status'].split(' ')[0] not in filteredScrape.seq_filter:
                    if v2['^physcraper:last_blasted'] != '1800/01/01':
                        if '^user:TaxonName' in v2:
                            user_sp_d.append(v2['^user:TaxonName'])
                        elif '^ot:ottTaxonName' in v2:
                            user_sp_d.append(v2['^ot:ottTaxonName'])

    # keys in sp_seq_d are either accession numbers (contain a '.') or taxon names
    filteredScrape.make_sp_seq_dict()
    gi_sp_seq_d = []
    ott_sp_seq_d = []
    for v in filteredScrape.sp_seq_d.values():
        for k in v.keys():
            if len(k.split('.')) >= 2:
                gi_sp_seq_d.append(k)
            else:
                ott_sp_seq_d.append(k)
    assert len(ott_sp_seq_d) == len(user_sp_d)
    assert len(gi_sp_seq_d) == len(gi_sp_d)
def test_add_local():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    if not os.path.exists(workdir):
        os.makedirs(workdir)
    if os.path.exists(otu_jsonfi_local):
        otu_json_local = json.load(open(otu_jsonfi_local))
    else:
        otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
        json.dump(otu_json_local, open(otu_jsonfi_local, "w"))

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist

    if add_local_seq is not None:
        filteredScrape.unpublished = True
    if filteredScrape.unpublished is True:  # use unpublished data
        filteredScrape.data.unpubl_otu_json = otu_json_local
        filteredScrape.write_unpubl_blastdb(add_local_seq)
        # filteredScrape.make_otu_dict_entry_unpubl()

    filteredScrape.run_blast_wrapper()
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()

    # at least one of the added sequences must be flagged as unpublished
    test = False
    for key in filteredScrape.data.otu_dict.keys():
        if '^ncbi:title' in filteredScrape.data.otu_dict[key].keys():
            if filteredScrape.data.otu_dict[key]['^ncbi:title'] == "unpublished":
                test = True
                break
    assert test == True
def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """Looks for a json file to continue a run, or builds and runs a new analysis
    for as long as new sequences are found.

    This is the wrapper function to start a PhyScraper run with tree and alignment
    ids from Open Tree of Life.

    You need:
        study_id = Open Tree study ID
        tree_id = ID of the phylogeny to update
        seqaln = ID of the alignment file
        mattype = format name of your alignment
        workdir = directory where your analysis files will be stored
        configfi = path to your config file
        ingroup_mrca = optional; define the mrca by supplying the Open Tree of Life
                       identifier of the clade of interest
        shared_blast_folder = optional; to share blast searches across runs (see
                              documentation), give the path to the folder with the shared runs
    """
    debug("Debugging mode is on")
    conf = ConfigObj(configfi, interactive=False)
    if os.path.isfile("{}/att_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading data object from pickle file\n")
        data_obj = pickle.load(open("{}/att_checkpoint.p".format(workdir), "rb"))
        # scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc,
                                                 ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold.
        # This is particularly important when using loci that have been de-concatenated,
        # as some are zero length, which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
    # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
    if os.path.isfile(conf.id_pickle):
        sys.stdout.write("Reloading id dicts from {}\n".format(conf.id_pickle))
        ids = pickle.load(open(conf.id_pickle, "rb"))
    else:
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        ids.dump()
    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    scraper = PhyscraperScrape(data_obj, ids)
    # run the analyses
    if shared_blast_folder:
        scraper.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    scraper.run_blast_wrapper(delay=14)
    scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    # scraper.write_otu_info()
    return scraper
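def _example_standard_run():
    """Hedged usage sketch for standard_run, not part of the test suite.

    A minimal sketch: the study and tree IDs below are placeholders and the alignment
    path reuses the tiny test example; substitute your own Open Tree IDs and data
    for a real run.
    """
    return standard_run(study_id="pg_XXX",        # placeholder Open Tree study ID
                        tree_id="treeXXXX",       # placeholder Open Tree tree ID
                        seqaln="tests/data/tiny_test_example/test.fas",
                        mattype="fasta",
                        workdir="tests/output/example_standard_run",
                        configfi="tests/data/test.config",
                        ingroup_mrca=None,
                        shared_blast_folder=None)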