def standard_run(study_id, tree_id, seqaln, mattype, workdir, configfi):
    """Run (or resume) a PhyScraper analysis seeded from an Open Tree study.

    Args:
        study_id: Open Tree study identifier (e.g. "pg_873").
        tree_id: identifier of the tree within that study.
        seqaln: path to the input alignment file.
        mattype: schema name of the alignment (e.g. "fasta").
        workdir: directory where checkpoints and outputs are stored.
        configfi: path to the physcraper config file.

    If ``workdir/scrape.p`` exists, the pickled scrape object is reloaded
    and the update loop resumes; otherwise a fresh Alignment-Tree-Taxa
    object is built from the phylesystem and blasted once before looping.
    """
    if os.path.isfile("{}/scrape.p".format(workdir)):
        # Resume from the pickled checkpoint of a previous run.
        # (message fixed: was "Readloading from pickled scrapefile" with no newline)
        sys.stdout.write("Reloading from pickled scrapefile\n")
        scraper = pickle.load(open("{}/scrape.p".format(workdir), 'rb'))
        scraper.repeat = 1  # force at least one more update cycle
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # Read the config file into a configuration object.
        conf = ConfigObj(configfi)
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate a linked Alignment-Tree-Taxa object.
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc)
        # Prune sequences below a certain length threshold.  This is
        # particularly important when using loci that have been
        # de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled()
        # Mapping identifiers between OpenTree and NCBI requires an
        # identifier dict object.  Fixed: this used to be hard-coded to
        # workdir="example"; use the run's own workdir like the other
        # wrapper variants in this project do.
        ids = IdDicts(conf, workdir=workdir)
        # Combine the data, the ids, and the configuration into a single
        # physcraper scrape object and run one seed analysis round.
        scraper = PhyscraperScrape(data_obj, ids, conf)
        scraper.run_blast()
        scraper.read_blast()
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    # Keep extending the alignment until a round adds no new sequences
    # (scraper.repeat is cleared by the scraper itself).
    while scraper.repeat == 1:
        scraper.run_blast()
        scraper.read_blast()
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
def test_0():
    """Build (or reload) a precooked OToL scraper fixture for later tests.

    NOTE(review): relies on module-level fixtures (configfi, absworkdir,
    seqaln, mattype, study_id, tree_id, workdir) defined elsewhere in the
    test module -- confirm they are in scope before running.
    """
    if os.path.isfile("tests/data/precooked/otol_scraper.p"):
        # Fast path: reload previously pickled objects instead of
        # rebuilding them from the phylesystem / NCBI.
        # physcraper.debug(os.getcwd())
        conf = physcraper.ConfigObj(configfi, interactive=False)
        # physcraper.debug("conf")
        conf.unmapped = 'keep'
        # physcraper.debug("set unmapped")
        data_obj = pickle.load(
            open("tests/data/precooked/otol_tiny_dataobj.p", 'rb'))
        data_obj.workdir = absworkdir
        # physcraper.debug("dataobj loaded")
        ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
        ids.acc_ncbi_dict = pickle.load(
            open("tests/data/precooked/otol_tiny_gi_map.p", "rb"))
        # physcraper.debug("ids loaded")
        scraper = pickle.load(open("tests/data/precooked/otol_scraper.p", "rb"))
        # physcraper.debug("scraper loaded")
        # scraper2 = pickle.load(open("tests/data/precooked/otol_scraper.p", "rb"))
        num_keep = len(scraper.data.aln.taxon_namespace)
        # physcraper.debug('num_keep')
        # physcraper.debug(num_keep)
        # except:
    else:
        # Slow path: rebuild everything from the phylesystem and pickle the
        # intermediate objects so the next run can take the fast path above.
        sys.stdout.write("\n\n No files present\n\n")
        conf = physcraper.ConfigObj(configfi)
        conf.unmapped = 'keep'
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        data_obj = physcraper.generate_ATT_from_phylesystem(
            aln=aln,
            workdir=workdir,
            study_id=study_id,
            tree_id=tree_id,
            phylesystem_loc=conf.phylesystem_loc)
        # physcraper.debug(len(data_obj.aln.taxon_namespace))
        pickle.dump(data_obj, open("tests/data/precooked/otol_tiny_dataobj.p", "wb"))
        ids = physcraper.IdDicts(conf, workdir=workdir)
        # physcraper.debug(os.getcwd())
        pickle.dump(ids.acc_ncbi_dict, open("tests/data/precooked/otol_tiny_gi_map.p", "wb"))
        data_obj.write_files()
        scraper = physcraper.PhyscraperScrape(data_obj, ids)
        # physcraper.debug(len(scraper.data.aln.taxon_namespace))
        # physcraper.debug("scraper obj made")
        pickle.dump(scraper.config, open("tests/data/precooked/otol_conf.p", "wb"))
        pickle.dump(scraper, open("tests/data/precooked/otol_scraper.p", "wb"))
        # Number of taxa present before any blast-driven additions.
        num_keep = len(scraper.data.aln.taxon_namespace)
def test_generate_ATT_from_phylesystem():
    """Check that an Alignment-Tree-Taxa object can be built from phylesystem.

    Fixes over the previous version: the dead first ``seqaln`` assignment
    (immediately overwritten) is removed, and the no-op comparison
    ``data_obj == True`` is replaced by a real assertion.
    """
    study_id = "pg_873"
    tree_id = "tree1679"
    seqaln = "tests/data/minitest.fas"
    mattype = "fasta"
    workdir = "tests/output/opentree"
    configfi = "tests/data/remotencbi.config"
    sys.stdout.write("\nTesting 'generate_ATT_from_files (fromfile.py)'\n")
    conf = physcraper.ConfigObj(configfi, interactive=False)
    aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
    data_obj = physcraper.generate_ATT_from_phylesystem(aln=aln,
                                                        workdir=workdir,
                                                        config_obj=conf,
                                                        study_id=study_id,
                                                        tree_id=tree_id)
    # The builder must return a usable object, not None.
    assert data_obj is not None
def test_opentree():
    """Run one round of the opentree scrape setup and verify the ATT type."""
    # OpenTree phylesystem identifiers for the study/tree pair under test.
    study_id = "pg_873"
    tree_id = "tree1679"
    # Input alignment, its schema, and run locations.
    seqaln = "tests/data/minitest.fas"
    mattype = "fasta"
    workdir = "tests/output/opentree"
    configfi = "tests/data/remotencbi.config"
    sys.stdout.write("\nTesting 'opentree scrape (1 round)'\n")
    conf = physcraper.ConfigObj(configfi, interactive=False)
    matrix = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
    att = physcraper.generate_ATT_from_phylesystem(
        aln=matrix,
        workdir=workdir,
        config_obj=conf,
        study_id=study_id,
        tree_id=tree_id,
        phylesystem_loc=conf.phylesystem_loc)
    assert isinstance(att, AlignTreeTax)
tree_id = "tree1679" seqaln = "tests/data/minitest.fas" mattype = "fasta" workdir = "tests/output/opentree" configfi = "tests/data/remotencbi.config" sys.stdout.write("\nTesting 'opentree scrape (1 round)'\n") conf = physcraper.ConfigObj(configfi, interactive=False) print "1. {}".format(conf.email) aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype) data_obj = physcraper.generate_ATT_from_phylesystem(aln=aln, workdir=workdir, study_id = study_id, tree_id = tree_id, phylesystem_loc = conf.phylesystem_loc) ids = physcraper.IdDicts(conf, workdir=workdir) print "3. {}".format(ids.config.email) data_obj.prune_short() assert len(data_obj.aln) == 9 data_obj.write_files() try:
aln = dataset.char_matrices[0] #Write it out to file, os we have the 'before' alignment aln.write(path="{}{}.aln".format(study_id, tree_id), schema="nexus") # If we are using an alinment we already wrote to file earlier we can use this #aln = dendropy.DnaCharacterMatrix.get(file=open("{}{}.aln".format(study_id, tree_id)), schema="nexus", taxon_namespace=tre.taxon_namespace) tre.write(path="{}{}.tre".format(study_id, tree_id), schema="nexus") # To preserve taxon labels and relationships, #we will combine the alignement, tree and taxon information into a single data object # By using the OpenTree Phylesystem API we can get the orgininal taxon names as well as the taxon mappings data_obj = physcraper.generate_ATT_from_phylesystem(aln=aln, workdir=workdir, config_obj=conf, study_id=study_id, tree_id=tree_id) #data_obj.write_files() #json.dump(data_obj.otu_dict, open('{}/otu_dict.json'.format(workdir), 'wb')) sys.stdout.write("{} taxa in alignement and tree\n".format(len(data_obj.aln))) # We need to create a physcraper ids object to translate between ncbi and OpenTree identifiers. ids = physcraper.IdDicts(conf, workdir=workdir) # Create an 'scraper' object to get data from NCBI, align it an scraper = physcraper.PhyscraperScrape(data_obj, ids) #scraper.read_blast_wrapper()
# Minimal treebase example: pull a study's character data from TreeBASE and
# link it to the matching Open Tree study/tree.
from physcraper import get_dataset_from_treebase, generate_ATT_from_phylesystem, ConfigObj, IdDicts, PhyscraperScrape
import pickle
import sys
import os

# Open Tree identifiers for the study and tree to update.
study_id = "pg_873"
tree_id = "tree1679"
configfi = "tests/data/remotencbi.config"

conf = ConfigObj(configfi)
# Fetch the study's supplementary data (incl. character matrices) from TreeBASE.
dataset = get_dataset_from_treebase(study_id, phylesystem_loc='api')
# Use the first character matrix as the seed alignment.
aln = dataset.char_matrices[0]
# Link alignment, tree and taxon mappings into a single data object.
data_obj = generate_ATT_from_phylesystem(aln=aln,
                                         workdir='tests/output/treebase',
                                         config_obj=conf,
                                         study_id=study_id,
                                         tree_id=tree_id)
# Example run against a local config: build the ATT object from phylesystem,
# prune/write it, then perform one blast round.
study_id = "pg_873"
tree_id = "tree1679"
seqaln = "tests/data/minitest.fas"
mattype = "fasta"
configfi = "tests/local.config"

conf = ConfigObj(configfi)
aln = dendropy.DnaCharacterMatrix.get(file=open(seqaln), schema=mattype)
# NOTE(review): second positional argument "tmp" is the working directory.
data_obj = generate_ATT_from_phylesystem(aln,
                                         "tmp",
                                         study_id=study_id,
                                         tree_id=tree_id,
                                         phylesystem_loc=conf.phylesystem_loc)
# Drop zero-length/short sequences, then write intermediate files.
data_obj.prune_short()
data_obj.write_files()
data_obj.write_labelled()
ids = IdDicts(conf, "tmp")
scraper = PhyscraperScrape(data_obj, ids, conf)
# One blast round: search, parse, deduplicate.
scraper.run_blast()
scraper.read_blast()
scraper.remove_identical_seqs()
def standard_run(study_id, tree_id, seqaln, mattype, workdir, configfi,
                 ingroup_mrca=None, shared_blast_folder=None):
    """looks for a json file to continue run, or builds and runs
    new analysis for as long as new seqs are found

    This is the wrapper function to start a PhyScraper run with tree and
    alignment ids from Open Tree of Life.
    You need:
         seqaln = ID of alignment file
         mattype = the format name of you alignment
         trfn = Id of phylogeny to update
         workdir = define where your analysis files shall be stored
         configfi = path to your config file
         ingroup_mrca = define the mrca, by supplying the Open Tree of Life
            identifier of the clade of interest
         shared_blast_folder = not necessary, if you want to share blast
            searches across runs (see documentation), give the path to the
            folder with the shared runs.
    """
    debug("Debugging mode is on")
    # The config object is needed in both branches below and for
    # conf.id_pickle later on, so build it exactly once here.  (The previous
    # version constructed it a second, redundant time inside the else-branch.)
    conf = ConfigObj(configfi, interactive=False)
    if os.path.isfile("{}/att_checkpoint.p".format(workdir)):
        # Resume: reload the Alignment-Tree-Taxa object from its checkpoint.
        sys.stdout.write("Reloading data object from pickle file\n")
        data_obj = pickle.load(open("{}/att_checkpoint.p".format(workdir), "rb"))
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate a linked Alignment-Tree-Taxa object.
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc,
                                                 ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold.  This is
        # particularly important when using loci that have been
        # de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
    # Mapping identifiers between OpenTree and NCBI requires an identifier
    # dict object; reuse a pickled one when available.
    if os.path.isfile(conf.id_pickle):
        sys.stdout.write("Reloading id dicts from {}\n".format(conf.id_pickle))
        ids = pickle.load(open(conf.id_pickle, "rb"))
    else:
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        ids.dump()
    # Now combine the data, the ids, and the configuration into a single
    # physcraper scrape object.
    scraper = PhyscraperScrape(data_obj, ids)
    # Run the analyses: one seed round, then repeat while new seqs appear.
    if shared_blast_folder:
        scraper.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None  # normalize falsy values for read_blast_wrapper
    scraper.run_blast_wrapper(delay=14)
    scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    # scraper.write_otu_info()
    return scraper
def filter_OTOL(study_id,
                tree_id,
                seqaln,
                workdir,
                configfi,
                threshold,
                selectby="blast",
                downtorank=None,
                blacklist=None,
                add_unpubl_seq=None,  # path to local seq
                id_to_spn_addseq_json=None,
                ingroup_mrca=None,
                shared_blast_folder=None):
    """looks for pickeled file to continue run, or builds and runs new
    analysis for as long as new seqs are found.

    This uses the FilterBlast subclass to be able to filter the blast output.

    NOTE(review): indentation below is reconstructed from a collapsed
    source; in particular, which statements sit inside the first-pass
    else-branch vs. at function level should be confirmed against the
    original file.
    """
    debug("Debugging mode is on")
    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        # Resume from a pickled FilterBlast checkpoint.
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), 'rb'))
        filteredScrape.repeat = 1  # force at least one more update cycle
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=True)
        # Generate an linked Alignment-Tree-Taxa object.
        # NOTE(review): other wrappers pass a DnaCharacterMatrix as the first
        # argument; here `seqaln` (a path string) is passed positionally --
        # confirm generate_ATT_from_phylesystem accepts it in this code path.
        data_obj = generate_ATT_from_phylesystem(seqaln,
                                                 workdir,
                                                 study_id,
                                                 tree_id,
                                                 phylesystem_loc='api',
                                                 ingroup_mrca=ingroup_mrca)
        # Prune sequnces below a certain length threshold.  This is
        # particularly important when using loci that have been
        # de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir, mrca=ingroup_mrca)
        # Now combine the data, the ids, and the configuration into a single
        # physcraper scrape object.
        filteredScrape = FilterBlast(data_obj, ids)
        filteredScrape.add_setting_to_self(downtorank, threshold)
        filteredScrape.blacklist = blacklist
        if add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.unpublished = True
            filteredScrape.write_unpubl_blastdb(add_unpubl_seq)
            filteredScrape.run_blast_wrapper(delay=14)
            filteredScrape.data.local_otu_json = id_to_spn_addseq_json
            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            # Only the first round uses the local database.
            filteredScrape.unpublished = False
        else:
            sys.stdout.write("BLASTing input sequences\n")
            filteredScrape.run_blast_wrapper(delay=14)
            filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            # Filter the new sequences down to at most `threshold` per taxon.
            if threshold is not None:
                filteredScrape.sp_dict(downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
                filteredScrape.replace_new_seq()
            sys.stdout.write("calculate the phylogeny\n")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.dump()
    # Keep blasting/filtering until a round adds no new sequences.
    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        sys.stdout.write("BLASTing input sequences\n")
        filteredScrape.run_blast_wrapper(delay=14)
        filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        # String reassembled from a line-break artifact in the collapsed source.
        sys.stdout.write("Filter the sequences\n")
        if threshold is not None:
            filteredScrape.sp_dict(downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
            filteredScrape.replace_new_seq()
        filteredScrape.data.prune_short(0.75)
        sys.stdout.write("calculate the phylogeny\n")
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
    filteredScrape.write_otu_info(downtorank)
    return filteredScrape