def PS_standard_run(data_obj, ids, shared_blast_folder):
    """Standard mode for a Physcraper run: update the alignment and tree
    for as long as new sequences are found; no filtering.

    Resumes from ``scrape_checkpoint.p`` in the working directory if one
    exists, otherwise starts a fresh scrape from the supplied objects.

    :param data_obj: ATT object
    :param ids: IdDicts object
    :param shared_blast_folder: path to folder for shared blast runs, or None
    :return: the PhyscraperScrape object after the final iteration
    """
    checkpoint = "{}/scrape_checkpoint.p".format(data_obj.workdir)
    if os.path.isfile(checkpoint):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        # NOTE(review): pickle.load assumes the checkpoint file is trusted.
        scraper = pickle.load(open(checkpoint, 'rb'))
        scraper.repeat = 1  # force at least one more update cycle after resume
    else:
        # BUGFIX: the original passed an undefined name `ingroup_mrca` as a
        # third argument, raising NameError on any fresh (non-checkpointed)
        # run; the other wrappers in this file construct the scraper with
        # (data_obj, ids) only.
        scraper = PhyscraperScrape(data_obj, ids)
        # run the first analysis cycle
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper()
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
        scraper.dump("scrape_checkpoint.p")
    # keep cycling while the previous round added new sequences
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper()
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
        scraper.dump()  # presumably defaults to the same checkpoint name — TODO confirm
    write_out_files(scraper)
    writeinfofiles.get_additional_GB_info(scraper)
    return scraper
def test_add_local():
    """Check that running a scrape with local (unpublished) sequences
    produces at least one otu_dict entry tagged '^ncbi:title' == "unpublished".

    Relies on module-level fixtures: absworkdir, conf, workdir,
    otu_jsonfi_local, id_to_spn_addseq, blacklist, add_local_seq.
    """
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    if not os.path.exists(workdir):  # was: redundant "{}".format(workdir)
        os.makedirs(workdir)
    # reuse a previously generated otu json if present, otherwise build it
    if os.path.exists(otu_jsonfi_local):
        otu_json_local = json.load(open(otu_jsonfi_local))
    else:
        otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
        json.dump(otu_json_local, open(otu_jsonfi_local, "w"))
    # Now combine the data, the ids, and the configuration into a single
    # physcraper scrape object
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist
    if add_local_seq is not None:
        filteredScrape.unpublished = True
    if filteredScrape.unpublished is True:  # use unpublished data
        filteredScrape.data.unpubl_otu_json = otu_json_local
        filteredScrape.write_unpubl_blastdb(add_local_seq)
    filteredScrape.run_blast_wrapper()
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()
    # IDIOM FIX: any() over the manual flag-and-break loop, .get() over the
    # double `'k' in d.keys()` lookup, and a plain assert instead of `== True`.
    assert any(
        rec.get('^ncbi:title') == "unpublished"
        for rec in filteredScrape.data.otu_dict.values()
    )
def standard_run(study_id, tree_id, seqaln, mattype, workdir, configfi,
                 ingroup_mrca=None, shared_blast_folder=None):
    """Looks for a json file to continue a run, or builds and runs a new
    analysis for as long as new seqs are found.

    This is the wrapper function to start a PhyScraper run with tree and
    alignment ids from Open Tree of Life.
    You need:
         seqaln = ID of alignment file
         mattype = the format name of your alignment
         trfn = Id of phylogeny to update
         workdir = define where your analysis files shall be stored
         configfi = path to your config file
         ingroup_mrca = define the mrca, by supplying the Open Tree of Life
            identifier of the clade of interest
         shared_blast_folder = not necessary, if you want to share blast
            searches across runs (see documentation), give the path to the
            folder with the shared runs.
    """
    debug("Debugging mode is on")
    # read the config file into a configuration object — done once; the
    # original redundantly re-created this inside the else branch below
    conf = ConfigObj(configfi, interactive=False)
    if os.path.isfile("{}/att_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading data object from pickle file\n")
        data_obj = pickle.load(open("{}/att_checkpoint.p".format(workdir), "rb"))
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc,
                                                 ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold.  This is
        # particularly important when using loci that have been
        # de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
    # Mapping identifiers between OpenTree and NCBI requires an identifier
    # dict object
    if os.path.isfile(conf.id_pickle):
        sys.stdout.write("Reloading id dicts from {}\n".format(conf.id_pickle))
        ids = pickle.load(open(conf.id_pickle, "rb"))
    else:
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        ids.dump()
    # Now combine the data, the ids, and the configuration into a single
    # physcraper scrape object
    scraper = PhyscraperScrape(data_obj, ids)
    # run the analyses
    if shared_blast_folder:
        scraper.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    scraper.run_blast_wrapper(delay=14)
    scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()
    # keep cycling while the previous round added new sequences
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    return scraper
def own_data_run(seqaln, mattype, trfn, schema_trf, workdir, sp_info_jsonfi,
                 configfi, ingroup_mrca=None, shared_blast_folder=None):
    """This is the wrapper function to start a PhyScraper run with your own data.
    You need:
         seqaln = path to sequence alignment file
         mattype = the format name of your alignment
         trfn = path to file with the phylogeny to update
         schema_trf = format type of your phylogeny
         workdir = define where your analysis files shall be stored
         sp_info_jsonfi = a json file which has the otu_dict stored, which is
            generated by the OtuJsonDict function (usually, just leave it like
            it is in the example scripts.).
         configfi = path to your config file
         ingroup_mrca = not necessary, if you want to limit your run to a
            certain clade, give the OpenTree ID here, can be obtained by
            running: python scripts/get_ott.py ingroup_name
         shared_blast_folder = not necessary, if you want to share blast
            searches across runs (see documentation), give the path to the
            folder with the shared runs.
    """
    debug("Debugging mode is on")
    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: ATT\n")
        scraper = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), "rb"))
        scraper.repeat = 1  # force at least one more update cycle after resume
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=False)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln,
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=sp_info_jsonfi,
                                           ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        sys.stdout.write("setting up ID dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        scraper = PhyscraperScrape(data_obj, ids)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        # run the analyses
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    # keep cycling while the previous round added new sequences
    while scraper.repeat == 1:
        # BUGFIX: assign the shared blast folder BEFORE running blast.  The
        # original called run_blast_wrapper() first, so on a checkpoint-resumed
        # run the first blast cycle wrote to the default subdir and ignored
        # shared_blast_folder — inconsistent with standard_run's ordering.
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    return 1