def get_short_seq_from_concat(self, percentage=0.37):
    """Finds short sequences; all sequences below a certain threshold will be removed,
    to avoid having really low coverage in the aln. Default = 0.37.

    Note: the percentage is a bit misleading. The cutoff is 37% of the length of the whole
    concatenated alignment, but the sequence lengths are calculated without gaps.
    The default is so low because taxa that have only a single locus, and where that locus
    is not the longest one in the aln, should still be kept.
    """
    physcraper.debug("get_short_seq_from_concat")
    seq_len = {}
    num_tax = 0
    for tax, seq in self.concatenated_aln.items():
        seq = seq.symbols_as_string().replace("-", "").replace("?", "")
        seq_len[tax] = len(seq)
        num_tax += 1
    total_len = 0
    for tax, seq in self.concatenated_aln.items():
        total_len = len(seq)
        break
    assert total_len != 0
    min_len = total_len * percentage
    prune_shortest = []
    for tax, len_seq in seq_len.items():
        if len_seq < min_len:
            prune_shortest.append(tax)
    self.short_concat_seq = prune_shortest
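# Illustration of the cutoff arithmetic above (a sketch with made-up numbers, not output
# from a real run): for a concatenated alignment of 1500 columns and the default
# percentage of 0.37, min_len = 1500 * 0.37 = 555, so any taxon whose sequence has fewer
# than 555 non-gap characters ends up in self.short_concat_seq.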
def make_sp_gene_dict(self):
    """Builds the dictionaries that are later used to construct the dendropy alignments.
    """
    physcraper.debug("make_sp_gene_dict")
    if self.concatfile is not None:
        self.user_defined_concat()
    else:
        sp_to_keep = self.sp_to_keep()
        self.tmp_dict = deepcopy(self.sp_acc_comb)
        while len(self.tmp_dict.keys()) >= 1:
            del_acc = {}
            for spn in self.tmp_dict.keys():
                sp_to_keep_list = sp_to_keep.keys()
                if spn.replace(" ", "_") in sp_to_keep_list:
                    tmp_gene = deepcopy(self.genes_present)
                    for gene in self.tmp_dict[spn]:
                        tmp_gene.remove(gene)
                        del_acc = self.select_rnd_seq(spn, gene, del_acc)
                    for item in tmp_gene:
                        self.make_empty_seq(spn, item)
                    self.rm_rnd_sp(del_acc)
                    del self.tmp_dict[spn]
                else:
                    for gene in self.tmp_dict[spn]:
                        del_acc = self.select_rnd_seq(spn, gene, del_acc)
                    self.rm_rnd_sp(del_acc)
                self.rm_empty_spn_entries(del_acc)
    self.rename_drop_tips()
def sp_seq_counter(self):
    """Counts how many sequences there are per species and per gene; used by sp_to_keep.

    Note: has test

    :return: builds self.sp_counter
    """
    physcraper.debug("sp_seq_counter")
    for spn in self.sp_acc_comb:
        tmp_gene = deepcopy(self.genes_present)
        for gene in self.sp_acc_comb[spn]:
            tmp_gene.remove(gene)
            spn_new = spn.replace(" ", "_")
            if spn_new in self.sp_counter:
                self.sp_counter[spn_new][gene] = len(self.sp_acc_comb[spn][gene])
            else:
                self.sp_counter[spn_new] = {gene: len(self.sp_acc_comb[spn][gene])}
        for item in tmp_gene:
            if spn_new in self.sp_counter:
                self.sp_counter[spn_new][item] = 0
            else:
                self.sp_counter[spn_new] = {item: 0}
    physcraper.debug(self.sp_counter)
def est_full_tree(self):
    """Full RAxML run, using the placement tree as the starting tree.
    """
    physcraper.debug("run full tree")
    cwd = os.getcwd()
    os.chdir(self.workdir)
    if os.path.exists("place_resolve.tre"):
        starting_fn = "place_resolve.tre"
    else:
        starting_fn = "starting_red.tre"
    if os.path.exists("concat_red.fasta.reduced"):
        aln = "concat_red.fasta.reduced"
        partition = "partition.reduced"
    else:
        aln = "concat_red.fasta"
        partition = "partition"
    try:
        num_threads = int(self.config.num_threads)
        print(num_threads)
        subprocess.call([
            "raxmlHPC-PTHREADS", "-T", "{}".format(num_threads),
            "-m", "GTRCAT",
            "-s", aln,
            "--print-identical-sequences",
            "-t", "{}".format(starting_fn),
            "-p", "1",
            "-q", partition,
            "-n", "concat"
        ])
    except:  # fall back to the serial RAxML binary if the threaded call fails
        subprocess.call([
            "raxmlHPC",
            "-m", "GTRCAT",
            "-s", aln,
            "--print-identical-sequences",
            "-t", "{}".format(starting_fn),
            "-p", "1",
            "-q", partition,
            "-n", "concat"
        ])
    os.chdir(cwd)
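# For reference, the threaded branch above is equivalent to running, from inside
# self.workdir (a sketch; the alignment and starting tree file names depend on which
# files exist for the run):
#
#   raxmlHPC-PTHREADS -T <num_threads> -m GTRCAT -s concat_red.fasta \
#       --print-identical-sequences -t place_resolve.tre -p 1 -q partition -n concat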
def make_alns_dict(self):
    """Makes dendropy alignments out of the dict self.comb_seq for all genes.
    """
    physcraper.debug("make_alns_dict")
    firstelement = True
    count = 0
    for gene in self.comb_seq.keys():
        if count == 0:
            len1 = len(self.comb_seq[gene].keys())
            len2 = len1
            count = 1
        else:
            len2 = len(self.comb_seq[gene].keys())
        assert len1 == len2
    for gene in self.comb_seq.keys():
        if firstelement:
            aln1 = DnaCharacterMatrix.from_dict(self.comb_seq[gene])
            firstelement = False
            self.aln_all[count] = aln1
            aln1.write(path="{}/aln_0.fas".format(self.workdir), schema="fasta")
        else:
            aln = DnaCharacterMatrix.from_dict(
                self.comb_seq[gene], taxon_namespace=aln1.taxon_namespace)
            self.aln_all[count] = aln
            aln.write(path="{}/aln_{}.fas".format(self.workdir, count), schema="fasta")
        count += 1
def select_rnd_seq(self, spn, gene, del_acc):
    """Select a random seq from spn and gene to combine it with a random other one
    from another gene, but the same spn.

    Is used if the user does not give a concatenation input file.

    Note: has test

    :param spn: taxon name
    :param gene: gene name
    :param del_acc: dictionary that contains gene name: dict(spn: concat_id of random seq)
    :return: del_acc
    """
    physcraper.debug("select_rnd_seq")
    count = 2
    random_gen = random.choice(list(self.tmp_dict[spn][gene]))
    self.sp_acc_comb[spn][gene][random_gen]["concat:status"] = "used in concat"
    seq = str(self.tmp_dict[spn][gene][random_gen]["seq"])
    spn_ = spn.replace(" ", "_")
    spn_ = spn_.replace(".", "").replace("'", "")
    if gene in self.comb_seq.keys():
        if spn_ not in self.comb_seq[gene].keys():
            self.comb_seq[gene][spn_] = seq
            if gene in self.comb_acc:
                self.comb_acc[gene][spn_] = random_gen
            else:
                self.comb_acc[gene] = {spn_: random_gen}
            if gene in del_acc.keys():
                if spn_ not in del_acc[gene].keys():
                    del_acc[gene][spn] = random_gen
            else:
                del_acc[gene] = {spn: random_gen}
        else:
            spn_new = "{}_{}".format(spn_, count)
            while spn_new in self.comb_seq[gene].keys():
                count += 1
                spn_new = "{}_{}".format(spn_, count)
            self.comb_seq[gene][spn_new] = seq
            self.comb_acc[gene][spn_new] = random_gen
            self.sp_acc_comb[spn][gene][random_gen]["new tipname"] = spn_new
            if gene in del_acc.keys():
                if spn_ not in del_acc[gene].keys():
                    del_acc[gene][spn] = random_gen
                else:
                    del_acc[gene][spn] = random_gen
            else:
                del_acc[gene] = {spn: random_gen}
    else:
        self.comb_seq[gene] = {spn_: seq}
        self.comb_acc[gene] = {spn_: random_gen}
        if gene in del_acc.keys():
            if spn_ not in del_acc[gene].keys():
                del_acc[gene][spn] = random_gen
            else:
                del_acc[gene] = {spn: random_gen}
        else:
            del_acc[gene] = {spn: random_gen}
    self.otu_to_spn(spn, gene, del_acc[gene][spn])
    return del_acc
def rm_rnd_sp(self, del_acc):
    """Removes the randomly selected seq from the tmp_dict, so that it cannot be selected again.
    """
    physcraper.debug("rm_rnd sp")
    for spn2 in self.tmp_dict:
        for gene2 in self.tmp_dict[spn2]:
            if gene2 in del_acc:
                if spn2 in del_acc[gene2]:
                    key = del_acc[gene2][spn2]
                    if key in self.tmp_dict[spn2][gene2]:
                        del self.tmp_dict[spn2][gene2][key]
def combine(self):
    """Combines several PhyScraper objects to make a concatenated run dict.

    Is a wrapper function around make_concat_id_dict(). It produces the parameters
    needed for that function.
    """
    physcraper.debug("combine")
    self.num_of_genes = len(self.single_runs)
    concat_id_counter = 1
    for genename in self.single_runs:
        self.genes_present.append(genename)
        for otu in self.single_runs[genename].data.aln.taxon_namespace:
            concat_id = "concat_{}".format(concat_id_counter)
            self.make_concat_id_dict(otu.label, genename, concat_id)
            concat_id_counter += 1
    return
def load_single_genes(self, workdir, pickle_fn, genename):
    """Load PhyScraper class objects and make a single dict per run.

    Removes abandoned nodes first.

    :param workdir: directory of single gene run
    :param pickle_fn: path to pickled file of the Physcraper run
    :param genename: string, name for locus provided by user
    :return: self.single_runs
    """
    physcraper.debug("load_single_genes: {}".format(genename))
    scrape = pickle.load(open("{}/{}".format(workdir, pickle_fn), "rb"))
    scrape = remove_aln_tre_leaf(scrape)
    self.single_runs[genename] = deepcopy(scrape)
    return
def rename_drop_tips(self):
    """Removes tips from the starting tree that are not present in the concatenated
    alignment, and renames the tips that are present.
    """
    physcraper.debug("rename_drop_tips")
    # leaf.taxon is never in concat_tips
    for leaf in self.tre_as_start.leaf_nodes():
        if leaf.taxon.label not in self.concat_tips.keys():
            self.tre_as_start.prune_taxa([leaf])
            self.tre_as_start.prune_taxa_with_labels([leaf.label])
            self.tre_as_start.prune_taxa_with_labels([leaf])
            self.tre_as_start.prune_taxa_with_labels([leaf.taxon.label])
            self.tre_as_start.taxon_namespace.remove_taxon_label(leaf.taxon.label)
        else:
            for otu in self.concat_tips.keys():
                if otu == leaf.taxon.label:
                    leaf.taxon.label = self.concat_tips[otu]
def rm_empty_spn_entries(self, del_acc):
    """Removes keys from the tmp dict if the key/sp has no value anymore.

    Helper function.
    """
    physcraper.debug("rm_empty_spn_entries")
    del_sp = None
    for spn2 in self.tmp_dict:
        for gene2 in self.tmp_dict[spn2]:
            if gene2 in del_acc:
                if spn2 in del_acc[gene2]:
                    if len(self.tmp_dict[spn2][gene2]) == 0:
                        del_sp = spn2
    if del_sp is not None:
        for item in self.sp_acc_comb[del_sp]:
            for otu in self.sp_acc_comb[del_sp][item]:
                if self.sp_acc_comb[del_sp][item][otu]["concat:status"] != "used in concat":
                    self.sp_acc_comb[del_sp][item][otu]["concat:status"] = \
                        "deleted, because not enough seq are present"
        del self.tmp_dict[del_sp]
def filter_data_run(seqaln,
                    mattype,
                    trfn,
                    schema_trf,
                    workdir,
                    threshold,
                    id_to_spn,
                    configfi,
                    selectby="blast",
                    downtorank=None,
                    blacklist=None,
                    add_unpubl_seq=None,
                    id_to_spn_addseq_json=None,
                    ingroup_mrca=None,
                    shared_blast_folder=None):
    """Looks for a pickled file to continue a run, or builds and runs a new analysis
    for as long as new seqs are found.

    This uses the FilterBlast subclass to be able to filter the blast output.
    """
    license_print()
    debug("Debugging mode is on")
    print(workdir)
    print(os.path.exists(workdir))
    if not os.path.exists(workdir):
        print("make wd")
        os.makedirs(workdir)
    conf = ConfigObj(configfi)
    ids = load_ids_obj(conf, workdir)
    make_otujsondict(id_to_spn, workdir, ids)
    # make json file for unpublished database
    if add_unpubl_seq is not None:
        make_otujsondict(id_to_spn_addseq_json, workdir, ids, local=True)
    # Generate a linked Alignment-Tree-Taxa object
    data_obj = load_own_data(conf, seqaln, mattype, trfn, schema_trf, workdir, ingroup_mrca)
    filteredScrape = PS_filter_run(add_unpubl_seq, blacklist, data_obj, downtorank,
                                   id_to_spn_addseq_json, ids, selectby,
                                   shared_blast_folder, threshold, ingroup_mrca)
    save_copy_code(workdir)
    return filteredScrape
def filter_OTOL(study_id,
                tree_id,
                seqaln,
                mattype,
                workdir,
                configfi,
                threshold,
                selectby="blast",
                downtorank=None,
                blacklist=None,
                add_unpubl_seq=None,  # path to local seq
                id_to_spn_addseq_json=None,
                ingroup_mrca=None,
                shared_blast_folder=None):
    """Looks for a pickled file to continue a run, or builds and runs a new analysis
    for as long as new seqs are found.

    This uses the FilterBlast subclass to be able to filter the blast output,
    using data from OToL.
    """
    license_print()
    debug("Debugging mode is on")
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    # read the config file into a configuration object
    conf = ConfigObj(configfi)
    # Generate a linked Alignment-Tree-Taxa object
    data_obj = load_otol_data(conf, ingroup_mrca, mattype, seqaln, study_id, tree_id, workdir)
    ids = load_ids_obj(conf, workdir)
    # make json file for unpublished database
    if add_unpubl_seq is not None:
        make_otujsondict(id_to_spn_addseq_json, workdir, ids, local=True)
    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PS_filter_run(add_unpubl_seq, blacklist, data_obj, downtorank,
                                   id_to_spn_addseq_json, ids, selectby,
                                   shared_blast_folder, threshold, ingroup_mrca)
    save_copy_code(workdir)
    return filteredScrape
def concatenate_alns(self):
    """Concatenate all alns into one aln.
    """
    physcraper.debug("concat alns")
    count = 0
    for gene in self.aln_all:
        if count == 0:
            aln1 = self.aln_all[gene]
            aln1.write(path="{}/aln1.fas".format(self.workdir), schema="fasta")
            count = 1
        else:
            aln2 = self.aln_all[gene]
            count += 1
            aln2.write(path="{}/aln{}.fas".format(self.workdir, count), schema="fasta")
            assert aln1.taxon_namespace == aln2.taxon_namespace
            aln1 = DnaCharacterMatrix.concatenate([aln1, aln2])
    aln1.write(path="{}/concat.fas".format(self.workdir), schema="fasta")
    self.concatenated_aln = aln1
def sp_to_keep(self):
    """Uses the sp_counter to make a list of species that should be kept in the
    concatenated alignment, because they are the only representative of the species.

    Note: has test

    :return: dictionary with taxon name and a number saying how many genes are missing
    """
    physcraper.debug("sp to keep")
    sp_to_keep = {}
    for spn in self.sp_counter:
        seq_counter = True
        not_present = 0
        for gene in self.sp_counter[spn]:
            if self.sp_counter[spn][gene] == 0:
                seq_counter = False
                not_present += 1
        if not seq_counter:
            sp_to_keep[spn] = not_present
    # physcraper.debug(sp_to_keep)
    return sp_to_keep
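# Illustration of sp_seq_counter/sp_to_keep (hypothetical counts, two loci named "ITS"
# and "matK"; not output from a real run):
#
#   self.sp_counter = {"Senecio_vulgaris": {"ITS": 3, "matK": 0},
#                      "Senecio_doria":    {"ITS": 2, "matK": 1}}
#
# sp_to_keep() would return {"Senecio_vulgaris": 1}: only species that lack data for at
# least one locus are listed, together with the number of loci they are missing.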
def own_data_run(seqaln,
                 mattype,
                 trfn,
                 schema_trf,
                 workdir,
                 id_to_spn,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """This is the wrapper function to start a PhyScraper standard run with your own data.

    You need:
        seqaln = path to sequence alignment file
        mattype = the format name of your alignment
        trfn = path to file with the phylogeny to update
        schema_trf = format type of your phylogeny
        workdir = define where your analysis files shall be stored
        id_to_spn = file that maps the tip labels to taxon names; it is used to
                    generate the otu json file (usually, just leave it like it is
                    in the example scripts)
        configfi = path to your config file
        ingroup_mrca = not necessary; if you want to limit your run to a certain clade,
                       give the OpenTree ID here, which can be obtained by running:
                       python scripts/get_ott.py ingroup_name
        shared_blast_folder = not necessary; if you want to share blast searches across
                              runs (see documentation), give the path to the folder with
                              the shared runs
    """
    license_print()
    debug("Debugging mode is on")
    if not os.path.exists(workdir):
        os.mkdir(workdir)
    conf = ConfigObj(configfi)
    ids = load_ids_obj(conf, workdir)
    make_otujsondict(id_to_spn, workdir, ids)
    data_obj = load_own_data(conf, seqaln, mattype, trfn, schema_trf, workdir, ingroup_mrca)
    # Mapping identifiers between original data and NCBI requires an identifier dict object
    # scraper = PhyscraperScrape(data_obj, ids)
    scraper = PS_standard_run(data_obj, ids, shared_blast_folder)
    save_copy_code(workdir)
    return 1
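# Minimal usage sketch for own_data_run (all paths are hypothetical placeholders;
# substitute your own files as described in the docstring above):
#
#   own_data_run(seqaln="path/to/aln.fas",
#                mattype="fasta",
#                trfn="path/to/tree.tre",
#                schema_trf="newick",
#                workdir="path/to/output",
#                id_to_spn="path/to/tiplabel_to_taxonname.csv",
#                configfi="path/to/example.config")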
def remove_short_seq(self):
    """Removes the short seqs that were found with get_short_seq and writes the reduced
    alignment and tree to file.
    """
    physcraper.debug("remove_short_seq")
    self.concatenated_aln.remove_sequences(self.short_concat_seq)
    for leaf in self.tre_as_start.leaf_nodes():
        for tax in self.short_concat_seq:
            if tax.label == leaf.taxon.label.replace(" ", "_"):
                self.tre_as_start.prune_taxa([leaf])
                self.tre_as_start.prune_taxa_with_labels([leaf.label])
                self.tre_as_start.prune_taxa_with_labels([leaf])
                self.tre_as_start.prune_taxa_with_labels([leaf.taxon.label])
                self.tre_as_start.taxon_namespace.remove_taxon_label(leaf.taxon.label)
            else:
                leaf.taxon.label = leaf.taxon.label.replace(" ", "_")
    tre_as_start_str = self.tre_as_start.as_string(
        schema="newick",
        # preserve_underscores=True,
        unquoted_underscores=True,
        suppress_rooting=True)
    fi = open("{}/{}".format(self.workdir, "starting_red.tre"), "w")
    fi.write(tre_as_start_str)
    fi.close()
    for tax in self.concatenated_aln.taxon_namespace:
        tax.label = tax.label.replace(" ", "_")
    self.concatenated_aln.write(path="{}/{}".format(self.workdir, "concat_red.fasta"),
                                schema="fasta")
    tre_ids = set()
    for tax in self.tre_as_start.taxon_namespace:
        tre_ids.add(tax.label)
    aln_ids = set()
    for tax in self.concatenated_aln.taxon_namespace:
        aln_ids.add(tax.label)
def write_partition(self):
    """Write the partitioning file for RAxML.
    """
    physcraper.debug("write_partition")
    count = 0
    len_gene = 0
    for gene in self.single_runs:
        for tax, seq in self.single_runs[gene].data.aln.items():
            len_gene = len(seq.symbols_as_string())
            break
        if count == 0:
            with open("{}/partition".format(self.workdir), "w") as partition:
                partition.write("DNA, {} = 1-{}\n".format(gene, len_gene))
            self.part_len = len_gene
            count = 1
        else:
            start = self.part_len + 1
            end = self.part_len + len_gene
            self.part_len = self.part_len + len_gene
            with open("{}/partition".format(self.workdir), "a") as partition:
                partition.write("DNA, {} = {}-{}\n".format(gene, start, end))
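# The resulting "partition" file follows RAxML's partition format; with two hypothetical
# loci of 700 bp ("ITS") and 1200 bp ("matK") it would read:
#
#   DNA, ITS = 1-700
#   DNA, matK = 701-1900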
def get_largest_tre(self):
    """Find the single gene tree with the most tips, which will be used as the
    starting tree for the concatenated phylogenetic reconstruction.
    """
    physcraper.debug("get_largest_tre")
    first = True
    len_all_taxa = {}
    for gene in self.single_runs:
        len_aln_taxa = len(self.single_runs[gene].data.aln.taxon_namespace)
        len_all_taxa[gene] = len_aln_taxa
    len_max = 0
    gene_max = 0
    for gene, len_item in len_all_taxa.items():
        if first:
            len_max = len_item
            gene_max = gene
            assert len_max != 0
            assert gene_max != 0
            first = False
        if len_item > len_max:
            len_max = len_item
            gene_max = gene
    self.tre_as_start = self.single_runs[gene_max].data.tre
    self.tre_start_gene = gene_max
def make_concat_id_dict(self, otu, genename, concat_id):
    """Makes a concat_id entry with all information.

    Note: has test

    :param otu: otu_id
    :param genename: name of single gene run
    :param concat_id: unique identifier in the concat class
    :return: modified self.sp_acc_comb
    """
    data = self.single_runs[genename].data.otu_dict[otu]
    seq = str(self.single_runs[genename].data.aln[otu])
    spn = None
    if "^ot:ottTaxonName" in data:
        spn = self.get_taxon_info("^ot:ottTaxonName", data)
        if spn not in self.sp_acc_comb:
            self.sp_acc_comb[spn] = {}
        if genename not in self.sp_acc_comb[spn]:
            self.sp_acc_comb[spn][genename] = {}
    elif "^user:TaxonName" in data:
        spn = self.get_taxon_info("^user:TaxonName", data)
        if spn not in self.sp_acc_comb:
            self.sp_acc_comb[spn] = {}
        if genename not in self.sp_acc_comb[spn]:
            self.sp_acc_comb[spn][genename] = {}
    else:
        # we should never get here....
        physcraper.debug("THERE IS A SERIOUS PROBLEM....")
    assert spn is not None
    if concat_id not in self.sp_acc_comb[spn][genename]:
        if "^ncbi:accession" in data:
            unique_id = data["^ncbi:accession"]
        elif u"^ot:originalLabel" in data:
            unique_id = data[u"^ot:originalLabel"]
        concat_dict = {
            "unique_id": unique_id,
            "seq": seq,
            "spn": spn,
            "original_PS_id": otu,
            "concat:status": "single run",
        }
        self.sp_acc_comb[spn][genename][concat_id] = concat_dict
    else:
        physcraper.debug(
            "something goes wrong, you should not try to add the same id several times....")
    if concat_dict["spn"] is None:
        # we should never get here....
        sys.stderr.write(
            "There is no species name for the seq. Do not know how to concatenate then. "
            "Please remove seq from aln: {}.".format(data["^ncbi:accession"]))
        physcraper.debug("THERE IS A SERIOUS PROBLEM....spn is none")
        spn = self.get_taxon_info("^ot:ottTaxonName", data)
        self.sp_acc_comb[spn] = self.sp_acc_comb[unique_id]
        del self.sp_acc_comb[unique_id]
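# Sketch of the entry created above (hypothetical otu id, accession, and sequence; not
# values from a real run): after make_concat_id_dict("otu376420", "ITS", "concat_12"),
# the nested dict would hold
#
#   self.sp_acc_comb["Senecio vulgaris"]["ITS"]["concat_12"] == {
#       "unique_id": "JX895340.1",
#       "seq": "ACGT...",
#       "spn": "Senecio vulgaris",
#       "original_PS_id": "otu376420",
#       "concat:status": "single run",
#   }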
def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """Looks for a json file to continue a run, or builds and runs a new analysis
    for as long as new seqs are found.

    This is the wrapper function to start a PhyScraper run with tree and alignment
    ids from Open Tree of Life.

    You need:
        study_id = OpenTree study ID of the phylogeny to update
        tree_id = OpenTree tree ID of the phylogeny within that study
        seqaln = your alignment file
        mattype = the format name of your alignment
        workdir = define where your analysis files shall be stored
        configfi = path to your config file
        ingroup_mrca = define the mrca by supplying the Open Tree of Life identifier
                       of the clade of interest
        shared_blast_folder = not necessary; if you want to share blast searches across
                              runs (see documentation), give the path to the folder with
                              the shared runs
    """
    license_print()
    debug("Debugging mode is on")
    if not os.path.exists(workdir):
        os.mkdir(workdir)
    conf = ConfigObj(configfi)
    data_obj = load_otol_data(conf, ingroup_mrca, mattype, seqaln, study_id, tree_id, workdir)
    # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
    ids = load_ids_obj(conf, workdir, ingroup_mrca)
    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    scraper = PS_standard_run(data_obj, ids, shared_blast_folder)
    save_copy_code(workdir)
    return scraper
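# Minimal usage sketch for standard_run (hypothetical identifiers and paths; the study
# and tree IDs must exist in the Open Tree of Life phylesystem):
#
#   standard_run(study_id="pg_55",
#                tree_id="tree5864",
#                seqaln="path/to/aln.fas",
#                mattype="fasta",
#                workdir="path/to/output",
#                configfi="path/to/example.config")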
def place_new_seqs(self):
    """Places the new seqs (those only found in loci that are not part of the starting
    tree) onto one of the single run trees.
    """
    physcraper.debug("place_new_seqs")
    if len(self.concatenated_aln.taxon_namespace) - len(self.short_concat_seq) > \
            len(self.tre_as_start.leaf_nodes()):
        if os.path.exists("RAxML_labelledTree.PLACE"):
            os.rename("RAxML_labelledTree.PLACE", "RAxML_labelledTreePLACE.tmp")
        cwd = os.getcwd()
        os.chdir(self.workdir)
        physcraper.debug("make place-tree")
        try:
            num_threads = int(self.config.num_threads)
            print(num_threads)
            subprocess.call([
                "raxmlHPC-PTHREADS", "-T", "{}".format(num_threads),
                "-m", "GTRCAT",
                "-f", "v",
                "-q", "partition",
                "-s", "concat_red.fasta",
                "-t", "starting_red.tre",
                "-n", "PLACE"
            ])
        except:  # fall back to the serial RAxML binary if the threaded call fails
            subprocess.call([
                "raxmlHPC",
                "-m", "GTRCAT",
                "-f", "v",
                "-q", "partition",
                "-s", "concat_red.fasta",
                "-t", "starting_red.tre",
                "-n", "PLACE"
            ])
        os.chdir(cwd)
        physcraper.debug("read place tree")
        placetre = Tree.get(path="{}/starting_red.tre".format(self.workdir),
                            schema="newick",
                            preserve_underscores=True,
                            suppress_internal_node_taxa=True,
                            suppress_leaf_node_taxa=True)
        physcraper.debug("resolve polytomies")
        placetre.resolve_polytomies()
        placetre.write(path="{}/place_resolve.tre".format(self.workdir),
                       schema="newick",
                       unquoted_underscores=True)
def filter_data_run(seqaln,
                    mattype,
                    trfn,
                    schema_trf,
                    workdir,
                    threshold,
                    spInfoDict,
                    configfi,
                    selectby="blast",
                    downtorank=None,
                    blacklist=None,
                    add_unpubl_seq=None,
                    id_to_spn_addseq_json=None,
                    ingroup_mrca=None,
                    shared_blast_folder=None):
    """Looks for a pickled file to continue a run, or builds and runs a new analysis
    for as long as new seqs are found.

    This uses the FilterBlast subclass to be able to filter the blast output.
    """
    debug("Debugging mode is on")
    # debug(shared_blast_folder)
    # debug(some)
    # if _DEBUG_MK == 1:
    #     random.seed(3269235691)
    print(workdir)
    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), 'rb'))
        filteredScrape.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=True)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln,
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=spInfoDict,
                                           ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated,
        # as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir, mrca=ingroup_mrca)
        # Now combine the data, the ids, and the configuration into a single physcraper scrape object
        filteredScrape = FilterBlast(data_obj, ids)
        filteredScrape.add_setting_to_self(downtorank, threshold)
        filteredScrape.blacklist = blacklist
        if add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.unpublished = True
            filteredScrape.write_unpubl_blastdb(add_unpubl_seq)
            filteredScrape.run_blast_wrapper(delay=14)
            print("add unpubl otu json")
            filteredScrape.data.unpubl_otu_json = id_to_spn_addseq_json
            print(filteredScrape.data.unpubl_otu_json)
            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False
        else:  # run the analysis
            sys.stdout.write("BLASTing input sequences\n")
            if shared_blast_folder:
                filteredScrape.blast_subdir = shared_blast_folder
            else:
                shared_blast_folder = None
            filteredScrape.run_blast_wrapper(delay=14)
            filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            sys.stdout.write("Filter the sequences\n")
            if threshold is not None:
                filteredScrape.sp_dict(downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
                filteredScrape.replace_new_seq()
            sys.stdout.write("Calculate the phylogeny\n")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.data.write_otus("otu_info", schema="table")
            filteredScrape.write_otu_info(downtorank)
            filteredScrape.dump()
    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        sys.stdout.write("BLASTing input sequences\n")
        if shared_blast_folder:
            filteredScrape.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        filteredScrape.run_blast_wrapper(delay=14)
        filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        sys.stdout.write("Filter the sequences\n")
        if threshold is not None:
            filteredScrape.sp_dict(downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
            filteredScrape.replace_new_seq()
        filteredScrape.data.prune_short(0.75)
        sys.stdout.write("calculate the phylogeny\n")
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
        filteredScrape.write_otu_info(downtorank)
        # print(some)
    filteredScrape.write_otu_info(downtorank)
    return filteredScrape
def add_different_rank(seqaln,
                       mattype,
                       trfn,
                       schema_trf,
                       workdir,
                       threshold,
                       id_to_spn,
                       new_confifi,
                       selectby="blast",
                       downtorank=None,
                       blacklist=None,
                       add_unpubl_seq=None,
                       id_to_spn_addseq_json=None,
                       ingroup_mrca=None,
                       shared_blast_folder=None,
                       backbone=False):
    """Looks for a pickled file to continue a run, or builds and runs a new analysis
    for as long as new seqs are found.

    This uses the FilterBlast subclass to be able to filter the blast output.
    """
    license_print()
    debug("Debugging mode is on")
    dump_fn = "add_different_rank{}_{}.run".format(ingroup_mrca, downtorank)
    # if the file does not exist, this loop was not yet run; if it exists, go to the next
    if os.path.isfile("{}/{}".format(workdir, dump_fn)):
        filteredScrape = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), 'rb'))
    else:
        assert os.path.isfile("{}/scrape_checkpoint.p".format(workdir))
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), 'rb'))
        # copy previous files to a different folder
        count = 1
        while os.path.exists("{}/update_{}".format(workdir, count)):
            count += 1
        os.mkdir("{}/update_{}".format(workdir, count))
        old_runs = "{}/update_{}".format(workdir, count)
        src_files = os.listdir(workdir)
        for file_name in src_files:
            full_file_name = os.path.join(workdir, file_name)
            if os.path.isfile(full_file_name):
                shutil.copy(full_file_name, old_runs)
        filteredScrape.repeat = 1
        conf = ConfigObj(new_confifi)  # add new config
        assert filteredScrape.config != conf
        filteredScrape.config = conf
        assert filteredScrape.config == conf
        # set new ingroup_mrca
        filteredScrape.data.mrca_ott = ingroup_mrca
        filteredScrape.mrca_ncbi = filteredScrape.ids.ott_to_ncbi[filteredScrape.data.ott_mrca]
        assert filteredScrape.data.ott_mrca == ingroup_mrca
        with open(filteredScrape.logfile, "a") as log:
            log.write("You run 'add_different_rank' with the following settings: "
                      "rank: {} and ingroup_mrca: {}. \n".format(downtorank, ingroup_mrca))
        # here the filter standard function continues...
        if backbone is True:
            filteredScrape.backbone = backbone
            filteredScrape.data.write_files(treepath="backbone.tre", alnpath="backbone.fas")
        else:
            filteredScrape.backbone = False
        # set new downtorank and numbers:
        filteredScrape.add_setting_to_self(downtorank, threshold)
        filteredScrape.blacklist = blacklist
        if add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.data.unpubl_otu_json = json.load(
                open("{}/otu_dict_localseq.json".format(workdir)))
            filteredScrape.write_unpubl_blastdb(add_unpubl_seq)
            filteredScrape.run_blast_wrapper()
            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False
            if backbone:
                filteredScrape.repeat = 1
        else:
            sys.stdout.write("BLASTing input sequences\n")
            if shared_blast_folder:
                filteredScrape.blast_subdir = shared_blast_folder
            else:
                shared_blast_folder = None
            filteredScrape.run_blast_wrapper()
            filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            sys.stdout.write("Filter the sequences\n")
            if threshold is not None:
                if len(filteredScrape.new_seqs_otu_id) > 0:
                    filteredScrape.sp_dict(downtorank)
                    filteredScrape.make_sp_seq_dict()
                    filteredScrape.how_many_sp_to_keep(selectby=selectby)
                    filteredScrape.replace_new_seq()
            sys.stdout.write("Calculate the phylogeny\n")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.dump()
            filteredScrape.data.write_otus("otu_info", schema="table")
            write_out_files(filteredScrape, downtorank)
        if backbone:
            # set back to normal - only used to reassess formerly discarded seq in first round
            filteredScrape.repeat = 0
        while filteredScrape.repeat == 1:
            filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
            filteredScrape.data.write_otus("otu_info", schema="table")
            sys.stdout.write("BLASTing input sequences\n")
            if shared_blast_folder:
                filteredScrape.blast_subdir = shared_blast_folder
            else:
                shared_blast_folder = None
            filteredScrape.run_blast_wrapper()
            filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            sys.stdout.write("Filter the sequences\n")
            if threshold is not None:
                if len(filteredScrape.new_seqs_otu_id) > 0:
                    filteredScrape.sp_dict(downtorank)
                    filteredScrape.make_sp_seq_dict()
                    filteredScrape.how_many_sp_to_keep(selectby=selectby)
                    filteredScrape.replace_new_seq()
            filteredScrape.data.prune_short()
            sys.stdout.write("calculate the phylogeny\n")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.dump()
            write_out_files(filteredScrape, downtorank)
            if backbone:
                filteredScrape.repeat = 0
        writeinfofiles.get_additional_GB_info(filteredScrape)
        filteredScrape.dump()
        dump_fn = "add_different_rank{}_{}.run".format(ingroup_mrca, downtorank)
        fn = open(dump_fn, "w")
        fn.write("add different rank with following settings {} and {} finished".format(
            ingroup_mrca, downtorank))
        fn.close()
    return filteredScrape
def test_blacklist():
    workdir = "tests/output/test_blacklist"
    configfi = "tests/data/test.config"

    # make one run without blacklist
    debug("run without blacklist")
    blacklist = None
    noblack = os.path.join(workdir, "noblacklist")
    absworkdir = os.path.abspath(noblack)
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    noblackScrape = FilterBlast(data_obj, ids)
    noblackScrape._blasted = 1
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        # print(dest)
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)
    noblackScrape.read_blast_wrapper()
    noblackScrape.remove_identical_seqs()
    noblackScrape.generate_streamed_alignment()

    # one run with blacklist
    debug("run with blacklist")
    blacklist = ['JX895340.1']
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.blacklist = blacklist
    filteredScrape._blasted = 1
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()
    filteredScrape.generate_streamed_alignment()

    print("RUN TESTS!")
    gi_l = []
    gi_l_2 = []
    for tax in filteredScrape.data.tre.taxon_namespace:
        gi_id = filteredScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l.append(gi_id)
    print(gi_l)
    for tax in noblackScrape.data.tre.taxon_namespace:
        # print(filteredScrape.data.otu_dict[tax.label])
        gi_id = noblackScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l_2.append(gi_id)
    print(gi_l_2)
    for item in blacklist:
        assert item not in gi_l
        print("RUN TESTS2!")
        assert item in gi_l_2
    # # print("seq was not added in blacklist run")
    # print("inbetween step works")

    # test if it removes blacklist gi from already added aln:
    print("run with later blacklist")
    # else:
    #     print("blacklist gi was added in previous run")
    #     print("now we want to remove it.")
    len_before = len(noblackScrape.data.tre.taxon_namespace)
    noblackScrape.blacklist = blacklist
    noblackScrape.generate_streamed_alignment()
    assert len_before - 1 == len(noblackScrape.data.tre.taxon_namespace)
def run_with_settings(settings):
    """Looks for a pickled file to continue a run, or builds and runs a new analysis
    for as long as new seqs are found.

    This uses the FilterBlast subclass to be able to filter the blast output."""
    debug("Debugging mode is on")
    if os.path.isfile("{}/scrape_checkpoint.p".format(settings.workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(
            open("{}/scrape_checkpoint.p".format(settings.workdir), "rb")
        )
        filteredScrape.repeat = 1
    else:
        conf = ConfigObj(settings.configfi)
        # print("config")
        debug(dir(conf))
        debug(conf.email)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=settings.seqaln,
                                           mattype=settings.mattype,
                                           workdir=settings.workdir,
                                           treefile=settings.trfn,
                                           schema_trf=settings.schema_trf,
                                           otu_json=settings.spInfoDict,
                                           ingroup_mrca=None)
        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated,
        # as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        ids = IdDicts(conf, workdir=settings.workdir)
        filteredScrape = FilterBlast(data_obj, ids, settings)
        filteredScrape.add_setting_to_self(settings.downtorank, settings.threshold)
        filteredScrape.write_otu_info(settings.downtorank)
        if settings.add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.write_unpubl_blastdb(settings.add_unpubl_seq)
            filteredScrape.run_blast_wrapper(settings.delay)
            filteredScrape.local_otu_json = settings.id_to_spn_addseq_json
            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False
        # run the analyses
        if filteredScrape.unpublished is not True:
            filteredScrape.run_blast_wrapper(settings.delay)
            filteredScrape.read_blast_wrapper(blast_dir=settings.shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            if settings.threshold is not None:
                filteredScrape.sp_dict(settings.downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=settings.threshold,
                                                   selectby=settings.selectby)
                filteredScrape.replace_new_seq()
            debug("from replace to streamed aln")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.dump()
    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        filteredScrape.run_blast_wrapper(settings.delay)
        filteredScrape.read_blast_wrapper(blast_dir=settings.shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        if settings.threshold is not None:
            filteredScrape.sp_dict(settings.downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=settings.threshold,
                                               selectby=settings.selectby)
            filteredScrape.replace_new_seq()
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
        filteredScrape.write_otu_info(settings.downtorank)
    return filteredScrape
def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """Looks for a json file to continue a run, or builds and runs a new analysis
    for as long as new seqs are found.

    This is the wrapper function to start a PhyScraper run with tree and alignment
    ids from Open Tree of Life.

    You need:
        study_id = OpenTree study ID of the phylogeny to update
        tree_id = OpenTree tree ID of the phylogeny within that study
        seqaln = your alignment file
        mattype = the format name of your alignment
        workdir = define where your analysis files shall be stored
        configfi = path to your config file
        ingroup_mrca = define the mrca by supplying the Open Tree of Life identifier
                       of the clade of interest
        shared_blast_folder = not necessary; if you want to share blast searches across
                              runs (see documentation), give the path to the folder with
                              the shared runs
    """
    debug("Debugging mode is on")
    conf = ConfigObj(configfi, interactive=False)
    if os.path.isfile("{}/att_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading data object from pickle file\n")
        data_obj = pickle.load(open("{}/att_checkpoint.p".format(workdir), "rb"))
        # scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=False)
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc,
                                                 ingroup_mrca=ingroup_mrca)
        # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
        # ids = IdDicts(conf, workdir="example")
        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated,
        # as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
    # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
    if os.path.isfile(conf.id_pickle):
        sys.stdout.write("Reloading id dicts from {}\n".format(conf.id_pickle))
        ids = pickle.load(open(conf.id_pickle, "rb"))
    else:
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        ids.dump()
    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    scraper = PhyscraperScrape(data_obj, ids)
    # run the analyses
    if shared_blast_folder:
        scraper.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    scraper.run_blast_wrapper(delay=14)
    scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
        # scraper.write_otu_info()
    return scraper
def own_data_run(seqaln,
                 mattype,
                 trfn,
                 schema_trf,
                 workdir,
                 sp_info_jsonfi,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """This is the wrapper function to start a PhyScraper run with your own data.

    You need:
        seqaln = path to sequence alignment file
        mattype = the format name of your alignment
        trfn = path to file with the phylogeny to update
        schema_trf = format type of your phylogeny
        workdir = define where your analysis files shall be stored
        sp_info_jsonfi = a json file which has the otu_dict stored, which is generated
                         by the OtuJsonDict function (usually, just leave it like it is
                         in the example scripts)
        configfi = path to your config file
        ingroup_mrca = not necessary; if you want to limit your run to a certain clade,
                       give the OpenTree ID here, which can be obtained by running:
                       python scripts/get_ott.py ingroup_name
        shared_blast_folder = not necessary; if you want to share blast searches across
                              runs (see documentation), give the path to the folder with
                              the shared runs
    """
    debug("Debugging mode is on")
    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: ATT\n")
        scraper = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), "rb"))
        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=False)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln,
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=sp_info_jsonfi,
                                           ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        sys.stdout.write("setting up ID dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        scraper = PhyscraperScrape(data_obj, ids)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        # run the analyses
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.run_blast_wrapper(delay=14)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    return 1
def user_defined_concat(self):
    """Combines the sequences according to a user-supplied concatenation input file.

    Fills in the data for self.comb_seq and self.comb_acc (replacement function
    for select_rnd_seq).
    """
    physcraper.debug("user_defined_concat")
    with open("{}/{}".format(self.workdir, self.concatfile), mode="r") as infile:
        reader = csv.reader(infile)
        sp_concat = dict((rows[0], rows[1]) for rows in reader)
    for otu in sp_concat.keys():
        global_spn = None
        concat_l = sp_concat[otu]
        if concat_l[:1] == "[":
            concat_l = concat_l[1:-1]
        concat_l = concat_l.split(", ")
        for item in concat_l:
            gene_l = []
            if item[:1] == "'":
                item = item[1:-1]
            item = item.encode("utf-8")
            for gene in self.single_runs:
                spn = None
                for key, val in self.single_runs[gene].data.otu_dict.items():
                    if item.isdigit():
                        if "^ncbi:accession" in val:
                            if item == val["^ncbi:accession"]:
                                spn = val["^ot:ottTaxonName"]
                                gene_l.append(gene)
                    else:
                        if "^ncbi:accession" in val:
                            if item == val["^ncbi:accession"]:
                                spn = val["^ot:ottTaxonName"]
                                gene_l.append(gene)
                        elif u"^ot:originalLabel" in val:
                            if item == val[u"^ot:originalLabel"]:
                                spn = val["^ot:ottTaxonName"]
                                gene_l.append(gene)
                    if spn is not None:
                        global_spn = spn.replace(".", "").replace("'", "")
                        spn = spn.replace(".", "").replace("'", "")
                        for key2, val2 in self.sp_acc_comb[spn][gene].items():
                            cond = False
                            if len(item.split(".")) >= 2 and val2["unique_id"] == item:
                                cond = True
                            else:
                                if val2["unique_id"] == item:
                                    cond = True
                            if cond:
                                concat_id = key2
                                self.sp_acc_comb[spn][gene][concat_id]["concat:status"] = \
                                    "used in concat"
                                seq = str(self.sp_acc_comb[spn][gene][concat_id]["seq"])
                                otu_ = otu.replace(" ", "_")
                                otu_ = otu_.replace(".", "").replace("'", "")
                                if gene in self.comb_seq.keys():
                                    if otu_ not in self.comb_seq[gene].keys():
                                        self.comb_seq[gene][otu_] = seq
                                        if gene in self.comb_acc:
                                            self.comb_acc[gene][otu_] = concat_id
                                        else:
                                            self.comb_acc[gene] = {otu_: concat_id}
                                    else:
                                        self.comb_seq[gene][otu_] = seq
                                        self.comb_acc[gene][otu_] = concat_id
                                else:
                                    self.comb_seq[gene] = {otu_: seq}
                                    self.comb_acc[gene] = {otu_: concat_id}
                                if spn != otu:
                                    self.sp_acc_comb[spn][gene][concat_id]["new tipname"] = otu_
                                self.otu_to_spn(spn, gene, concat_id)
                        break
                if spn is not None:
                    break
        if len(gene_l) == len(concat_l):
            missing_gene = [item for item in self.genes_present if item not in gene_l]
            for genes in missing_gene:
                self.make_empty_seq(global_spn, genes)