def make_alns_dict(self):
    """Turn every gene in ``self.comb_seq`` into a dendropy alignment.

    All genes must hold the same number of sequences (checked with an
    assert). The alignment built for the first gene supplies the taxon
    namespace shared by all later ones. Each alignment is stored in
    ``self.aln_all`` and written as FASTA into ``self.workdir``.

    Numbering quirk kept on purpose: keys in ``self.aln_all`` start at 1,
    while the first file is always called ``aln_0.fas``.
    """
    physcraper.debug("make_alns_dict")

    # Pass 1: sanity check -- every gene has as many sequences as the first.
    count = 0
    expected = None
    for gene in self.comb_seq.keys():
        n_seqs = len(self.comb_seq[gene].keys())
        if count == 0:
            expected = n_seqs
            count = 1
        assert expected == n_seqs

    # Pass 2: build and write the alignments.
    first_aln = None
    for gene in self.comb_seq.keys():
        seqs = self.comb_seq[gene]
        if first_aln is None:
            first_aln = DnaCharacterMatrix.from_dict(seqs)
            self.aln_all[count] = first_aln
            first_aln.write(path="{}/aln_0.fas".format(self.workdir),
                            schema="fasta")
        else:
            # Reuse the namespace of the first alignment for the other genes.
            gene_aln = DnaCharacterMatrix.from_dict(
                seqs, taxon_namespace=first_aln.taxon_namespace)
            self.aln_all[count] = gene_aln
            gene_aln.write(path="{}/aln_{}.fas".format(self.workdir, count),
                           schema="fasta")
        count += 1
def create_sub_files(
    alignment_file,
    dates_file,
    subtree_file,
    subtree_dates_file,
    subfasta_file,
    new_dates_file,
):
    """Write date-suffixed copies of a tree, an alignment and a dates table.

    ``read_dates(dates_file)`` supplies a taxon -> date mapping; dates are
    concatenated to names, so they are expected to be strings.

    :param alignment_file: FASTA alignment to read
    :param dates_file: file understood by ``read_dates``
    :param subtree_file: tree file to read
    :param subtree_dates_file: output tree with ``<taxon>_<date>`` names
    :param subfasta_file: output FASTA with ``<taxon>_<date>`` names
    :param new_dates_file: output table: count, then ``name<TAB>date`` rows
    """
    dates_dic = read_dates(dates_file)

    # clean up comments and add dates to end of taxon names
    with open(subtree_file, "r") as fp:
        content = fp.read().replace("None", "")
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    content = re.sub(r"NODE_\d+", "", content)
    # NOTE(review): plain substring replacement -- a taxon name that is a
    # prefix of another name would also be rewritten inside the longer one.
    for taxon, date in dates_dic.items():
        content = content.replace(taxon, taxon + "_" + date)
    with open(subtree_dates_file, "w") as fp:
        fp.write(content)

    # add dates to end of sequence names
    sub_aln_dic = {}
    dna = DnaCharacterMatrix.get(path=alignment_file, schema="fasta")
    for taxon, date in dates_dic.items():
        t = dna.taxon_namespace.get_taxon(label=taxon)
        new_taxon_name = taxon + "_" + date
        sub_aln_dic[new_taxon_name] = str(dna[t])
    sub_dna = DnaCharacterMatrix.from_dict(sub_aln_dic)
    sub_dna.write(path=subfasta_file, schema="fasta")

    # First line is the number of taxa, then one "name<TAB>date" row each.
    with open(new_dates_file, "w") as fp:
        fp.write(str(len(dates_dic)))
        for taxon, date in dates_dic.items():
            fp.write("\n" + taxon + "_" + date + "\t" + date)
def generate_ATT_from_files(seqaln,
                            mattype,
                            workdir,
                            treefile,
                            otu_json,
                            ingroup_mrca=None):
    """Build an ATT object without phylesystem.

    If no ingroup mrca ott_id is provided, will use all taxa in tree to
    calc mrca.

    :param seqaln: path to the sequence alignment
    :param mattype: dendropy schema name of the alignment
    :param workdir: working directory handed on to ``AlignTreeTax``
    :param treefile: path to a newick tree
    :param otu_json: path to a JSON file mapping tip labels to otu info
    :param ingroup_mrca: optional OTT id of the ingroup mrca
    :return: ``AlignTreeTax`` object
    """
    aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
    for tax in aln.taxon_namespace:
        tax.label = tax.label.replace(" ", "_")  # Forcing all spaces to underscore UGH
    with open(otu_json) as data_file:
        otu_dict = json.load(data_file)
    for tax in aln:
        assert tax.label in otu_dict
    # The tree used to be parsed twice with identical arguments; once is enough.
    tre = Tree.get(path=treefile,
                   schema="newick",
                   preserve_underscores=True,
                   taxon_namespace=aln.taxon_namespace)
    otu_newick = tre.as_string(schema="newick")
    if ingroup_mrca:
        ott_mrca = int(ingroup_mrca)
    else:
        # ``.get`` must be called, not subscripted (the old code always raised
        # TypeError here). Entries without an OTT id are left out.
        ott_ids = [otu_dict[otu].get('^ot:ottId') for otu in otu_dict]
        ott_ids = [ott_id for ott_id in ott_ids if ott_id]
        ott_mrca = get_mrca_ott(ott_ids)
    return AlignTreeTax(otu_newick, otu_dict, aln, ingroup_mrca=ott_mrca, workdir=workdir)
def getNucleotides(jsonname, fstname):
    """Load a FASTA matrix and relabel its taxa from a JSON metadata file.

    ``<jsonname>.json`` is expected to map accession -> metadata, where the
    metadata has a ``"name"`` entry. A taxon whose label, cut at the first
    ".", equals an accession is renamed to that name with spaces turned
    into underscores. Other taxa keep their label.

    :param jsonname: path of the JSON file without the ``.json`` suffix
    :param fstname: base name of ``../work/<fstname>.fst``
    :return: the relabelled character matrix
    """
    # ``handle`` rather than ``json`` so the module name is not shadowed.
    with open(jsonname + ".json") as handle:
        taxa = load(handle)
    nucleotides = DnaCharacterMatrix.get_from_path("../work/" + fstname + ".fst", schema="fasta")
    for taxon in nucleotides.taxon_set:
        # Direct lookup instead of scanning every JSON entry for each taxon.
        accession = taxon.label.split(".")[0]
        if accession in taxa:
            taxon.label = taxa[accession]["name"].replace(" ", "_")
    return nucleotides
def prepare_phylotorch(
    subfasta_file,
    subtree_file,
    dates_file,
    json_template_file,
    json_file,
    iterations,
    bito,
):
    """Fill a JSON template with taxa, dates, tree and sequences.

    Placeholders replaced in the template text: ``TAXA_TEMPLATE``,
    ``ITERATION_TEMPLATE``, ``ROOT_SHIFT_TEMPLATE``, ``DIM_TEMPLATE``,
    ``SEQUENCES_TEMPLATE`` and ``TREE_TEMPLATE``.

    :param subfasta_file: FASTA alignment (path is embedded when *bito* is true)
    :param subtree_file: tree file (path is embedded when *bito* is true)
    :param dates_file: file understood by ``read_dates`` (taxon -> date)
    :param json_template_file: template to read
    :param json_file: output path
    :param iterations: iteration count; converted with ``str`` so both
        strings and numbers are accepted
    :param bito: the string ``"true"`` (any case) embeds file paths,
        anything else embeds the tree and sequences themselves
    """
    dates = read_dates(dates_file)
    datess = list(map(float, dates.values()))
    root_shift = max(datess) - min(datess)

    taxa = []
    for taxon, date in dates.items():
        taxa.append({
            "id": "{}".format(taxon),
            "type": "Taxon",
            "attributes": {
                "date": float(date)
            },
        })

    with open(json_template_file, "r") as fp:
        content = fp.read()

    content = content.replace("TAXA_TEMPLATE", json.dumps(taxa))
    # str(): ``str.replace`` raises TypeError for a non-string replacement.
    content = content.replace("ITERATION_TEMPLATE", str(iterations))
    content = content.replace("ROOT_SHIFT_TEMPLATE", str(root_shift))
    content = content.replace("DIM_TEMPLATE", str(len(datess) - 2))

    if bito.lower() == "true":
        # Point at the files and switch the template's type tags to "file".
        content = (content.replace(
            "SEQUENCES_TEMPLATE", '"' + subfasta_file + '"').replace(
                "TREE_TEMPLATE", '"' + subtree_file + '"').replace(
                    '"newick"', '"file"').replace('"sequences"', '"file"'))
    else:
        with open(subtree_file, "r") as fp:
            newick = fp.read().strip()
        alignment = DnaCharacterMatrix.get(path=subfasta_file, schema="fasta")
        sequences = []
        for name in alignment:
            sequences.append({
                # str(taxon) is wrapped in single quotes; strip them.
                "taxon": str(name).strip("'"),
                "sequence": str(alignment[name])
            })
        content = content.replace("SEQUENCES_TEMPLATE",
                                  json.dumps(sequences)).replace(
                                      "TREE_TEMPLATE", '"' + newick + '"')

    with open(json_file, "w") as fp:
        fp.write(content)
def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi):
    """Run physcraper rounds until the scraper stops asking for more.

    If ``<workdir>/scrape.p`` exists the pickled scraper is reloaded and
    forced to repeat; otherwise a new scraper is built from the phylesystem
    study/tree and one round is run. Rounds are then repeated while
    ``scraper.repeat == 1``.

    :param study_id: OpenTree study id
    :param tree_id: OpenTree tree id
    :param seqaln: path to the sequence alignment
    :param mattype: dendropy schema name of the alignment
    :param workdir: working directory
    :param configfi: path to the configuration file
    """
    if os.path.isfile("{}/scrape.p".format(workdir)):
        sys.stdout.write("Readloading from pickled scrapefile")
        # NOTE: unpickling executes code; only load trusted checkpoint files.
        with open("{}/scrape.p".format(workdir), 'rb') as pickled:
            scraper = pickle.load(pickled)
        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi)
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)

        # Generate an linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc)

        # Prune sequences below a certain length threshold.
        # This is particularly important when using loci that have been
        # de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()

        data_obj.write_files()
        data_obj.write_labelled()

        # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
        # NOTE(review): workdir is hard-coded to "example" instead of
        # ``workdir`` -- confirm this is intended.
        ids = IdDicts(conf, workdir="example")

        # Now combine the data, the ids, and the configuration into a single physcraper scrape object
        scraper = PhyscraperScrape(data_obj, ids, conf)
        # run the analyses
        scraper.run_blast()
        scraper.read_blast()
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.run_blast()
        scraper.read_blast()
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
def generate_ATT_from_files(seqaln,
                            mattype,
                            workdir,
                            config_obj,
                            treefile,
                            otu_json,
                            schema_trf,
                            ingroup_mrca=None):
    """Build an ATT object without phylesystem, use your own files instead.

    Spaces vs underscores kept being an issue, so all spaces are coerced to
    underscores when data are read in.

    Note: has test -> test_owndata.py

    :param seqaln: path to sequence alignment
    :param mattype: string containing format of sequence alignment
    :param workdir: path to working directory (created if missing)
    :param config_obj: config class including the settings
    :param treefile: path to phylogeny
    :param otu_json: path to json file containing the translation of tip names to taxon names,
        generated with OtuJsonDict()
    :param schema_trf: string defining the format of the input phylogeny
    :param ingroup_mrca: optional - OToL ID of the mrca of the clade of interest.
        If no ingroup mrca ott_id is provided, will use all taxa in tree to calc mrca.
    :return: object of class ATT
    """
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
    assert aln.taxon_namespace
    for tax in aln.taxon_namespace:
        tax.label = tax.label.replace(" ", "_")  # Forcing all spaces to underscore
    # The tree shares the alignment's namespace so tips and sequences use the
    # same taxon objects.
    tre = Tree.get(path=treefile,
                   schema=schema_trf,
                   preserve_underscores=True,
                   taxon_namespace=aln.taxon_namespace)
    assert tre.taxon_namespace is aln.taxon_namespace, "tre and aln have not the same namespace."
    otu_newick = tre.as_string(schema=schema_trf)
    # Context manager so the JSON file handle is closed deterministically.
    with open(otu_json, "r") as otu_file:
        otu_dict = json.load(otu_file)
    if ingroup_mrca:
        mrca_ott = int(ingroup_mrca)
    else:
        # Unique, non-empty OTT ids of all otus.
        ott_ids = set(ott_id
                      for ott_id in (otu_dict[otu].get(u'^ot:ottId') for otu in otu_dict)
                      if ott_id)
        mrca_ott = get_mrca_ott(ott_ids)
    return AlignTreeTax(otu_newick, otu_dict, aln, ingroup_mrca=mrca_ott,
                        workdir=workdir, config_obj=config_obj, schema=schema_trf)
def test_0():
    """Load the precooked scraper fixtures, or build and pickle them.

    When ``tests/data/precooked/otol_scraper.p`` exists, the data object, id
    map and scraper are unpickled. Otherwise they are created from the
    module-level settings and written to the precooked directory.
    """
    if os.path.isfile("tests/data/precooked/otol_scraper.p"):
        conf = physcraper.ConfigObj(configfi, interactive=False)
        conf.unmapped = 'keep'
        with open("tests/data/precooked/otol_tiny_dataobj.p", 'rb') as handle:
            data_obj = pickle.load(handle)
        data_obj.workdir = absworkdir
        ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
        with open("tests/data/precooked/otol_tiny_gi_map.p", "rb") as handle:
            ids.acc_ncbi_dict = pickle.load(handle)
        with open("tests/data/precooked/otol_scraper.p", "rb") as handle:
            scraper = pickle.load(handle)
        num_keep = len(scraper.data.aln.taxon_namespace)
    else:
        sys.stdout.write("\n\n No files present\n\n")
        conf = physcraper.ConfigObj(configfi)
        conf.unmapped = 'keep'
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        data_obj = physcraper.generate_ATT_from_phylesystem(
            aln=aln,
            workdir=workdir,
            study_id=study_id,
            tree_id=tree_id,
            phylesystem_loc=conf.phylesystem_loc)
        # ``with`` guarantees each pickle is flushed and closed before the
        # next step runs.
        with open("tests/data/precooked/otol_tiny_dataobj.p", "wb") as handle:
            pickle.dump(data_obj, handle)
        ids = physcraper.IdDicts(conf, workdir=workdir)
        with open("tests/data/precooked/otol_tiny_gi_map.p", "wb") as handle:
            pickle.dump(ids.acc_ncbi_dict, handle)
        data_obj.write_files()
        scraper = physcraper.PhyscraperScrape(data_obj, ids)
        with open("tests/data/precooked/otol_conf.p", "wb") as handle:
            pickle.dump(scraper.config, handle)
        with open("tests/data/precooked/otol_scraper.p", "wb") as handle:
            pickle.dump(scraper, handle)
        num_keep = len(scraper.data.aln.taxon_namespace)
def read_tree_and_alignment(tree, alignment, dated=True, heterochornous=True):
    """Read a tree and an alignment that share one taxon namespace.

    The tree is loaded through ``read_tree``. The alignment is parsed as
    FASTA when its first line starts with ``>`` and as NEXUS otherwise.
    If the number of sequences differs from the number of taxa in the
    namespace, a message is written to stderr and the process exits with
    status 2.

    :return: ``(tree, dna)`` tuple
    """
    tree = read_tree(tree, dated, heterochornous)

    # Peek at the first line to decide which parser to use.
    with open(alignment) as handle:
        first_line = next(handle)
    if first_line.startswith('>'):
        reader_options = {'schema': 'fasta'}
    else:
        reader_options = {'schema': 'nexus', 'preserve_underscores': True}

    dna = DnaCharacterMatrix.get(path=alignment,
                                 taxon_namespace=tree.taxon_namespace,
                                 **reader_options)

    # Extra taxa in the namespace mean tree and alignment names disagree.
    if len(dna) != len(dna.taxon_namespace):
        sys.stderr.write('taxon names in trees and alignment are different')
        exit(2)
    return tree, dna
def load_otol_data(conf, ingroup_mrca, mattype, seqaln, study_id, tree_id, workdir):
    """
    Generates ATT object from OToL data.

    Reloads ``<workdir>/att_checkpoint.p`` when it exists; otherwise builds
    the object from the phylesystem study/tree, prunes short sequences,
    writes the output files and dumps a checkpoint.

    :param conf: conf object from physcraper
    :param ingroup_mrca: mrca of ingroup as OTT ID
    :param mattype: alignment matrix type
    :param seqaln: alignment file name
    :param study_id: OToL study ID
    :param tree_id: OToL tree ID
    :param workdir: working directory
    :return: ATT object
    """
    checkpoint = "{}/att_checkpoint.p".format(workdir)
    if os.path.isfile(checkpoint):
        sys.stdout.write("Reloading data object from pickle file\n")
        # NOTE: unpickling executes code; only load trusted checkpoint files.
        with open(checkpoint, "rb") as pickled:
            data_obj = pickle.load(pickled)
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate an linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 config_obj=conf,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc,
                                                 ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated,
        # as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
    assert isinstance(data_obj, AlignTreeTax)
    return data_obj
def concatenate_alns(self):
    """Join every alignment in ``self.aln_all`` into a single alignment.

    Each input alignment is first written to ``aln<N>.fas`` (N counts from
    1) in ``self.workdir``. The concatenated result is written to
    ``concat.fas`` and stored in ``self.concatenated_aln``. All alignments
    must share the taxon namespace of the first one.
    """
    physcraper.debug("concat alns")
    for position, gene in enumerate(self.aln_all, 1):
        current = self.aln_all[gene]
        if position == 1:
            # The first alignment seeds the running concatenation.
            combined = current
            combined.write(path="{}/aln1.fas".format(self.workdir),
                           schema="fasta")
            continue
        current.write(path="{}/aln{}.fas".format(self.workdir, position),
                      schema="fasta")
        assert combined.taxon_namespace == current.taxon_namespace
        combined = DnaCharacterMatrix.concatenate([combined, current])
    combined.write(path="{}/concat.fas".format(self.workdir),
                   schema="fasta")
    self.concatenated_aln = combined
def test_generate_ATT_from_phylesystem():
    """Check that an ATT object can be built from a phylesystem study/tree."""
    study_id = "pg_873"
    tree_id = "tree1679"
    # The earlier value "tests/data/input.fas" was overwritten before use.
    seqaln = "tests/data/minitest.fas"
    mattype = "fasta"
    workdir = "tests/output/opentree"
    configfi = "tests/data/remotencbi.config"

    sys.stdout.write("\nTesting 'generate_ATT_from_files (fromfile.py)'\n")

    conf = physcraper.ConfigObj(configfi, interactive=False)
    aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
    data_obj = physcraper.generate_ATT_from_phylesystem(aln=aln,
                                                        workdir=workdir,
                                                        config_obj=conf,
                                                        study_id=study_id,
                                                        tree_id=tree_id)
    # ``data_obj == True`` was a bare expression that checked nothing; assert
    # on the returned type so the test can actually fail.
    assert isinstance(data_obj, physcraper.AlignTreeTax)
def write_labelled(self, label='^ot:ottTaxonName', treepath="labelled.tre", alnpath="labelled.fas"):
    """Write the tree and alignment with human readable tip labels.

    Works on copies (serialise, then re-parse) of ``self.tre`` and
    ``self.aln``, so the originals keep their labels. NOT MEMORY EFFICIENT
    AT ALL.

    For every taxon the new label is the first non-empty value among
    ``otu_dict[...][label]``, ``"^ot:originalLabel"`` and
    ``"ncbi <^ncbi:taxon>"``. When none is available the label is left
    alone. A label that was already used gets the old tip label appended.

    :param label: otu_dict key to use for the new labels
    :param treepath: tree file name inside ``self.workdir``
    :param alnpath: alignment file name inside ``self.workdir``
    """
    assert label in ['^ot:ottTaxonName', "^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"]
    tmp_tre = Tree.get(data=self.tre.as_string(schema="newick"),
                       schema="newick",
                       preserve_underscores=True)
    tmp_aln = DnaCharacterMatrix.get(data=self.aln.as_string(schema="fasta"),
                                     schema="fasta",
                                     taxon_namespace=tmp_tre.taxon_namespace)
    used = set()
    for taxon in tmp_tre.taxon_namespace:
        info = self.otu_dict[taxon.label]
        # Pick the first naming source that has a value.
        candidate = info.get(label)
        if not candidate:
            candidate = info.get("^ot:originalLabel")
        if not candidate:
            ncbi_id = info.get("^ncbi:taxon")
            if not ncbi_id:
                continue
            candidate = " ".join(["ncbi", str(ncbi_id)])
        # Keep labels unique by appending the original tip label.
        if candidate in used:
            candidate = " ".join([candidate, taxon.label])
        used.add(candidate)
        taxon.label = candidate
    tmp_tre.write(path="{}/{}".format(self.workdir, treepath),
                  schema="newick",
                  unquoted_underscores=True,
                  suppress_edge_lengths=False)
    tmp_aln.write(path="{}/{}".format(self.workdir, alnpath),
                  schema="fasta")
def test_opentree():
    """Build an ATT object from OpenTree phylesystem ids and check its type."""
    sys.stdout.write("\nTesting 'opentree scrape (1 round)'\n")

    # Input files and settings for the run.
    configfi = "tests/data/remotencbi.config"
    workdir = "tests/output/opentree"
    seqaln = "tests/data/minitest.fas"
    mattype = "fasta"
    # OpenTree phylesystem identifiers of the study and tree.
    study_id = "pg_873"
    tree_id = "tree1679"

    conf = physcraper.ConfigObj(configfi, interactive=False)
    aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
    att_kwargs = dict(aln=aln,
                      workdir=workdir,
                      config_obj=conf,
                      study_id=study_id,
                      tree_id=tree_id,
                      phylesystem_loc=conf.phylesystem_loc)
    data_obj = physcraper.generate_ATT_from_phylesystem(**att_kwargs)
    assert isinstance(data_obj, AlignTreeTax)
def align_query_seqs(self, papara_runname="extended"):
    """Run papara on the tree, the alignment and the new query sequences.

    Existing ``papara*`` files in ``self.workdir`` are renamed with a
    ``_tmp`` suffix first. The resulting ``papara_alignment.<runname>``
    replaces ``self.data.aln``, after which the data are reconciled.

    :param papara_runname: run name handed to papara via ``-n``
    """
    if not self._query_seqs_written:
        self.write_query_seqs()
    for filename in glob.glob('{}/papara*'.format(self.workdir)):
        # basename: ``filename.split("/")[1]`` only gave the file name when
        # workdir was a single relative path component.
        os.rename(filename, "{}/{}_tmp".format(self.workdir, os.path.basename(filename)))
    sys.stdout.write("aligning query sequences \n")
    self.data.write_papara_files()
    # Run papara inside workdir via ``cwd`` instead of chdir(workdir) followed
    # by chdir('..'), which left the process in the wrong directory for
    # nested or absolute workdirs.
    subprocess.call(["papara",
                     "-t", "random_resolve.tre",
                     "-s", "aln_ott.phy",
                     "-q", self.newseqs_file,
                     "-n", papara_runname],
                    cwd=self.workdir)
    sys.stdout.write("Papara done")
    papara_aln = "{}/papara_alignment.{}".format(self.workdir, papara_runname)
    assert os.path.exists(papara_aln)
    self.data.aln = DnaCharacterMatrix.get(path=papara_aln, schema="phylip")
    self.data.aln.taxon_namespace.is_mutable = False  # This should enforce name matching throughout...
    with open(self.logfile, "a") as log:
        log.write("Following papara alignement, aln has {} seqs \n".format(len(self.data.aln)))
    self.data.reconcile()
    self._query_seqs_aligned = 1
ncbi_to_ott = {} fi =open(ott_ncbi) #pickle meeeee for lin in fi: lii= lin.split(",") ncbi_to_ott[int(lii[1])]=int(lii[0]) gi_ncbi_map = {} if os.path.isfile("id_map.txt"): fi = open("id_map.txt") for lin in fi: gi_ncbi_map[int(lin.split(",")[0])]=lin.split(",")[1] orig_seq = DnaCharacterMatrix.get(path="accs",schema="fasta") #prune out identical sequences mapped_taxon_ids=open("id_map.txt","a") stops = [] for taxon, seq in orig_seq.items(): gi = int(taxon.label.split('|')[1]) if gi in gi_ncbi_map.keys(): try: taxon.label = ncbi_to_ott[int(gi_ncbi_map[gi])] except: taxon.label = "ncbi_id_{}".format(int(gi_ncbi_map[gi])) else: try: ncbi_id = int(subprocess.check_output(["bash", get_ncbi_taxonomy, "{}".format(gi), "{}".format(ncbi_dmp)]).split('\t')[1])
def run(arg):
    """Assemble the Stan data dictionary from the inputs and run the model.

    Steps, in order: read the tree (newick or nexus, sniffed from the first
    line), optionally read the alignment and the metadata table, fill the
    ``data`` dict, load the compiled Stan model from a pickle (compiling it
    when missing or when ``arg.compile`` is set), set up initial values and
    run the algorithm named by ``arg.algorithm`` (``LBFGS``, ``VB``, or
    sampling for anything else).

    :param arg: parsed command-line namespace; the attributes read here
        include ``tree``, ``input``, ``metadata``, ``dates``, ``clock``,
        ``model``, ``script``, ``output``, ``algorithm`` and ``init``
    """
    taxa = dendropy.TaxonNamespace()

    tree_format = 'newick'
    with open(arg.tree.name) as fp:
        if next(fp).upper().startswith('#NEXUS'):
            tree_format = 'nexus'

    tree = Tree.get(
        file=arg.tree,
        schema=tree_format,
        tree_offset=0,
        taxon_namespace=taxa,
        preserve_underscores=True,
        rooting='force-rooted',
    )
    tree.resolve_polytomies(update_bipartitions=True)
    utils.setup_indexes(tree)
    oldest = utils.setup_dates(tree, arg.dates, arg.heterochronous)
    peeling = utils.get_peeling_order(tree)
    sequence_count = len(tree.taxon_namespace)
    data = {'peel': peeling, 'S': sequence_count}

    if arg.input:
        # FASTA when the first line starts with '>', NEXUS otherwise.
        seqs_args = dict(schema='nexus', preserve_underscores=True)
        with open(arg.input.name) as fp:
            if next(fp).startswith('>'):
                seqs_args = dict(schema='fasta')

        dna = DnaCharacterMatrix.get(file=arg.input,
                                     taxon_namespace=taxa,
                                     **seqs_args)
        alignment_length = dna.sequence_size
        sequence_count = len(dna)
        if sequence_count != len(dna.taxon_namespace):
            sys.stderr.write(
                'taxon names in trees and alignment are different')
            exit(2)

        print('Number of sequences: {} length {} '.format(
            sequence_count, alignment_length))
        print('Model: ' + arg.model)

        tipdata, weights = utils.get_dna_leaves_partials_compressed(dna)
        # From here on 'L' is the length of the weights returned above.
        alignment_length = len(weights)

        data.update({
            'tipdata': tipdata,
            'L': alignment_length,
            'weights': weights
        })

    if arg.metadata:
        # Parse metadata file
        with open(arg.metadata) as fp:
            geodata = {}
            countries = {}
            geopattern = []
            header = next(fp).strip().split('\t')
            index_country = header.index(arg.metadata_key)
            for line in fp:
                row = line.strip().split('\t')
                if len(row) > 0:
                    geodata[row[0]] = row[index_country]
                    countries[row[index_country]] = 1

        # Number the states in the order they are met along the namespace.
        country_to_index = {}
        index_to_country = []
        for idx, taxon in enumerate(tree.taxon_namespace):
            country = geodata[taxon.label]
            if country not in country_to_index:
                country_to_index[country] = len(country_to_index)
                index_to_country.append(country)
        print('"' + '","'.join(index_to_country) + '"')

        # One-hot pattern per taxon.
        state_count = len(country_to_index)
        for idx, taxon in enumerate(tree.taxon_namespace):
            pattern = [0] * state_count
            country = geodata[taxon.label]
            pattern[country_to_index[country]] = 1
            geopattern.append(pattern)

        blens = [None] * (sequence_count * 2 - 1)
        for node in tree.postorder_node_iter():
            blens[node.index - 1] = node.edge.length
            if node.edge.length < 0:
                exit(3)
        children = tree.seed_node.child_nodes()
        blens[children[0].index] += blens[children[1].index]
        blens = blens[:-2]  # discard root branch and one of its child

        data['STATES'] = state_count
        data['blens'] = blens
        data['frequencies_alpha_geo'] = [1] * state_count
        data['rates_alpha_geo'] = [1] * int(state_count * (state_count - 1) / 2)
        data['geodata'] = geopattern

    if arg.clock is not None:
        data['map'] = utils.get_preorder(tree)
        if not arg.estimate_rate:
            data['rate'] = arg.rate if arg.rate else 1.0
        if arg.heterochronous:
            data['lowers'] = utils.get_lowers(tree)
            data['lower_root'] = max(oldest, arg.lower_root)
        else:
            data['lower_root'] = arg.lower_root
    else:
        # Order the first two entries of the last peeling step ascending.
        last = peeling[-1]
        if last[0] > last[1]:
            peeling[-1] = [last[1], last[0], last[2]]

    if arg.categories > 1:
        data['C'] = arg.categories
        if arg.invariant:
            data['C'] += 1

    if arg.clock is not None:
        if arg.coalescent == 'skygrid':
            data['G'] = arg.grid - 1
            data['grid'] = np.linspace(0, arg.cutoff, arg.grid)[1:]
        elif arg.coalescent == 'skyride':
            # number of coalescent intervals
            data['I'] = sequence_count - 1

    if arg.model == 'GTR':
        data['frequencies_alpha'] = [1, 1, 1, 1]
        data['rates_alpha'] = [1, 1, 1, 1, 1, 1]
    elif arg.model == 'HKY':
        data['frequencies_alpha'] = [1, 1, 1, 1]

    # Samples output file
    sample_path = arg.output
    tree_path = sample_path + '.trees'

    # The compiled model is cached next to the script as a pickle.
    binary = arg.script.replace('.stan', '.pkl')
    if binary == arg.script:
        binary = arg.script + '.pkl'
    if not os.path.lexists(binary) or arg.compile:
        sm = pystan.StanModel(file=arg.script)
        with open(binary, 'wb') as f:
            pickle.dump(sm, f)
    else:
        # NOTE: unpickling executes code; only load trusted model caches.
        with open(binary, 'rb') as f:
            sm = pickle.load(f)

    stan_args = {
        'data': data,
        'iter': arg.iter,
        'sample_file': sample_path,
        'algorithm': arg.algorithm,
    }

    if hasattr(arg, 'seed'):
        stan_args['seed'] = arg.seed

    if arg.init is not None:
        # Each non-empty line is "name: v1,v2,...".
        inits = {}
        for line in arg.init:
            line = line.strip()
            # Tests the current line. The old ``len(row)`` referred to a
            # variable of the metadata parser (NameError without metadata).
            if len(line) > 0:
                line = line.split(':')
                inits[line[0].strip()] = list(map(float, line[1].split(',')))
        stan_args['init'] = inits
    elif arg.heights_init or arg.rate is not None:
        inits = {}
        if arg.heights_init:
            ratios, root_height = utils.ratios_root_height_from_branch_lengths(
                tree)
            inits['props'] = ratios.tolist()
            inits['height'] = root_height.item() - data['lower_root']
            inits['rate'] = arg.rate
        elif arg.rate is not None:
            inits['rate'] = arg.rate
        stan_args['init'] = inits

    if arg.algorithm == 'LBFGS':
        fit = sm.optimizing(**stan_args)
        print(fit)
    elif arg.algorithm == 'VB':
        stan_args['algorithm'] = arg.variational
        stan_args['output_samples'] = arg.samples
        if arg.eta:
            stan_args['eta'] = arg.eta
            stan_args['adapt_engaged'] = False

        fit = sm.vb(tol_rel_obj=arg.tol_rel_obj,
                    elbo_samples=arg.elbo_samples,
                    grad_samples=arg.grad_samples,
                    diagnostic_file=sample_path + ".diag",
                    **stan_args)

        # parse the log file
        utils.convert_samples_to_nexus(tree, sample_path, tree_path, arg.rate)
        utils.parse_log(sample_path, 0.05)
    else:
        fit = sm.sampling(chains=arg.chains, thin=arg.thin, **stan_args)

        # chain=1 pystan uses sample_file
        if arg.chains == 1:
            if sample_path.endswith('.csv'):
                tree_path = sample_path.replace('.csv', '.trees')
            utils.convert_samples_to_nexus(tree, sample_path, tree_path,
                                           arg.rate)
            utils.parse_log(sample_path, 0.05)
        # chain>1 pystan appends _{chain}.csv to sample_file
        else:
            for chain in range(arg.chains):
                sample_path_chain = sample_path + '_{}.csv'.format(chain)
                tree_path_chain = sample_path + '_{}.trees'.format(chain)
                utils.convert_samples_to_nexus(tree, sample_path_chain,
                                               tree_path_chain, arg.rate)
                utils.parse_log(sample_path_chain, 0.05)
def write_labelled(self, label, filename="labelled", direc='workdir', norepeats=True, add_gb_id=False):
    """Output tree and alignment with human readable labels.

    Works on copies (serialise, then re-parse) of ``self.tre`` and
    ``self.aln``. NOT MEMORY EFFICIENT AT ALL.

    If an otu has no value for *label*, ``orig_<original label>`` is used,
    or else ``ncbi_<ncbi id>_ottname_<taxon name>``. Spaces become
    underscores.

    :param label: which information shall be displayed in labelled files: possible options:
        '^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
        "^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"
    :param filename: optional: file name stub; ``.tre`` and ``.fas`` are appended
    :param direc: optional: output directory; the default 'workdir' means ``self.workdir``
    :param norepeats: optional: if there shall be no duplicate names in the labelled output files
    :param add_gb_id: optional, to supplement tiplabel with corresponding GenBank sequence identifier
    :return: writes out labelled phylogeny and alignment to file
    """
    if direc == 'workdir':
        direc = self.workdir
    treepath = "{}/{}".format(direc, "{}.tre".format(filename))
    alnpath = "{}/{}".format(direc, '{}.fas'.format(filename))
    debug(treepath)
    assert label in ['^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
                     "^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"]
    tmp_newick = self.tre.as_string(schema="newick")
    tmp_tre = Tree.get(data=tmp_newick,
                       schema="newick",
                       preserve_underscores=True)
    tmp_fasta = self.aln.as_string(schema="fasta")
    tmp_aln = DnaCharacterMatrix.get(data=tmp_fasta,
                                     schema="fasta",
                                     taxon_namespace=tmp_tre.taxon_namespace)
    new_names = set()
    for taxon in tmp_tre.taxon_namespace:
        otu_info = self.otu_dict[taxon.label]
        new_label = otu_info.get(label, None)
        if new_label is None:
            if otu_info.get("^ot:originalLabel"):
                new_label = "orig_{}".format(otu_info["^ot:originalLabel"])
            else:
                new_label = "ncbi_{}_ottname_{}".format(otu_info.get("^ncbi:taxon", "unk"),
                                                        otu_info.get('^physcraper:TaxonName', "unk"))
        new_label = str(new_label).replace(' ', '_')
        if add_gb_id:
            gb_id = otu_info.get('^ncbi:accession')
            if gb_id is None:
                gb_id = otu_info.get("^ot:originalLabel")
            new_label = "_".join([new_label, str(gb_id)])
            if norepeats and new_label in new_names:
                # Search for the first free numeric suffix. The counter used
                # to restart at 2 for every taxon, so a third identical label
                # was given "_2" again and duplicates slipped through.
                sp_counter = 2
                candidate = "_".join([new_label, str(sp_counter)])
                while candidate in new_names:
                    sp_counter += 1
                    candidate = "_".join([new_label, str(sp_counter)])
                new_label = candidate
        else:
            if new_label in new_names and norepeats:
                # The original tip label makes the name unique.
                new_label = "_".join([new_label, taxon.label])
        taxon.label = new_label
        new_names.add(new_label)
    tmp_tre.write(path=treepath,
                  schema="newick",
                  unquoted_underscores=True,
                  suppress_edge_lengths=False)
    tmp_aln.write(path=alnpath,
                  schema="fasta")
import sys

from dendropy import DnaCharacterMatrix

# Command line: INFILE OUTSTUB START STOP
# Cuts columns START..STOP (Python slice semantics) out of a NEXUS alignment
# and writes them to OUTSTUB.fas.
infi = sys.argv[1]
outstub = sys.argv[2]
start = int(sys.argv[3])
stop = int(sys.argv[4])

orig = DnaCharacterMatrix.get(path=infi, schema="nexus")

# Label -> slice of that taxon's character values.
d = dict((taxon.label, seq.values()[start:stop]) for taxon, seq in orig.items())

dna = DnaCharacterMatrix.from_dict(d)
dna.write(path="{}.fas".format(outstub), schema="fasta")
import argparse

from dendropy.calculate import popgenstat
from dendropy import DnaCharacterMatrix

if __name__ == "__main__":
    # Command-line interface: one input alignment, one output text file.
    parser = argparse.ArgumentParser(
        description="find nucleotide diversity of a population",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    option_help = (
        ("--alignment",
         "an aligned FASTA file to create a DNA character matrix from"),
        ("--output",
         "outputting a txt file with the nucleotide_diversity value"),
    )
    for flag, help_text in option_help:
        parser.add_argument(flag, help=help_text)
    args = parser.parse_args()

    d = DnaCharacterMatrix.get(path=args.alignment, schema="fasta")

    # The output file is opened before the statistic is computed, as before.
    with open(args.output, 'w') as f:
        diversity = popgenstat.nucleotide_diversity(d, ignore_uncertain=True)
        f.write(str(diversity))
import sys

from dendropy import DnaCharacterMatrix

# Positional arguments: input NEXUS file, output stub, first column, end column.
infi, outstub = sys.argv[1], sys.argv[2]
start, stop = int(sys.argv[3]), int(sys.argv[4])

orig = DnaCharacterMatrix.get(path=infi, schema="nexus")

# Keep only the requested window of every sequence, keyed by taxon label.
window = slice(start, stop)
d = {}
for taxon, seq in orig.items():
    values = seq.values()
    d[taxon.label] = values[window]

dna = DnaCharacterMatrix.from_dict(d)
out_path = "{}.fas".format(outstub)
dna.write(path=out_path, schema="fasta")
from dendropy import Tree, DnaCharacterMatrix
import sys

d = {}

query_seq = DnaCharacterMatrix.get(path="ascomycota.fasta", schema="fasta")


def seq_dict_build(seq, label, seq_dict):
    """Add *seq* to *seq_dict* unless a stored sequence already contains it.

    Sequences are compared with gap characters ("-") removed.

    * If *seq* is contained in a longer stored sequence, nothing is added.
    * If *seq* contains a stored sequence of equal or shorter length, that
      entry is removed and *seq* is stored under *label*.
    * Otherwise *seq* is simply added.

    :param seq: object providing ``symbols_as_string()``
    :param label: key under which to store *seq*
    :param seq_dict: dict of label -> sequence, modified in place
    :return: None
    """
    new_seq = seq.symbols_as_string().replace("-", "")
    # Iterate over a snapshot of the keys because an entry may be deleted.
    for tax in list(seq_dict.keys()):
        inc_seq = seq_dict[tax].symbols_as_string().replace("-", "")
        if len(inc_seq) > len(new_seq):
            if inc_seq.find(new_seq) != -1:
                sys.stdout.write(
                    "seq {} is subsequence of {}, not added\n".format(
                        label, tax))
                return
        else:
            if new_seq.find(inc_seq) != -1:
                # Act on the dict that was passed in. The old code changed the
                # module-level ``d`` whatever ``seq_dict`` was.
                del seq_dict[tax]
                seq_dict[label] = seq
                sys.stdout.write(
                    "seq {} is supersequence of {}, {} added and {} removed\n".
                    format(label, tax, label, tax))
                return
    print(".")
    seq_dict[label] = seq
    return
#! /usr/bin/env python
from dendropy import DnaCharacterMatrix, Tree
import sys

# Command line: MATRIX MATRIX_SCHEMA TREE TREE_SCHEMA NAME
mat = sys.argv[1]
mattype = sys.argv[2]
tre = sys.argv[3]
tretype = sys.argv[4]
# Was ``sys.arg[5]``, which raised AttributeError before anything ran.
nam = sys.argv[5]

# NOTE(review): these assignments override the command-line values just read
# above -- looks like leftover example settings; confirm before removing.
mat = 'example.aln'
mattype = 'fasta'
tre = 'tree.tre'
tretype = 'newick'

d = DnaCharacterMatrix.get(path=mat, schema=mattype)
# make the taxon_namespace immutable, so the tree does not add
#   new labels...
d.taxon_namespace.is_mutable = False
tree = Tree.get(path=tre,
                schema=tretype,
                preserve_underscores=True,
                taxon_namespace=d.taxon_namespace)

# get all of the taxa associated with tips of the tree, and make sure that
#   they include all of the members of the data's taxon_namespace...
treed_taxa = [i.taxon for i in tree.leaf_nodes()]
if len(treed_taxa) != len(d.taxon_namespace):
    missing = [i.label for i in d.taxon_namespace if i not in treed_taxa]
    emf = 'Some of the taxa are not in the tree. Missing "{}"\n'
    em = emf.format('", "'.join(missing))
def test_reconcile():
    """Check reconciliation when a sequence or a tip is missing.

    Three ATT objects are built (missing sequence, missing tip, both) and
    the affected otus must carry the status "deleted in reconciliation".
    In between, the same file pairs are loaded with plain dendropy to check
    that exactly one taxon differs between tree tips and alignment.
    """
    seqaln = "tests/data/tiny_test_example/test.fas"
    seqalnmiss = "tests/data/tiny_test_example/test_missingseq.fas"
    mattype = "fasta"
    treefile = "tests/data/tiny_test_example/test.tre"
    treefilemiss = "tests/data/tiny_test_example/test_missingtip.tre"
    schema_trf = "newick"
    workdir = "tests/output/owndata"
    configfi = "example.config"
    otu_jsonfi = "tests/data/tmp/owndata/otu_dict.json"

    conf = ConfigObj(configfi, interactive=False)

    def build_att(aln_path, tree_path):
        # ATT object for one alignment/tree combination.
        return generate_ATT_from_files(seqaln=aln_path,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=tree_path,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    def assert_deleted(att, original_label):
        # Every otu with this original label must be flagged as deleted.
        for otu in att.otu_dict:
            if att.otu_dict[otu][u'^ot:originalLabel'] == original_label:
                assert att.otu_dict[otu]['^physcraper:status'] == "deleted in reconciliation"

    def single_unmatched_label(aln_path, tree_path):
        # Label of the one taxon found in only one of tree tips / alignment.
        aln = DnaCharacterMatrix.get(path=aln_path, schema=mattype)
        assert aln.taxon_namespace
        for tax in aln.taxon_namespace:
            tax.label = tax.label.replace(" ", "_")  # Forcing all spaces to underscore UGH
        tre = Tree.get(path=tree_path,
                       schema="newick",
                       preserve_underscores=True,
                       taxon_namespace=aln.taxon_namespace)
        assert aln.taxon_namespace == tre.taxon_namespace
        assert aln.taxon_namespace is tre.taxon_namespace
        treed_taxa = set(leaf.taxon for leaf in tre.leaf_nodes())
        aln_tax = set(tax for tax, seq in aln.items())
        prune = treed_taxa ^ aln_tax
        assert len(prune) == 1
        return list(prune)[0].label

    # Sequence missing from the alignment.
    data_obj = build_att(seqalnmiss, treefile)
    assert_deleted(data_obj, '2029_doronicum')

    # Tip missing from the tree.
    data_obj = build_att(seqaln, treefilemiss)
    assert_deleted(data_obj, 'S_scopolii')

    # The same mismatches, seen through plain dendropy objects.
    assert single_unmatched_label(seqalnmiss, treefile) == '2029_doronicum'
    assert single_unmatched_label(seqaln, treefilemiss) == 'S_scopolii'

    # Both a sequence and a tip are missing.
    data_obj = build_att(seqalnmiss, treefilemiss)
    assert_deleted(data_obj, '2029_doronicum')
    assert_deleted(data_obj, 'S_scopolii')
from dendropy import Tree, DnaCharacterMatrix import sys d = {} query_seq = DnaCharacterMatrix.get(path="ascomycota.fasta",schema="fasta") def seq_dict_build(seq, label, seq_dict): new_seq = seq.symbols_as_string().replace("-","") for tax in seq_dict.keys(): inc_seq = seq_dict[tax].symbols_as_string().replace("-","") if len(inc_seq) > len(new_seq): if inc_seq.find(new_seq) != -1: sys.stdout.write("seq {} is subsequence of {}, not added\n".format(label, tax)) return else: if new_seq.find(inc_seq) != -1: del d[tax] d[label] = seq sys.stdout.write("seq {} is supersequence of {}, {} added and {} removed\n".format(label, tax, label, tax)) return print (".") d[label] = seq return for taxon, seq in query_seq.items(): if len(seq.values()) > 800: seq_dict_build(seq, taxon.label, d) else:
def _reconcile_names(self): d = DnaCharacterMatrix.get(path=self.seqaln, schema=self.mattype) d.taxon_namespace.is_mutable = True "so here I need to be getting the original names off of the "
utils.setup_indexes(tree) oldest = utils.setup_dates(tree, _dates, _heterochronous) peeling = utils.get_peeling_order(tree) sequence_count = len(tree.taxon_namespace) data = {'peel': peeling, 'S': sequence_count} if _input: seqs_args = dict(schema='nexus', preserve_underscores=True) with open(_input) as fp: if next(fp).startswith('>'): seqs_args = dict(schema='fasta') dna = DnaCharacterMatrix.get(path=_input, taxon_namespace=taxa, **seqs_args) alignment_length = dna.sequence_size sequence_count = len(dna) if sequence_count != len(dna.taxon_namespace): sys.stderr.write('taxon names in trees and alignment are different') exit(2) print('Number of sequences: {} length {} '.format(sequence_count, alignment_length)) print('Model: ' + _model) tipdata, weights = utils.get_dna_leaves_partials_compressed(dna) alignment_length = len(weights) data.update({
#!/usr/bin/env python
from dendropy import DnaCharacterMatrix, Tree
import sys

# Positional command-line arguments: alignment path and schema, tree path
# and schema, plus a fifth value stored in ``nam``.
mat = sys.argv[1]
mattype = sys.argv[2]
tre = sys.argv[3]
tretype = sys.argv[4]
nam = sys.argv[5]

# NOTE(review): these hard-coded values overwrite the command-line arguments
# read just above -- they look like debugging leftovers; confirm and remove.
mat = 'example.aln'
mattype = 'fasta'
tre = 'tree.tre'
tretype = 'newick'

d = DnaCharacterMatrix.get(path=mat, schema=mattype)
# make the taxon_namespace immutable, so the tree does not add
# new labels...
d.taxon_namespace.is_mutable = False
tree = Tree.get(path=tre,
                schema=tretype,
                preserve_underscores=True,
                taxon_namespace=d.taxon_namespace)

# get all of the taxa associated with tips of the tree, and make sure that
# they include all of the members of the data's taxon_namespace...
treed_taxa = [i.taxon for i in tree.leaf_nodes()]
if len(treed_taxa) != len(d.taxon_namespace):
    missing = [i.label for i in d.taxon_namespace if i not in treed_taxa]
    emf = 'Some of the taxa are not in the tree. Missing "{}"\n'
    em = emf.format('", "'.join(missing))
required=False, type=int, help="""Parameters for Stan script""") arg = parser.parse_args() my_path = os.path.split(os.path.realpath(__file__))[0] taxa = dendropy.TaxonNamespace() trees = dendropy.TreeList.get(file=arg.tree, schema="newick", preserve_underscores=True, tree_offset=0, taxon_namespace=taxa) dna = DnaCharacterMatrix.get(file=arg.input, schema="fasta") alignment_length = dna.sequence_size sequence_count = len(dna) print('Number of sequences: {} length {} '.format(sequence_count, alignment_length)) print('Model: ' + arg.model) tipdata, weights = phylo.get_dna_leaves_partials_compressed(dna) alignment_length = len(weights) for t in trees: t.encode_bipartitions(collapse_unrooted_basal_bifurcation=False) count = 1 bip = {}
def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards."""
    # NOTE(review): unpickling executes arbitrary code; only load checkpoint
    # files written by a trusted earlier run.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _blast_and_align(scraper, shared_blast_folder):
    """Run one blast / read / de-duplicate / align round on ``scraper``."""
    if shared_blast_folder:
        scraper.blast_subdir = shared_blast_folder
    scraper.run_blast_wrapper(delay=14)
    scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()


def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """Continue a run from its checkpoint, or build and run a new analysis.

    This is the wrapper function to start a PhyScraper run with a study and
    tree id from Open Tree of Life. Rounds are repeated for as long as
    ``scraper.repeat == 1``.

    Parameters:
        study_id: study identifier handed to ``generate_ATT_from_phylesystem``.
        tree_id: tree identifier handed to ``generate_ATT_from_phylesystem``.
        seqaln: path of the alignment file.
        mattype: schema name of the alignment file (passed to dendropy).
        workdir: folder for the analysis files; if it already contains
            ``att_checkpoint.p`` the data object is reloaded from it instead
            of being rebuilt.
        configfi: path to the config file.
        ingroup_mrca: identifier of the clade of interest, passed through to
            ``generate_ATT_from_phylesystem``; optional.
        shared_blast_folder: optional folder with blast searches shared
            across runs. When given it is used as the scraper's
            ``blast_subdir`` and as ``blast_dir`` when reading results.

    Returns:
        The ``PhyscraperScrape`` object after the last round.
    """
    debug("Debugging mode is on")

    # read the config file into a configuration object
    conf = ConfigObj(configfi, interactive=False)
    checkpoint = "{}/att_checkpoint.p".format(workdir)
    if os.path.isfile(checkpoint):
        sys.stdout.write("Reloading data object from pickle file\n")
        data_obj = _load_pickle(checkpoint)
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc,
                                                 ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold.
        # This is particularly important when using loci that have been
        # de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

    # Mapping identifiers between OpenTree and NCBI requires an identifier
    # dict object.
    if os.path.isfile(conf.id_pickle):
        sys.stdout.write("Reloading id dicts from {}\n".format(conf.id_pickle))
        ids = _load_pickle(conf.id_pickle)
    else:
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        ids.dump()

    # Now combine the data, the ids, and the configuration into a single
    # physcraper scrape object
    scraper = PhyscraperScrape(data_obj, ids)

    # Any falsy value (e.g. an empty string) means "no shared folder".
    shared_blast_folder = shared_blast_folder or None

    # run the analyses
    _blast_and_align(scraper, shared_blast_folder)
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        _blast_and_align(scraper, shared_blast_folder)
    return scraper
import numpy as np
import numpy.linalg as la
from dendropy import Tree, DnaCharacterMatrix
import myPhylo

# Directory that holds every input file read below.
data_dir = '/home/nehleh/Documents/0_Research/PhD/Data/simulationdata/recombination/exampledataset'

tree_path = data_dir + '/exampledataset_RAxML_bestTree'
tree = Tree.get_from_path(tree_path, 'newick')

# Pass the path rather than an ``open()`` handle that is never closed.
alignment = DnaCharacterMatrix.get(
    path=data_dir + "/wholegenome.fasta",
    schema="fasta")

tree2 = Tree.get_from_path(data_dir + '/RerootTree_node12', 'newick')

# Inputs of the GTR model (presumably base frequencies and exchange rates;
# confirm against myPhylo.GTR_model).
pi = [0.317, 0.183, 0.367, 0.133]
rates = [0.000100, 0.636612, 2.547706, 0.000100, 2.151395]
GTR_sample = myPhylo.GTR_model(rates, pi)

# Work with the first element returned for the alignment.
column = myPhylo.get_DNA_fromAlignment(alignment)
dna = column[0]
myPhylo.set_index(tree, dna)

print("Original tree:::::::::::::::")
print(tree.as_string(schema='newick'))
print(tree.as_ascii_plot())

LL_normal = myPhylo.computelikelihood(tree, dna, GTR_sample)
W_LL_normal = myPhylo.wholeAlignmentLikelihood(tree, alignment, GTR_sample)
ncbi_to_ott = {} fi = open(ott_ncbi) #pickle meeeee for lin in fi: lii = lin.split(",") ncbi_to_ott[int(lii[1])] = int(lii[0]) gi_ncbi_map = {} if os.path.isfile("id_map.txt"): fi = open("id_map.txt") for lin in fi: gi_ncbi_map[int(lin.split(",")[0])] = lin.split(",")[1] orig_seq = DnaCharacterMatrix.get(path="accs", schema="fasta") #prune out identical sequences mapped_taxon_ids = open("id_map.txt", "a") stops = [] for taxon, seq in orig_seq.items(): gi = int(taxon.label.split('|')[1]) if gi in gi_ncbi_map.keys(): try: taxon.label = ncbi_to_ott[int(gi_ncbi_map[gi])] except: taxon.label = "ncbi_id_{}".format(int(gi_ncbi_map[gi])) else: try: ncbi_id = int(
# Use OpenTree phylesystem identifiers to get study and tree
study_id = "pg_873"
tree_id = "tree1679"
seqaln = "tests/data/minitest.fas"
mattype = "fasta"
workdir = "tests/output/opentree"
configfi = "tests/data/remotencbi.config"

sys.stdout.write("\nTesting 'opentree scrape (1 round)'\n")
conf = physcraper.ConfigObj(configfi, interactive=False)
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("1. {}".format(conf.email))

aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
data_obj = physcraper.generate_ATT_from_phylesystem(aln=aln,
                                                    workdir=workdir,
                                                    study_id=study_id,
                                                    tree_id=tree_id,
                                                    phylesystem_loc=conf.phylesystem_loc)

ids = physcraper.IdDicts(conf, workdir=workdir)
print("3. {}".format(ids.config.email))

data_obj.prune_short()
"""prunes to 1 seq per spp, and fills in missing data for missing spp, in preparation for concanteneation, return dict to be made in char matrix""" aln_dict = {} tmp_dict = {} for taxon, seq in physcraper_obj.aln.items(): aln_dict[taxon.label] = seq seqlen = len(seq) #should all be same bc aligned for spp_name in spp_dict.keys(): try: otu = random.choice(spp_dict[spp_name]) tmp_dict[spp_name] = aln_dict[otu] except KeyError: tmp_dict[spp_name] = "-" * seqlen return tmp_dict aln1 = DnaCharacterMatrix.from_dict(arbitrary_prune_fill(spp_to_otu1, gene1)) aln2 = DnaCharacterMatrix.from_dict(arbitrary_prune_fill(spp_to_otu2, gene2), taxon_namespace = aln1.taxon_namespace) concat = DnaCharacterMatrix.concatenate([aln1,aln2]) concat.write(path="concat.fas", schema="fasta") #Open the two pyscraper objects #Merge the alignements on OTT_ID? #How to force/missing data ...