def execute_rscript(self, script):
    """
    Run ``script`` through ``Rscript --vanilla -`` and collect flagged results.

    The script text is sent to the R process on standard input. Every line
    of the R standard output that starts with
    ``Rcalculator.RESULT_FLAG_LEADER`` is parsed as ``<key> = <value>``.

    Parameters
    ----------
    script : str
        R source code to execute.

    Returns
    -------
    results : dict
        Maps each reported key to its value as a float, or to the string
        "NA" when the value cannot be parsed as a float.

    Notes
    -----
    If R exits with a non-zero status, the script and the R standard error
    are printed and the interpreter exits (``sys.exit``) with R's return
    code.
    """
    cmd = ["Rscript", "--vanilla", "-"]
    p = subprocess.Popen(
        cmd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout, stderr = processio.communicate(p, script)
    if p.returncode != 0:
        # Echo the failing script and R's diagnostics before bailing out.
        print(script)
        for row in stderr.split("\n"):
            print("# {}".format(row))
        sys.exit(p.returncode)
    leader = Rcalculator.RESULT_FLAG_LEADER
    results = {}
    for line in stdout.split("\n"):
        if not line.startswith(leader):
            continue
        parts = line[len(leader):].split("=")
        assert len(parts) == 2
        key = parts[0].strip()
        try:
            value = float(parts[1].strip())
        except ValueError:
            # Non-numeric values are reported as the string "NA".
            value = "NA"
        results[key] = value
    return results
def prune_taxa_from_trees(trees, taxa, paup_path='paup'):
    """
    Drops Taxon objects given in container ``taxa`` from TreeList ``trees``.

    The trees are written to a temporary NEXUS file, PAUP* is run with a
    ``delete ... / prune`` command, and the trees PAUP* saves are read back.

    Parameters
    ----------
    trees : TreeList
        Trees to prune. Must provide ``write_to_stream`` and
        ``taxon_namespace``.
    taxa : iterable of Taxon
        Taxa to remove; each is looked up in ``trees.taxon_namespace``.
    paup_path : str
        Command used to invoke PAUP*.

    Returns
    -------
    TreeList
        The pruned trees, read using ``trees.taxon_namespace``.
    """
    # Context managers guarantee that both temporary files are closed (and
    # therefore removed) even if PAUP* or the tree parser raises.
    with tempfile.NamedTemporaryFile("w", delete=True) as tf, \
            tempfile.NamedTemporaryFile("w+", delete=True) as output_tree_file_handle:
        trees.write_to_stream(tf, schema='nexus')
        tf.flush()
        output_tree_filepath = output_tree_file_handle.name
        # Shift the 0-based namespace indices to the 1-based numbers used in
        # the ``delete`` command.
        tax_idxs = " ".join(
            str(trees.taxon_namespace.index(t) + 1) for t in taxa)
        paup_template = """\
    set warnreset=no;
    exe %s;
    gett file=%s storebrlens=yes;
    delete %s / prune;
    savetrees file=%s format=nexus brlens=user taxablk=yes maxdecimals=20;
    """ % (tf.name, tf.name, tax_idxs, output_tree_filepath)
        paup_run = subprocess.Popen(
            ['%s -n' % paup_path],
            shell=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        processio.communicate(paup_run, paup_template)
        return dendropy.TreeList.get_from_path(
            output_tree_filepath,
            "nexus",
            taxon_namespace=trees.taxon_namespace)
def generate(self, trees, dataset=None, taxon_namespace=None, input_sequences=None, **kwargs):
    """
    Run Seq-Gen on ``trees`` and read the sequences it produces.

    Parameters
    ----------
    trees : tree collection
        Written to the Seq-Gen input file in Newick format (rooting and
        internal node labels suppressed).
    dataset : DataSet or None
        Destination for the generated data. If None, a new
        ``dendropy.DataSet`` is created from ``kwargs``.
    taxon_namespace : TaxonNamespace or None
        Falls back to ``trees.taxon_namespace`` when None. It is attached
        only to a newly created dataset.
    input_sequences : character matrix or None
        If given, written in PHYLIP format ahead of the trees, followed by a
        line holding the number of trees.
    **kwargs
        Passed to ``dendropy.DataSet`` when a new dataset is created.

    Returns
    -------
    DataSet
        ``dataset`` after reading the Seq-Gen standard output as NEXUS.

    Raises
    ------
    RuntimeError
        If Seq-Gen writes to standard error or exits with a non-zero code.
    """
    command = self._compose_arguments()
    with self.get_tempfile() as seqgen_input:
        if input_sequences is not None:
            input_sequences.write_to_stream(seqgen_input, schema="phylip")
            seqgen_input.write("{}\n".format(len(trees)))
        trees.write_to_stream(
            seqgen_input,
            "newick",
            suppress_rooting=True,
            suppress_internal_node_labels=True)
        seqgen_input.flush()
        command.append(seqgen_input.name)
        proc = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        out, err = processio.communicate(proc)
        if err or proc.returncode != 0:
            raise RuntimeError("Seq-gen error: %s" % err)
        if taxon_namespace is None:
            taxon_namespace = trees.taxon_namespace
        if dataset is None:
            dataset = dendropy.DataSet(**kwargs)
            if taxon_namespace is not None:
                dataset.attach_taxon_namespace(taxon_namespace)
        dataset.read(data=out, schema="nexus")
        return dataset
def execute_rscript(self, script):
    """
    Run ``script`` through ``Rscript --vanilla -`` and collect flagged results.

    The script text is sent to the R process on standard input. Every line
    of the R standard output that starts with
    ``Rcalculator.RESULT_FLAG_LEADER`` is parsed as ``<key> = <value>``.

    Parameters
    ----------
    script : str
        R source code to execute.

    Returns
    -------
    results : dict
        Maps each reported key to its value as a float, or to the string
        "NA" when the value cannot be parsed as a float.

    Notes
    -----
    If R exits with a non-zero status, the script and the R standard error
    are printed and the interpreter exits (``sys.exit``) with R's return
    code.
    """
    cmd = ["Rscript", "--vanilla", "-"]
    p = subprocess.Popen(
        cmd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout, stderr = processio.communicate(p, script)
    if p.returncode != 0:
        # Echo the failing script and R's diagnostics before bailing out.
        print(script)
        for row in stderr.split("\n"):
            print("# {}".format(row))
        sys.exit(p.returncode)
    leader = Rcalculator.RESULT_FLAG_LEADER
    results = {}
    for line in stdout.split("\n"):
        if not line.startswith(leader):
            continue
        parts = line[len(leader):].split("=")
        assert len(parts) == 2
        key = parts[0].strip()
        try:
            value = float(parts[1].strip())
        except ValueError:
            # Non-numeric values are reported as the string "NA".
            value = "NA"
        results[key] = value
    return results
def estimate_ultrametric_tree(char_matrix, topology_tree=None, paup_path=PAUP_PATH):
    """
    Estimate a clock-constrained tree for ``char_matrix`` using PAUP*.

    Parameters
    ----------
    char_matrix : character matrix
        Data to analyze; must provide ``write_to_stream`` and
        ``taxon_namespace``.
    topology_tree : Tree or None
        If None, the work is delegated to ``estimate_tree`` with the "nj"
        criterion and the clock commands appended as post-estimation
        commands. Otherwise this tree is loaded into PAUP* with ``gettrees``
        before the clock commands are run.
    paup_path : str
        Command used to invoke PAUP*.

    Returns
    -------
    Tree
        The tree produced by ``estimate_tree`` or read from the file PAUP*
        saves.
    """
    post_est_commands = """\
    set crit=likelihood;
    root rootmethod=midpoint;
    lset userbr=no nst = 1 basefreq = eq rates = eq clock =yes;
    lscore;
    """
    if topology_tree is None:
        # Forward ``paup_path`` so a caller-supplied executable is honored on
        # this branch as well.
        return estimate_tree(
            char_matrix,
            tree_est_criterion="nj",
            num_states=2,
            unequal_base_freqs=False,
            gamma_rates=False,
            prop_invar=False,
            extra_post_est_commands=post_est_commands,
            paup_path=paup_path)
    paup_block = """\
    set warnreset=no;
    exe '%(data_file)s';
    gettrees file= '%(intree_file)s' warntree=no;
    %(post_est_commands)s;
    savetrees file=%(outtree_file)s format=nexus root=yes brlens=yes taxablk=yes maxdecimals=20;
    """
    # Context managers close (and thereby remove) every temporary file even
    # when PAUP* or the tree parser raises.
    with tempfile.NamedTemporaryFile("w", delete=True) as cf, \
            tempfile.NamedTemporaryFile("w", delete=True) as input_tree_file_handle, \
            tempfile.NamedTemporaryFile("w+", delete=True) as output_tree_file_handle:
        char_matrix.write_to_stream(cf, schema='nexus')
        cf.flush()
        topology_tree.write_to_stream(input_tree_file_handle, schema="nexus")
        input_tree_file_handle.flush()
        output_tree_filepath = output_tree_file_handle.name
        paup_args = {
            "data_file": cf.name,
            "intree_file": input_tree_file_handle.name,
            "post_est_commands": post_est_commands,
            "outtree_file": output_tree_filepath,
        }
        paup_run = subprocess.Popen(
            ['%s -n' % paup_path],
            shell=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        processio.communicate(paup_run, paup_block % paup_args)
        return dendropy.Tree.get_from_path(
            output_tree_filepath,
            "nexus",
            taxon_namespace=char_matrix.taxon_namespace)
def compare_one_to_many(
        self,
        ref_tree,
        comparison_trees,
        command_args=None,
        newick_output_kwargs=None,
        ):
    """
    Compare ``ref_tree`` to each tree in ``comparison_trees``.

    Parameters
    ----------
    ref_tree : |Tree|
        A |Tree| object to be compared to every tree in
        ``comparison_trees``.
    comparison_trees : iterable of |Tree|
        An (ordered) iterable of trees to which ``ref_tree`` should be
        compared. One-shot iterables such as generators are accepted.
    command_args : list or None
        An iterable of (string) arguments to be passed to the program.
    newick_output_kwargs : dict or None
        A collection of keyword arguments to pass to the tree string
        composition routines (that will generate the tree strings to be
        used as input to rspr).

    Returns
    -------
    scores : list[numeric]
        A list of the SPR distances from ``ref_tree`` to
        ``comparison_trees``, in order of the trees given.

    Raises
    ------
    AssertionError
        If the number of comma-separated fields in the program output is
        not one more than the number of comparison trees.
    """
    if newick_output_kwargs is None:
        newick_output_kwargs = {}
    # Materialize once: the trees are iterated to build the input and counted
    # again to validate the output, which a generator could not support.
    comparison_trees = list(comparison_trees)
    tf = textprocessing.StringIO()
    ref_tree.write(file=tf, schema="newick", **newick_output_kwargs)
    for t in comparison_trees:
        t.write(file=tf, schema="newick", **newick_output_kwargs)
    command = ["rspr"]  # TODO: command path as instance attribute
    command.extend(["-pairwise", "0", "1"])
    if command_args is not None:
        command.extend(command_args)
    p = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout, stderr = processio.communicate(p, commands=tf.getvalue())
    result_fields = stdout.strip("\n").split(",")
    assert len(result_fields) == 1 + len(comparison_trees), \
        "Expecting length {} + 1 for results, but received {}: {}".format(
            len(comparison_trees), len(result_fields), result_fields)
    # The first field is skipped; the remaining ones are the distances to
    # each comparison tree.
    return [int(v) for v in result_fields[1:]]
def estimate_ultrametric_tree(char_matrix, topology_tree=None, paup_path=PAUP_PATH):
    """
    Estimate a clock-constrained tree for ``char_matrix`` using PAUP*.

    Parameters
    ----------
    char_matrix : character matrix
        Data to analyze; must provide ``write_to_stream`` and
        ``taxon_namespace``.
    topology_tree : Tree or None
        If None, the work is delegated to ``estimate_tree`` with the "nj"
        criterion and the clock commands appended as post-estimation
        commands. Otherwise this tree is loaded into PAUP* with ``gettrees``
        before the clock commands are run.
    paup_path : str
        Command used to invoke PAUP*.

    Returns
    -------
    Tree
        The tree produced by ``estimate_tree`` or read from the file PAUP*
        saves.
    """
    post_est_commands = """\
    set crit=likelihood;
    root rootmethod=midpoint;
    lset userbr=no nst = 1 basefreq = eq rates = eq clock =yes;
    lscore;
    """
    if topology_tree is None:
        # Forward ``paup_path`` so a caller-supplied executable is honored on
        # this branch as well.
        return estimate_tree(
            char_matrix,
            tree_est_criterion="nj",
            num_states=2,
            unequal_base_freqs=False,
            gamma_rates=False,
            prop_invar=False,
            extra_post_est_commands=post_est_commands,
            paup_path=paup_path)
    paup_block = """\
    set warnreset=no;
    exe '%(data_file)s';
    gettrees file= '%(intree_file)s' warntree=no;
    %(post_est_commands)s;
    savetrees file=%(outtree_file)s format=nexus root=yes brlens=yes taxablk=yes maxdecimals=20;
    """
    # Context managers close (and thereby remove) every temporary file even
    # when PAUP* or the tree parser raises.
    with tempfile.NamedTemporaryFile("w", delete=True) as cf, \
            tempfile.NamedTemporaryFile("w", delete=True) as input_tree_file_handle, \
            tempfile.NamedTemporaryFile("w+", delete=True) as output_tree_file_handle:
        char_matrix.write_to_stream(cf, schema='nexus')
        cf.flush()
        topology_tree.write_to_stream(input_tree_file_handle, schema="nexus")
        input_tree_file_handle.flush()
        output_tree_filepath = output_tree_file_handle.name
        paup_args = {
            "data_file": cf.name,
            "intree_file": input_tree_file_handle.name,
            "post_est_commands": post_est_commands,
            "outtree_file": output_tree_filepath,
        }
        paup_run = subprocess.Popen(
            ['%s -n' % paup_path],
            shell=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        processio.communicate(paup_run, paup_block % paup_args)
        return dendropy.Tree.get_from_path(
            output_tree_filepath,
            "nexus",
            taxon_namespace=char_matrix.taxon_namespace)
def muscle_align(char_matrix, muscle_args=None, muscle_path='muscle'):
    """
    Align ``char_matrix`` with MUSCLE and return the aligned matrix.

    The matrix is sent to MUSCLE as FASTA on standard input, and the FASTA
    written to standard output is parsed into a new matrix of the same class
    that shares ``char_matrix.taxon_namespace``.

    Parameters
    ----------
    char_matrix : character matrix
        Sequences to align.
    muscle_args : list or None
        Extra command-line arguments appended after the executable.
    muscle_path : str
        MUSCLE executable.

    Raises
    ------
    Exception
        With the captured standard error, if MUSCLE exits non-zero.
    """
    command = [muscle_path]
    if muscle_args:
        command = command + muscle_args
    proc = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    aligned_fasta, error_text = processio.communicate(
        proc, char_matrix.as_string("fasta"))
    if proc.returncode:
        raise Exception(error_text)
    matrix_type = char_matrix.__class__
    return matrix_type.get_from_string(
        aligned_fasta,
        "fasta",
        taxon_namespace=char_matrix.taxon_namespace)
def estimate_niche_evolution_rate(self, trees):
    """
    Run a BayesTraits analysis of habitat codes for each tree and print it.

    For every tree returned by ``self.tree_postprocessor.process_trees``,
    the tree is written to ``self.tree_file_name`` and a tab-separated data
    file (taxon label, concatenated state symbols) is written to
    ``self.data_file_name``. BayesTraits is then run on the two files.

    Parameters
    ----------
    trees : collection of trees
        Passed through ``self.tree_postprocessor.process_trees`` first.

    Returns
    -------
    None
        The parsed result dictionary of each tree is printed, not returned.
    """
    trees = self.tree_postprocessor.process_trees(trees)
    for tree in trees:
        taxa = tree.poll_taxa()
        # A taxon has state ``k`` (1-based) when position ``k - 1`` of its
        # habitat code is "1".
        taxon_state_set_map = {}
        for taxon in taxa:
            taxon_state_set_map[taxon] = set(
                str(idx + 1)
                for idx, flag in enumerate(taxon.habitat_code)
                if flag == "1")
        tree.taxon_namespace = dendropy.TaxonNamespace(taxa)
        for nd in tree:
            # BayesTraits gets confused with internal taxon labels,
            # especially those with periods etc.
            nd.label = None
        tree.write_to_path(self.tree_file_name, "nexus", translate_tree_taxa=True)
        name_to_symbol_map = postprocess.NameToSymbolMap()
        # ``with`` closes the data file even if a write or lookup fails.
        with open(self.data_file_name, "w") as dataf:
            for taxon in taxa:
                states = sorted(
                    [name_to_symbol_map[s] for s in taxon_state_set_map[taxon]])
                row = [taxon.label, "".join(states)]
                dataf.write("{}\n".format("\t".join(row)))
        # NOTE(review): the commands are handed over as a list; presumably
        # ``processio.communicate`` joins them -- confirm.
        bt_commands = [
            "1",  # multstate
            "1",  # ml; 2 == mcmc
            "restrictall q{}{}".format(
                name_to_symbol_map.SYMBOLS[0],
                name_to_symbol_map.SYMBOLS[1]),
            "run",
        ]
        p = subprocess.Popen(
            ["BayesTraits", self.tree_file_name, self.data_file_name],
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
        )
        stdout, stderr = processio.communicate(p, bt_commands)
        stdout = stdout.split("\n")
        # Field names come from the third-to-last output line and values
        # from the second-to-last.
        result = dict(zip(stdout[-3].split("\t"), stdout[-2].split("\t")))
        del result['']
        print(result)
def muscle_align(char_matrix, muscle_args=None, muscle_path='muscle'):
    """
    Run MUSCLE on ``char_matrix`` and return the resulting alignment.

    Parameters
    ----------
    char_matrix : character matrix
        Serialized as FASTA and piped to MUSCLE on standard input.
    muscle_args : list or None
        Additional arguments placed after ``muscle_path`` on the command
        line.
    muscle_path : str
        Executable to invoke.

    Returns
    -------
    character matrix
        New instance of ``char_matrix``'s class, parsed from the FASTA that
        MUSCLE writes to standard output and bound to
        ``char_matrix.taxon_namespace``.

    Raises
    ------
    Exception
        Carrying the standard error text when the return code is non-zero.
    """
    invocation = [muscle_path]
    if muscle_args:
        invocation = invocation + muscle_args
    pipes = dict(
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    muscle = subprocess.Popen(invocation, **pipes)
    out, err = processio.communicate(muscle, char_matrix.as_string("fasta"))
    if muscle.returncode:
        raise Exception(err)
    aligned = char_matrix.__class__.get_from_string(
        out, "fasta", taxon_namespace=char_matrix.taxon_namespace)
    return aligned
def execute_analysis(self, config_path, tree_path, is_use_decimal_value_type):
    """
    Run the speciation-completion-rate estimation script and load its output.

    Parameters
    ----------
    config_path : str
        Passed to the script with ``-c``.
    tree_path : str
        Passed to the script with ``-t``.
    is_use_decimal_value_type : bool
        If true, ``--underflow-protect`` is appended to the command line.

    Returns
    -------
    Whatever ``self._load_results`` returns for the captured standard
    output.
    """
    script_path = os.path.join(
        _pathmap.BIN_DIR,
        "delineate-estimate-speciation-completion-rate.py")
    command = [script_path, "-c", config_path, "-t", tree_path, "-I", "-i"]
    if is_use_decimal_value_type:
        command.append("--underflow-protect")
    proc = subprocess.Popen(command, stdout=subprocess.PIPE)
    output, _ = processio.communicate(proc)
    return self._load_results(output)
def estimate_niche_evolution_rate(self, trees):
    """
    Run a BayesTraits analysis of habitat codes for each tree and print it.

    For every tree returned by ``self.tree_postprocessor.process_trees``,
    the tree is written to ``self.tree_file_name`` and a tab-separated data
    file (taxon label, concatenated state symbols) is written to
    ``self.data_file_name``. BayesTraits is then run on the two files.

    Parameters
    ----------
    trees : collection of trees
        Passed through ``self.tree_postprocessor.process_trees`` first.

    Returns
    -------
    None
        The parsed result dictionary of each tree is printed, not returned.
    """
    trees = self.tree_postprocessor.process_trees(trees)
    for tree in trees:
        taxa = tree.poll_taxa()
        # A taxon has state ``k`` (1-based) when position ``k - 1`` of its
        # habitat code is "1".
        taxon_state_set_map = {}
        for taxon in taxa:
            taxon_state_set_map[taxon] = set(
                str(idx + 1)
                for idx, flag in enumerate(taxon.habitat_code)
                if flag == "1")
        tree.taxon_namespace = dendropy.TaxonNamespace(taxa)
        for nd in tree:
            # BayesTraits gets confused with internal taxon labels,
            # especially those with periods etc.
            nd.label = None
        tree.write_to_path(self.tree_file_name, "nexus", translate_tree_taxa=True)
        name_to_symbol_map = postprocess.NameToSymbolMap()
        # ``with`` closes the data file even if a write or lookup fails.
        with open(self.data_file_name, "w") as dataf:
            for taxon in taxa:
                states = sorted(
                    [name_to_symbol_map[s] for s in taxon_state_set_map[taxon]])
                row = [taxon.label, "".join(states)]
                dataf.write("{}\n".format("\t".join(row)))
        # NOTE(review): the commands are handed over as a list; presumably
        # ``processio.communicate`` joins them -- confirm.
        bt_commands = [
            "1",  # multstate
            "1",  # ml; 2 == mcmc
            "restrictall q{}{}".format(
                name_to_symbol_map.SYMBOLS[0],
                name_to_symbol_map.SYMBOLS[1]),
            "run",
        ]
        p = subprocess.Popen(
            ["BayesTraits", self.tree_file_name, self.data_file_name],
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
        )
        stdout, stderr = processio.communicate(p, bt_commands)
        stdout = stdout.split("\n")
        # Field names come from the third-to-last output line and values
        # from the second-to-last.
        result = dict(zip(stdout[-3].split("\t"), stdout[-2].split("\t")))
        del result['']
        print(result)
def _run_vcs(self, cmd):
    """
    Run the version-control executable with ``cmd`` inside the repository.

    Parameters
    ----------
    cmd : str or list of str
        Arguments for ``self.vcs_app_path``. A string is appended to the
        executable path and run through the shell. A list is run directly
        as an argument vector; the caller's list is not modified.

    Returns
    -------
    (retcode, stdout, stderr) : tuple
        Return code and captured output of the process. If the process
        cannot be started (``OSError``), returns
        ``(-999, "", <error message>)``.
    """
    run_in_shell = textprocessing.is_str_type(cmd)
    if run_in_shell:
        cmd = self.vcs_app_path + " " + cmd
    else:
        # Build a new list instead of ``cmd.insert(0, ...)`` so that the
        # caller's list is left untouched.
        cmd = [self.vcs_app_path] + list(cmd)
    try:
        # A sequence combined with ``shell=True`` passes everything after
        # the first item to the shell itself (POSIX), dropping the VCS
        # arguments; only use the shell for the string form.
        p = subprocess.Popen(
            cmd,
            shell=run_in_shell,
            cwd=os.path.abspath(self.repo_path),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        stdout, stderr = processio.communicate(p)
        retcode = p.returncode
    except OSError as e:
        return -999, "", str(e)
    return retcode, stdout, stderr
def estimate_trait_transition_rates_using_geiger(self, tree, profile_results, trait_names):
    """
    Estimate one transition rate per trait with R/geiger ``fitDiscrete``.

    An R script is written to ``self.commands_file_name`` and run with
    ``R -f``. It reads the tree from ``self.tree_file_name`` and the traits
    from ``self.traits_data_file_name``, and prints one rate per trait.

    Parameters
    ----------
    tree : tree
        Passed to ``self.create_geiger_traits_data``.
    profile_results : dict
        Updated in place with the key
        ``"trait.<name>.est.transition.rate"`` for every trait.
    trait_names : sequence of str
        Names of the traits; trait ``i`` is read from column ``i + 1`` of
        the traits file.

    Raises
    ------
    Exception
        If R exits non-zero and ``self.fail_on_estimation_error`` is true.
        Otherwise every rate is recorded as "NA".
    """
    self.create_geiger_traits_data(tree=tree, num_trait_types=len(trait_names))
    rcmds = [
        "library(parallel, quietly=T)",
        "library(ape, quietly=T)",
        "library(geiger, quietly=T)",
        "tree1 <- read.nexus('{}')".format(self.tree_file_name),
        "traits <- read.csv('{}', header=F, row.names=1)".format(
            self.traits_data_file_name),
    ]
    for trait_idx, _ in enumerate(trait_names):
        trait_var = "trait{}".format(trait_idx)
        rcmds.append("{} <- round(traits[,{}])".format(trait_var, trait_idx + 1))
        rcmds.append("names({}) <- row.names(traits)".format(trait_var))
        rcmds.append("m = fitDiscrete(tree1, {})".format(trait_var))
        rcmds.append(r"cat(c(m$opt$q12), sep='\n')")
    # ``with`` flushes and closes the script before R reads it, and also
    # closes it if the write fails.
    with open(self.commands_file_name, "w") as rfile:
        rfile.write("\n".join(rcmds) + "\n")
    shell_cmd = [
        "R", "--vanilla", "--no-save", "--slave", "--silent",
        "-f", self.commands_file_name,
    ]
    p = subprocess.Popen(shell_cmd, stdout=subprocess.PIPE)
    stdout, stderr = processio.communicate(p)
    if p.returncode != 0:
        if self.fail_on_estimation_error:
            raise Exception(p.returncode)
        rows = ["NA"] * len(trait_names)
    else:
        rows = [row.strip() for row in stdout.split("\n")]
        rows = [float(row) for row in rows if row]
        assert len(rows) == len(trait_names), rows
    for field_name, rate in zip(trait_names, rows):
        profile_results["trait.{}.est.transition.rate".format(field_name)] = rate
def estimate_dec_rates_lagrange(self, tree, profile_results, **kwargs):
    """
    Estimate dispersal and extinction rates by running ``lagrange_cpp``.

    The tree is written to ``self.newick_tree_file_name``, the geography
    file to ``self.geography_data_file_name``, and a two-line configuration
    naming both to ``self.commands_file_name``.

    Parameters
    ----------
    tree : tree
        Tree to analyze.
    profile_results : dict
        Updated in place with the keys ``"lagrange.dec.dispersal.rate"`` and
        ``"lagrange.dec.extinction.rate"``.
    **kwargs
        Accepted but not used.

    Raises
    ------
    OSError
        If the ``lagrange_cpp`` process cannot be started.
    Exception
        If the run fails or its output cannot be parsed and
        ``self.fail_on_estimation_error`` is true. Otherwise both rates are
        recorded as "NA".
    """
    tree.write_to_path(
        self.newick_tree_file_name,
        "newick",
        suppress_rooting=True,
    )
    self.create_lagrangecpp_geography_file(
        tree=tree, output_path=self.geography_data_file_name)
    # ``with`` closes the configuration file even if a write fails.
    with open(self.commands_file_name, "w") as configf:
        configf.write("treefile = {}\n".format(self.newick_tree_file_name))
        configf.write("datafile = {}\n".format(self.geography_data_file_name))
    shell_cmd = ["lagrange_cpp", self.commands_file_name]
    try:
        p = subprocess.Popen(
            shell_cmd,
            stdout=subprocess.PIPE,
        )
    except OSError:
        raise OSError("Failed to execute command: {}".format(
            " ".join(shell_cmd)))
    stdout, stderr = processio.communicate(p)
    if p.returncode != 0:
        if self.fail_on_estimation_error:
            raise Exception(p.returncode)
        profile_results["lagrange.dec.dispersal.rate"] = "NA"
        profile_results["lagrange.dec.extinction.rate"] = "NA"
        return
    match = ArchipelagoProfiler.LAGRANGE_CPP_EXTRACT_PATTERN.match(stdout)
    if not match:
        if self.fail_on_estimation_error:
            raise Exception(
                "Failed to extract results from Lagrange estimation")
        profile_results["lagrange.dec.dispersal.rate"] = "NA"
        profile_results["lagrange.dec.extinction.rate"] = "NA"
        return
    results = match.groups(1)
    profile_results["lagrange.dec.dispersal.rate"] = float(results[0])
    profile_results["lagrange.dec.extinction.rate"] = float(results[1])
def estimate_trait_transition_rates_using_bayestraits(
        self, tree, profile_results, trait_names):
    """
    Estimate a single transition rate per trait by running BayesTraits.

    For each trait, the trait data file is regenerated at
    ``self.traits_data_file_name`` and BayesTraits is run on it together
    with ``self.tree_file_name``, with all rates restricted to the rate
    named after the first two returned symbols.

    Parameters
    ----------
    tree : tree
        Passed to ``self.create_bayestraits_traits_data``.
    profile_results : dict
        Updated in place with ``"trait.<name>.est.transition.rate"``.
    trait_names : iterable of str
        Names of the traits, in trait-index order.

    Raises
    ------
    Exception
        If the results header is not found in the output and
        ``self.fail_on_estimation_error`` is true. Otherwise the rate is
        recorded as "NA".
    """
    header_prefix = "Tree No\tLh\tq"
    for trait_idx, trait_name in enumerate(trait_names):
        symbols = self.create_bayestraits_traits_data(
            tree, trait_idx, output_path=self.traits_data_file_name)
        master_rate = "q{}{}".format(symbols[0], symbols[1])
        script = "\n".join([
            "1",  # multstate
            "1",  # ml; 2 == mcmc
            "restrictall {}".format(master_rate),
            "run",
        ])
        proc = subprocess.Popen(
            ["BayesTraits", self.tree_file_name, self.traits_data_file_name],
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
        )
        output, _ = processio.communicate(proc, script)
        lines = output.split("\n")
        # Index of the first line that looks like the results header; the
        # values are on the line right after it.
        header_idx = next(
            (idx for idx, line in enumerate(lines)
             if line.startswith(header_prefix)),
            None)
        if header_idx is not None:
            fields = dict(zip(
                lines[header_idx].split("\t"),
                lines[header_idx + 1].split("\t")))
            rate = float(fields[master_rate])
        elif self.fail_on_estimation_error:
            raise Exception(
                "Failed to extract results from BayesTraits estimation"
            )
        else:
            rate = "NA"
        profile_results["trait.{}.est.transition.rate".format(trait_name)] = rate
def estimate_pure_dispersal_weight(self, tree, profile_results):
    """
    Estimate the area transition rate (``q01``) by running BayesTraits.

    The geography file is written to ``self.geography_data_file_name`` and
    BayesTraits is run on it together with ``self.tree_file_name``.

    Parameters
    ----------
    tree : tree
        Passed to ``self.create_bayestraits_geography_file``.
    profile_results : dict
        Updated in place with the key ``"area.est.transition.rate"``.

    Raises
    ------
    Exception
        If the results header is not found in the output and
        ``self.fail_on_estimation_error`` is true. Otherwise the rate is
        recorded as "NA".
    """
    self.create_bayestraits_geography_file(
        tree, output_path=self.geography_data_file_name)
    script = "\n".join((
        "1",  # multstate
        "1",  # ml; 2 == mcmc
        "restrictall q01",
        "run",
    ))
    bayestraits = subprocess.Popen(
        ["BayesTraits", self.tree_file_name, self.geography_data_file_name],
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
    )
    output, _ = processio.communicate(bayestraits, script)
    lines = output.split("\n")
    for header_idx, line in enumerate(lines):
        if line.startswith("Tree No\tLh\tq"):
            # Header line found; the values are on the following line.
            names = lines[header_idx].split("\t")
            values = lines[header_idx + 1].split("\t")
            rate = float(dict(zip(names, values))['q01'])
            break
    else:
        # No header line anywhere in the output.
        if self.fail_on_estimation_error:
            raise Exception(
                "Failed to extract results from BayesTraits estimation")
        rate = "NA"
    profile_results["area.est.transition.rate"] = rate
def generate(self, trees, dataset=None, taxon_namespace=None, **kwargs):
    """
    Run Seq-Gen on ``trees`` and read the sequences it produces.

    Parameters
    ----------
    trees : tree collection
        Written in Newick format (rooting and internal node labels
        suppressed) to a temporary file passed to Seq-Gen.
    dataset : DataSet or None
        Destination for the generated data. If None, a new
        ``dendropy.DataSet`` is created from ``kwargs``.
    taxon_namespace : TaxonNamespace or None
        Falls back to ``trees.taxon_namespace`` when None. It is attached
        only to a newly created dataset.
    **kwargs
        Passed to ``dendropy.DataSet`` when a new dataset is created.

    Returns
    -------
    DataSet
        ``dataset`` after reading the Seq-Gen standard output as NEXUS.

    Raises
    ------
    RuntimeError
        If Seq-Gen writes to standard error or exits with a non-zero code.
    """
    args = self._compose_arguments()
    tree_inputf = self.get_tempfile()
    try:
        trees.write_to_path(
            tree_inputf.name,
            "newick",
            suppress_rooting=True,
            suppress_internal_node_labels=True)
        tree_inputf.flush()
        args.append(tree_inputf.name)
        run = subprocess.Popen(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        stdout, stderr = processio.communicate(run)
    finally:
        # Release the temporary file as soon as Seq-Gen is done with it,
        # including on failure. Assumes ``get_tempfile`` returns a file-like
        # object with ``close()`` -- TODO confirm.
        tree_inputf.close()
    if stderr or run.returncode != 0:
        raise RuntimeError("Seq-gen error: %s" % stderr)
    if taxon_namespace is None:
        taxon_namespace = trees.taxon_namespace
    if dataset is None:
        dataset = dendropy.DataSet(**kwargs)
        if taxon_namespace is not None:
            dataset.attach_taxon_namespace(taxon_namespace)
    dataset.read(data=stdout, schema="nexus")
    return dataset
def call(
        paup_commands,
        suppress_standard_preamble=False,
        ignore_error_returncode=False,
        ignore_nonempty_stderr=False,
        strip_extraneous_prompts_from_stdout=True,
        strip_extraneous_prompts_from_stderr=True,
        cwd=None,
        env=None,
        paup_path=PAUP_PATH
        ):
    """
    Run a series of PAUP* commands and hand back the process results.

    The commands are joined with ";\\n", terminated with a final "quit",
    and piped to ``paup_path -n -u`` on standard input.

    Parameters
    ----------
    paup_commands : string or iterable of strings
        A single PAUP* command or an iterable of them.
    suppress_standard_preamble : bool
        If |True|, ``STANDARD_PREAMBLE`` is not placed ahead of the
        commands.
    ignore_error_returncode : bool
        If |True|, a non-zero return code does not raise.
    ignore_nonempty_stderr : bool
        If |True|, content on standard error (other than 'paup>' prompts)
        does not raise.
    strip_extraneous_prompts_from_stdout : bool
        If |True|, every 'paup>' is removed from the returned standard
        output.
    strip_extraneous_prompts_from_stderr : bool
        If |True|, every 'paup>' is removed from the returned standard
        error.
    cwd : string
        Working directory for the PAUP* process.
    env : dictionary
        Environment variables for the PAUP* process.
    paup_path : string
        Path to the PAUP* executable.

    Returns
    -------
    returncode : exit value of PAUP process.
    stdout : string
        Contents of the PAUP process standard output.
    stderr : string
        Contents of the PAUP process standard error.

    Raises
    ------
    error.ExternalServiceError
        On a non-zero return code or non-empty standard error, unless the
        corresponding ``ignore_*`` flag is set. The exception carries the
        unstripped output.
    """
    if textprocessing.is_str_type(paup_commands):
        script_lines = [paup_commands]
    else:
        script_lines = list(paup_commands)
    if not suppress_standard_preamble:
        script_lines.insert(0, STANDARD_PREAMBLE)
    script_lines.append("quit")
    paup_block = ";\n".join(script_lines) + ";\n"

    invocation_command = [paup_path, "-n", "-u"]
    proc = subprocess.Popen(
        invocation_command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        cwd=cwd,
        env=env,
    )
    raw_stdout, raw_stderr = processio.communicate(proc, paup_block)

    # weird dev/paup error ... lots of prompts spring up
    prompt = "paup>"
    stdout = raw_stdout
    if strip_extraneous_prompts_from_stdout:
        stdout = raw_stdout.replace(prompt, "")
    # The error check always looks at the prompt-free text, whether or not
    # the caller asked for the returned text to be stripped.
    chk_stderr = raw_stderr.replace(prompt, "")
    stderr = chk_stderr if strip_extraneous_prompts_from_stderr else raw_stderr

    bad_returncode = proc.returncode != 0 and not ignore_error_returncode
    bad_stderr = chk_stderr != "" and not ignore_nonempty_stderr
    if bad_returncode or bad_stderr:
        raise error.ExternalServiceError(
            service_name="PAUP*",
            invocation_command=invocation_command,
            service_input=paup_block,
            returncode=proc.returncode,
            stdout=raw_stdout,
            stderr=raw_stderr)
    return proc.returncode, stdout, stderr
def estimate_tree(self, char_matrix, raxml_args=None):
    """
    Estimate a tree for ``char_matrix`` by running RAxML.

    The matrix is written in PHYLIP format into the working directory,
    RAxML is run there with the GTRCAT model and a random parsimony seed,
    and the best tree is read back with the original taxon labels restored.

    Parameters
    ----------
    char_matrix : character matrix
        Sequences to analyze; its ``taxon_namespace`` is used for the
        returned tree.
    raxml_args : list or None
        Extra command-line arguments appended to the RAxML invocation.

    Returns
    -------
    Tree
        The best tree reported by RAxML.

    Notes
    -----
    If RAxML exits non-zero, or the best-tree file is missing, the
    interpreter exits via ``sys.exit``.
    """
    # set up taxa
    taxa = char_matrix.taxon_namespace

    # create working directory
    self._create_working_dir()

    # remap taxon labels
    self.taxon_label_map = {}
    self._remap_taxon_labels(taxa)

    # clean working directory of previous runs
    self._preclean_working_dir()

    # write input sequences; ``with`` closes the file even if the write fails
    raxml_seqs_filepath = os.path.join(self.working_dir_path, self.input_seq_fname)
    with open(raxml_seqs_filepath, "w") as raxml_seqs_file:
        char_matrix.write_to_stream(raxml_seqs_file, "phylip")
    self.files_to_clean.append(raxml_seqs_filepath)
    self.files_to_clean.append(raxml_seqs_filepath + ".reduced")

    # run RAxML
    if raxml_args is None:
        raxml_args = []
    cmd = [
        self.raxml_path,
        '-m', 'GTRCAT',
        '-s', raxml_seqs_filepath,
        '-n', self.name,
        '-p', str(random.randint(0, sys.maxsize)),
    ] + raxml_args
    if self.verbosity >= 2:
        # Let RAxML write straight to the console.
        stdout_pipe = None
        stderr_pipe = None
    else:
        stdout_pipe = subprocess.PIPE
        stderr_pipe = subprocess.PIPE
    p = subprocess.Popen(
        cmd,
        stdout=stdout_pipe,
        stderr=stderr_pipe,
        cwd=self.working_dir_path)
    stdout, stderr = processio.communicate(p)
    if p.returncode != 0:
        sys.stderr.write("[RAxML run failed]:\n\n%s\n\n" % (" ".join(cmd)))
        # Nothing is captured when the streams were not piped (verbosity
        # >= 2); presumably ``stdout``/``stderr`` are then None, which
        # ``write`` would reject. Only echo captured text.
        if stdout:
            sys.stdout.write(stdout)
        if stderr:
            sys.stderr.write(stderr)
        sys.exit(p.returncode)

    # read result
    raxml_best_tree_fpath = os.path.join(self.working_dir_path, self.best_tree_fname)
    if not os.path.exists(raxml_best_tree_fpath):
        self._send_error(
            "RAxML result not found: {}".format(raxml_best_tree_fpath))
        sys.exit(1)
    best_tree = dendropy.Tree.get_from_path(
        raxml_best_tree_fpath,
        "newick",
        taxon_namespace=taxa)

    # remap labels
    for taxon in best_tree.taxon_namespace:
        taxon.label = self.taxon_label_map[taxon.label]

    # clean-up
    self._postclean_working_dir()

    return best_tree
def estimate_tree(char_matrix,
                  tree_est_criterion="likelihood",
                  num_states=6,
                  unequal_base_freqs=True,
                  gamma_rates=True,
                  prop_invar=True,
                  extra_pre_est_commands=None,
                  extra_post_est_commands=None,
                  paup_path='paup'):
    """
    Given a dataset, ``char_matrix``, estimates a tree using the given
    criterion.

    Parameters
    ----------
    char_matrix : character matrix
        Data to analyze; written to a temporary NEXUS file for PAUP*.
    tree_est_criterion : str
        "nj" or "upgma" run that command directly; any other value is set
        as the PAUP* criterion and followed by ``hsearch``. Values starting
        with "like" additionally emit an ``lset`` command built from the
        model arguments below.
    num_states : int
        Value for ``lset nst``.
    unequal_base_freqs : bool
        ``basefreq=estimate`` if true, else ``basefreq=equal``.
    gamma_rates : bool
        ``rates=gamma`` if true, else ``rates=equal``.
    prop_invar : bool
        ``pinvar=estimate`` if true, else ``pinvar=0``.
    extra_pre_est_commands : str, list of str, or None
        Commands inserted before the estimation command.
    extra_post_est_commands : str, list of str, or None
        Commands inserted after the estimation command.
    paup_path : str
        Command used to invoke PAUP*.

    Returns
    -------
    Tree
        The tree read from the file PAUP* saves, using
        ``char_matrix.taxon_namespace``.
    """
    paup_args = {
        'nst': num_states,
        'basefreq': 'estimate' if unequal_base_freqs else 'equal',
        'rates': 'gamma' if gamma_rates else 'equal',
        'pinvar': 'estimate' if prop_invar else '0',
    }
    if extra_pre_est_commands:
        if textprocessing.is_str_type(extra_pre_est_commands):
            extra_pre_est_commands = [extra_pre_est_commands]
        paup_args["pre_est_commands"] = ";\n".join(extra_pre_est_commands)
    else:
        paup_args["pre_est_commands"] = ""
    if extra_post_est_commands:
        if textprocessing.is_str_type(extra_post_est_commands):
            extra_post_est_commands = [extra_post_est_commands]
        paup_args["post_est_commands"] = ";\n".join(extra_post_est_commands)
    else:
        paup_args["post_est_commands"] = ""

    is_distance_method = tree_est_criterion in ["nj", "upgma"]
    paup_template = """\
    set warnreset=no;
    exe %(datafile)s;
    """
    if tree_est_criterion.startswith("like"):
        paup_template += """\
    lset tratio=estimate rmatrix=estimate nst=%(nst)s basefreq=%(basefreq)s rates=%(rates)s shape=estimate pinvar=%(pinvar)s userbrlens=yes;
    """
    if not is_distance_method:
        paup_template += """\
    set crit=%s;
    """ % tree_est_criterion
    paup_template += """\
    %(pre_est_commands)s;
    """
    if is_distance_method:
        paup_template += tree_est_criterion + ";"
    else:
        paup_template += "hsearch;"
    paup_template += """\
    %(post_est_commands)s;
    savetrees file=%(est_tree_file)s format=nexus root=yes brlens=yes taxablk=yes maxdecimals=20;
    """

    # Context managers close (and thereby remove) both temporary files even
    # when PAUP* or the tree parser raises.
    with tempfile.NamedTemporaryFile("w", delete=True) as cf, \
            tempfile.NamedTemporaryFile("w+", delete=True) as output_tree_file_handle:
        char_matrix.write_to_stream(cf, schema='nexus')
        cf.flush()
        paup_args['datafile'] = cf.name
        output_tree_filepath = output_tree_file_handle.name
        paup_args['est_tree_file'] = output_tree_filepath
        paup_run = subprocess.Popen(
            ['%s -n' % paup_path],
            shell=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        processio.communicate(paup_run, paup_template % paup_args)
        return dendropy.Tree.get_from_path(
            output_tree_filepath,
            "nexus",
            taxon_namespace=char_matrix.taxon_namespace)
def run_phyml(
        phyml_path,
        char_matrix,
        data_type=None,
        parsimony_starting_tree=False,
        bootstrap=None,
        subst_model=None,
        amino_acid_rates=None,
        state_freqs=None,
        ts_tv_ratio=None,
        prop_invar=None,
        gamma_cats=None,
        gamma_shape=None,
        gamma_cat_median=False,
        free_rates=False,
        integrated_branch_length=False,
        codon_position=None,
        search_move=None,
        starting_tree=None,
        optimization=None,
        random_starting_tree=False,
        num_random_starting_trees=None,
        random_seed=None,
        site_likelihoods=False,
        trace_search=False,
        run_id=None,
        alias_subpattern=False):
    """
    Wrapper for running PhyML via its command-line interface.

    A parameter value set to None will in most cases result in the PhyML
    default value. Check the output to verify that your analysis was set up
    properly. Consult the PhyML documentation for details on parameters and
    default values.

    Parameters
    ----------
    phyml_path : str
        Path to PhyML executable.
    char_matrix : |CharacterMatrix|
        Matrix with data to be analyzed.
    data_type : str
        PhyML data type: "nt" (default) for nucleotide, "aa" for amino-acid
        sequences, or "generic".
    parsimony_starting_tree : bool
        If True, a minimum parsimony starting tree is used. This option is
        taken into account when `starting_tree` is False and when tree
        topology modifications are to be done.
    bootstrap : int
        * > 0 : the number of bootstrap replicates to generate.
        * 0 : neither approximate likelihood ratio test nor bootstrap
          values will be computed.
        * -1 : approximate likelihood ratio test returning aLRT statistics.
        * -2 : approximate likelihood ratio test returning Chi2-based
          parametric branch supports.
        * -4 : SH-like branch supports alone.
        * -5 : (default) approximate Bayes branch supports.
    subst_model : str
        Substitution model name.

        * Nucleotide-based models : "HKY85" (default), "JC69", "K80",
          "F81", "F84", "TN93", "GTR", or a custom GTR-family model, e.g.
          "00000".
        * Amino-acid based models : "LG" (default), "WAG" ,"JTT", "MtREV",
          "Dayhoff", "DCMut", "RtREV", "CpREV", "VT", "AB", "Blosum62",
          "MtMam", "MtArt", "HIVw", "HIVb", "custom".
    amino_acid_rates : str
        Amino acid substitution rate matrix in PAML format. It is
        compulsory to use this option when analyzing amino acid sequences
        with the "custom" substitution model.
    state_freqs : str or list of floats
        * "e" : the character frequencies will be determined as follows :

          - Nucleotide sequences: (Empirical) the equilibrium base
            frequencies are estimated by counting the occurrence of the
            different bases in the alignment.
          - Amino-acid sequences: (Empirical) the equilibrium amino-acid
            frequencies are estimated by counting the occurrence of the
            different amino-acids in the alignment.
        * "m" : the character frequencies are determined as follows :

          - Nucleotide sequences: (ML) the equilibrium base frequencies are
            estimated using maximum likelihood.
          - Amino-acid sequences: (Model) the equilibrium amino-acid
            frequencies are estimated using the frequencies defined by the
            substitution model.
        * "fA,fC,fG,fT" : only valid for nucleotide-based models. fA, fC,
          fG and fT are floating numbers that correspond to the frequencies
          of A, C, G and T respectively (WARNING: do not use any blank
          space between your values of nucleotide frequencies, only
          commas!)
    ts_tv_ratio : float or str
        Transition/transversion ratio. DNA sequences only. Can be a fixed
        positive value (ex: 4.0) or "e" to get the maximum likelihood
        estimate.
    prop_invar : float or str
        Proportion of invariable sites. Can be a fixed value in the [0,1]
        range or "e" to get the maximum likelihood estimate.
    gamma_cats : int
        Number of relative substitution rate categories. Must be a positive
        integer. Default value 4.
    gamma_shape : float or str
        Distribution of the gamma distribution shape parameter. Can be a
        fixed positive value or "e" to get the maximum likelihood estimate.
    gamma_cat_median : bool
        If True, use median instead of mean as the middle of each
        substitution rate class in the discrete gamma distribution.
    free_rates : bool
        If True, the FreeRate model of substitution rate variation across
        sites will be used.
    integrated_branch_length : bool
        If True, the integrated length (IL) model will be used. The IL
        model can be considered as an approximation to the covarion model.
    codon_position : {1, 2, 3}
        When analyzing an alignment of coding sequences, use this option to
        consider only the first, second or the third coding position.
    search_move : {"NNI", "SPR", "BEST"}
        Tree topology search operation option. Can be either "NNI"
        (default, fast) or "SPR" (a bit slower than NNI) or "BEST" (best of
        NNI and SPR search).
    starting_tree : |Tree|
        User-provided starting tree.
    optimization : {"tlr", "tl", "lr", "l", "r", "n"}
        Specify which parameters to optimize. Tree topology (t), branch
        lengths (l), rate parameters (r) and no parameter (n).
    random_starting_tree : bool
        If True, sets the initial tree to random. It is only valid if SPR
        searches are to be performed.
    num_random_starting_trees : int
        Number of initial random trees to be used. It is only valid if SPR
        searches are to be performed.
    random_seed : int
        Seed used to initiate the random number generator.
    site_likelihoods : bool
        If True, return likelihood for each site.
    trace_search : bool
        If True, return each phylogeny explored during the tree search.
    run_id : str
        Append an ID-string to the PhyML output.
    alias_subpattern : bool
        If True, site aliasing is generalized at the subtree level.
        Sometimes lead to faster calculations. See Kosakovsky Pond SL, Muse
        SV, Systematic Biology (2004) for an example.

    Returns
    -------
    result : :class:`~dendropy.interop.phyml.PhymlResult`

    Raises
    ------
    RuntimeError
        If PhyML writes to standard error or exits with a non-zero code.
    """
    # Only the unique name of this file is needed up front: all PhyML input
    # and output files are derived from it by appending suffixes.
    char_matrix_f = tempfile.NamedTemporaryFile()

    # Compose arguments
    args = []
    args.append(phyml_path)
    args.append("-i%s" % char_matrix_f.name)
    if data_type:
        args.append("-d%s" % str(data_type))
    args.append("-q")
    args.append("-n1")
    if parsimony_starting_tree:
        args.append("-p")
    if bootstrap:
        args.append("-b%s" % str(bootstrap))
    if subst_model:
        args.append("-m%s" % str(subst_model))
    if amino_acid_rates:
        args.extend(["--aa_rate_file", char_matrix_f.name + "_aa_rate"])
    if state_freqs:
        if isinstance(state_freqs, str):
            args.append("-f%s" % state_freqs)
        else:
            args.append("-f%s" % (",".join([str(s) for s in state_freqs])))
    if ts_tv_ratio:
        args.append("-t%s" % str(ts_tv_ratio))
    if prop_invar:
        args.append("-v%s" % str(prop_invar))
    if gamma_cats:
        args.append("-c%s" % str(gamma_cats))
    if gamma_shape:
        args.append("-a%s" % str(gamma_shape))
    if gamma_cat_median:
        args.append("--use_median")
    if free_rates:
        args.append("--freerates")
    if integrated_branch_length:
        args.append("--il")
    if codon_position:
        args.extend(["--codpos", str(codon_position)])
    if search_move:
        args.append("-s%s" % str(search_move))
    if starting_tree:
        args.append("-u%s" % str(char_matrix_f.name + "_starting_tree"))
    if optimization:
        args.append("-o%s" % str(optimization))
    if random_starting_tree:
        args.append("--rand_start")
    if num_random_starting_trees:
        args.extend(["--n_rand_starts", str(num_random_starting_trees)])
    if site_likelihoods:
        args.append("--print_site_lnl")
    if random_seed:
        # Long option and value must be separate tokens, as for the other
        # long options; the fused form "--r_seedN" is not a valid option.
        args.extend(["--r_seed", str(random_seed)])
    if trace_search:
        args.append("--print_trace")
    if run_id:
        args.extend(["--run_id", run_id])
    args.append("--quiet")
    args.append("--no_memory_check")
    if alias_subpattern:
        args.append("--alias_subpatt")
    command_line = " ".join(args)

    try:
        # Write data to files
        char_matrix.write_to_path(
            char_matrix_f.name, "phylip", spaces_to_underscores=True)
        if starting_tree:
            starting_tree.write_to_path(
                char_matrix_f.name + "_starting_tree",
                "newick",
                preserve_spaces=False)
        if amino_acid_rates:
            with open(char_matrix_f.name + "_aa_rate", "w") as aa_rate_f:
                aa_rate_f.write(str(amino_acid_rates))

        # Call PhyML
        proc = subprocess.Popen(
            args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = processio.communicate(proc)

        # Check output
        if stderr or proc.returncode != 0:
            if stderr:
                raise RuntimeError(
                    "PhyML error: %s\n%s" % (command_line, stderr))
            else:
                raise RuntimeError("PhyML error: %s" % stdout)
        else:
            # Collect output
            output_files = {}
            result = PhymlResult()
            result.command_line = command_line
            result.stdout_text = stdout
            output_files["_phyml_tree"], result.best_tree = (
                _read_phyml_file(char_matrix_f.name, "_phyml_tree", "tree"))
            output_files["_phyml_stats"], result.stats_text = (
                _read_phyml_file(char_matrix_f.name, "_phyml_stats", "text"))
            if bootstrap and bootstrap > 0:
                output_files["_phyml_boot_trees"], result.boot_trees = (
                    _read_phyml_file(
                        char_matrix_f.name, "_phyml_boot_trees", "treelist"))
                output_files["_phyml_boot_stats"], result.boot_stats_text = (
                    _read_phyml_file(
                        char_matrix_f.name, "_phyml_boot_stats", "text"))
            if random_starting_tree:
                output_files["_phyml_rand_trees"], result.rand_trees = (
                    _read_phyml_file(
                        char_matrix_f.name, "_phyml_rand_trees", "treelist"))
            if site_likelihoods:
                output_files["_phyml_lk"], result.site_likelihoods_text = (
                    _read_phyml_file(char_matrix_f.name, "_phyml_lk", "text"))
            if trace_search:
                output_files["_phyml_trace"], result.search_trace_trees = (
                    _read_phyml_file(
                        char_matrix_f.name, "_phyml_trace", "treelist"))
            result.output_files = output_files
    finally:
        # Clean up: closing removes the temporary file itself; the glob then
        # removes every sibling file that shares its name as a prefix.
        char_matrix_f.close()
        for phyml_file in glob.glob(char_matrix_f.name + "*"):
            os.remove(phyml_file)

    return result
def call(
        r_commands,
        ignore_error_returncode=False,
        cwd=None,
        env=None,
        rscript_path=RSCRIPT_EXECUTABLE,
        ):
    r"""
    Executes a sequence of commands in R and returns the results.

    Note that newlines ('\n') and other special characters will be
    converted before being passed to the R interpreter, so need to be
    escaped or entered as raw string expressions.

    That is, instead of, e.g.::

        returncode, stdout, stderr = RService.call([
            "cat('hello, world\n')",
        ])

    use this::

        returncode, stdout, stderr = RService.call([
            "cat('hello, world\\n')",
        ])

    or::

        returncode, stdout, stderr = RService.call([
            r"cat('hello, world\n')",
        ])

    Parameters
    ----------
    r_commands : string or iterable of strings
        R commands to run. Anything that is not a single string is joined
        with newlines into one script.
    ignore_error_returncode : bool
        If `True`, then a non-0 return code from the R process will not
        result in an exception being raised.
    cwd : string
        Set the working directory of the R process to this directory.
    env : dictionary
        Environmental variables to set for the R process.
    rscript_path : string
        Path to the Rscript executable.

    Returns
    -------
    returncode : exit value of the R process
    stdout : string
        Contents of the R process standard output.
    stderr : string
        Contents of the R process standard error.

    Raises
    ------
    error.ExternalServiceError
        If the R process exits with a non-0 return code and
        ``ignore_error_returncode`` is not set.
    """
    if not isinstance(r_commands, str):
        r_commands = "\n".join(r_commands)
    r_commands += "\n"
    # Use the caller-supplied interpreter: previously the module default was
    # hard-coded here, silently ignoring ``rscript_path``.
    invocation_command = [rscript_path, rsubprocess_pipe_path]
    p = subprocess.Popen(
            invocation_command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            cwd=cwd,
            env=env,
            )
    # The script itself is delivered on the child's standard input.
    stdout, stderr = processio.communicate(p, r_commands)
    if p.returncode != 0 and not ignore_error_returncode:
        raise error.ExternalServiceError(
                service_name="Rscript",
                invocation_command=invocation_command,
                service_input=r_commands,
                returncode=p.returncode,
                stdout=stdout,
                stderr=stderr)
    return p.returncode, stdout, stderr
def call(
        r_commands,
        ignore_error_returncode=False,
        cwd=None,
        env=None,
        rscript_path=RSCRIPT_EXECUTABLE,
        ):
    r"""
    Executes a sequence of commands in R and returns the results. All the
    noise is sunk into the stderr return variable, and just the output comes
    out cleanly in the stdout return variable.

    Parameters
    ----------
    r_commands : string or iterable of strings
        R commands to run. Anything that is not a single string is joined
        with newlines into one script.
    ignore_error_returncode : bool
        If |True|, then a non-0 return code from the R process will not
        result in an exception being raised.
    cwd : string
        Set the working directory of the R process to this directory.
    env : dictionary
        Environmental variables to set for the R process.
    rscript_path : string
        Path to the Rscript executable.

    Returns
    -------
    returncode : exit value of the R process
    stdout : string
        Contents of the R process standard output.
    stderr : string
        Contents of the R process standard error.

    Raises
    ------
    error.ExternalServiceError
        If the R process exits with a non-0 return code and
        ``ignore_error_returncode`` is not set.

    Examples
    --------

    Build up a script (``s``) to calculate a range of values, print them
    to the standard output, and then post-process this to extract the
    values::

        import itertools
        from dendropy.interop import rstats

        bb = [0.01, 0.05, 0.10, 0.50, 1.0]
        cc = [0.01, 0.05, 0.10, 0.50, 1.0]
        ee = [0.0, 0.1, 0.2]

        # store commands of script as a list
        # to be passed to the ``call()``
        s = []

        # set options, load required libraries, etc.
        s.append("options(digits=22)")
        s.append("library(PBD)")

        # build up list of commands in script
        params = []
        for b, c, e in itertools.product(bb, cc, ee):
            s.append("print(pbd_durspec_mean(pars=c({},{},{})))".format(b, c, e))

        # execute script
        returncode, stdout, stderr = rstats.call(s)

        # peek at the results
        print(stdout)

        # [1] 69.31472
        # [1] 9.853723
        # [1] 4.981369
        # [1] 0.9950331
        # ...

        # post-process the stdout to extract values
        results = [float(x.split(" ")[1]) for x in stdout.split("\n") if x]

    Notes
    -----

    Note that newlines ('\n') and other special characters will be
    converted before being passed to the R interpreter, so need to be
    escaped or entered as raw string expressions.

    That is, instead of, e.g.::

        returncode, stdout, stderr = RService.call([
            "cat('hello, world\n')",
        ])

    use this::

        returncode, stdout, stderr = RService.call([
            "cat('hello, world\\n')",
        ])

    or::

        returncode, stdout, stderr = RService.call([
            r"cat('hello, world\n')",
        ])

    """
    if not textprocessing.is_str_type(r_commands):
        r_commands = "\n".join(r_commands)
    r_commands += "\n"
    # Use the caller-supplied interpreter: previously the module default was
    # hard-coded here, silently ignoring ``rscript_path``.
    invocation_command = [rscript_path, rsubprocess_pipe_path]
    p = subprocess.Popen(
            invocation_command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            cwd=cwd,
            env=env,
            )
    # The script itself is delivered on the child's standard input.
    stdout, stderr = processio.communicate(p, r_commands)
    if p.returncode != 0 and not ignore_error_returncode:
        raise error.ExternalServiceError(
                service_name="Rscript",
                invocation_command=invocation_command,
                service_input=r_commands,
                returncode=p.returncode,
                stdout=stdout,
                stderr=stderr)
    return p.returncode, stdout, stderr
def pscore_trees(
        trees,
        char_matrix,
        pset_option_list=None,
        pscore_option_list=None,
        paup_path=PAUP_PATH):
    """
    Runs PAUP* to score each tree in ``trees`` under the parsimony criterion
    given the data in ``char_matrix``.

    Parameters
    ----------
    trees : collection of trees
        Written out in NEXUS format with ``write_to_stream``; must support
        ``len()``.
    char_matrix : character matrix
        Written out in NEXUS format with ``write_to_stream``.
    pset_option_list : iterable of strings or None
        Options that are space-joined to form a ``pset`` command. If `None`,
        no ``pset`` command is issued.
    pscore_option_list : iterable of strings or None
        Extra options that are space-joined and appended to the ``pscore``
        command after its ``scorefile`` option.
    paup_path : string
        Command used to start PAUP*; it is interpolated into a shell command
        line.

    Returns
    -------
    scores : list of int
        One value per tree: the second whitespace-delimited field of each
        non-empty row of the PAUP* score file, skipping its first row.

    Raises
    ------
    AssertionError
        If the number of score rows differs from ``len(trees)``.
    SystemExit
        If ``processio.communicate`` reports anything on standard error.
    """
    if pset_option_list is not None:
        pset = "pset " + " ".join(pset_option_list)
        # PAUP commands end at ';', not at a newline: without a terminator
        # the pset command would swallow the pscore command that follows.
        if not pset.rstrip().endswith(";"):
            pset += ";"
    else:
        pset = ""

    scorefile = tempfile.NamedTemporaryFile("w+", delete=True)
    cf = tempfile.NamedTemporaryFile("w", delete=True)
    input_tree_file_handle = tempfile.NamedTemporaryFile("w", delete=True)
    try:
        pscore_command = "pscore / scorefile={}".format(scorefile.name)
        if pscore_option_list is not None:
            # The separating space keeps the first option from being fused
            # onto the score file path.
            pscore_command = pscore_command + " " + " ".join(pscore_option_list)

        # The final command is terminated by the ';' that follows the
        # ``{post_est_commands}`` placeholder in ``paup_block``.
        post_est_commands = """\
        set crit=parsimony;
        {pset}
        {pscore_command}
        """.format(pset=pset, pscore_command=pscore_command)
        paup_block = """\
        set warnreset=no;
        exe '{data_file}';
        gettrees file= '{intree_file}' warntree=no;
        {post_est_commands};
        """

        # Flush so that PAUP sees complete files when it opens them by name.
        char_matrix.write_to_stream(cf, schema='nexus')
        cf.flush()
        input_tree_filepath = input_tree_file_handle.name
        trees.write_to_stream(input_tree_file_handle, schema="nexus")
        input_tree_file_handle.flush()

        paup_block = paup_block.format(
                data_file=cf.name,
                intree_file=input_tree_filepath,
                post_est_commands=post_est_commands)
        paup_run = subprocess.Popen(['%s -n' % paup_path],
                                    shell=True,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE)
        stdout, stderr = processio.communicate(paup_run, paup_block)
        # NOTE(review): stderr is not piped in the Popen call above, so this
        # branch looks unreachable unless processio.communicate obtains it
        # some other way -- confirm before relying on it.
        if stderr:
            sys.stderr.write("\n*** ERROR FROM PAUP ***")
            sys.stderr.write(stderr)
            sys.exit(1)

        with open(scorefile.name, "r") as score_src:
            scores_str = score_src.read()
        # First row is skipped as a header; blank rows are ignored.
        score_rows = [r for r in scores_str.split("\n")[1:] if r != ""]
        assert len(score_rows) == len(trees)
        scores = [int(s.split()[1]) for s in score_rows]
    finally:
        # Closing the handles deletes the temporary files, including on the
        # error and exit paths.
        cf.close()
        input_tree_file_handle.close()
        scorefile.close()
    return scores
def map_bipartitions(self, target_tree_fpath, bootstrap_trees_fpaths):
    """
    Runs RAxML (``-f b``) with a target tree and a set of bootstrap trees,
    and returns the tree that RAxML writes to ``self.bipartitions_fname``.

    Parameters
    ----------
    target_tree_fpath : string
        Path to the file holding the target tree; only the first tree read
        from it is used.
    bootstrap_trees_fpaths : iterable of strings
        Paths to the files holding the bootstrap trees; all are pooled into
        a single list.

    Returns
    -------
    mapped_tree : dendropy.Tree
        Tree read from the RAxML result file, with taxon labels translated
        back through ``self.taxon_label_map``.

    Notes
    -----
    Exits the process (``sys.exit``) if an input file may not be
    overwritten, if RAxML returns a non-0 code, or if the result file is
    missing.
    """
    # set up taxa
    taxa = dendropy.TaxonNamespace()

    # read target tree
    target_tree_fpath = self._expand_path(target_tree_fpath)
    target_tree = self._get_trees(target_tree_fpath, taxon_namespace=taxa)[0]

    # read bootstrap trees
    boot_trees = dendropy.TreeList()
    for fpath in bootstrap_trees_fpaths:
        fpath = self._expand_path(fpath)
        self._get_trees(tree_filepath=fpath, tree_list=boot_trees, taxon_namespace=taxa)

    # create working directory
    self._create_working_dir()

    # remap taxon labels
    # (presumably _remap_taxon_labels fills self.taxon_label_map; it is the
    # mapping read back below and in estimate_tree -- confirm)
    self.taxon_label_map = {}
    self._remap_taxon_labels(taxa)

    # write input target tree
    raxml_target_tree_filepath = os.path.join(
            self.working_dir_path, "{}.target_tree".format(self.name))
    if not self._check_overwrite(raxml_target_tree_filepath):
        sys.exit(0)
    target_tree.write_to_path(raxml_target_tree_filepath, "newick")
    self.files_to_clean.append(raxml_target_tree_filepath)

    # write input bootstrap trees
    raxml_bootstrap_trees_filepath = os.path.join(
            self.working_dir_path, "{}.boot_trees".format(self.name))
    if not self._check_overwrite(raxml_bootstrap_trees_filepath):
        sys.exit(0)
    boot_trees.write_to_path(raxml_bootstrap_trees_filepath, "newick")
    self.files_to_clean.append(raxml_bootstrap_trees_filepath)

    # write input (dummy) sequences
    raxml_seqs_filepath = os.path.join(
            self.working_dir_path, "{}.seqs".format(self.name))
    if not self._check_overwrite(raxml_seqs_filepath):
        sys.exit(0)
    with open(raxml_seqs_filepath, "w") as raxml_seqs_filepath_out:
        self._write_dummy_seqs(taxa, raxml_seqs_filepath_out)
    self.files_to_clean.append(raxml_seqs_filepath)

    # clean working directory of previous runs
    self._preclean_working_dir()

    # run RAxML; file arguments are base names because the process is
    # started inside the working directory
    cmd = [
        self.raxml_path,
        '-f', 'b',
        '-t', os.path.basename(raxml_target_tree_filepath),
        '-z', os.path.basename(raxml_bootstrap_trees_filepath),
        '-s', os.path.basename(raxml_seqs_filepath),
        '-m', 'GTRCAT',
        '-n', self.name,
    ]
    if self.verbosity >= 2:
        # let RAxML write straight to the parent's streams
        stdout_pipe = None
        stderr_pipe = None
    else:
        stdout_pipe = subprocess.PIPE
        stderr_pipe = subprocess.PIPE
    p = subprocess.Popen(cmd,
            stdout=stdout_pipe,
            stderr=stderr_pipe,
            cwd=self.working_dir_path)
    stdout, stderr = processio.communicate(p)
    if p.returncode != 0:
        self._send_error("RAxML run failed")
        if self.verbosity < 2:
            # output was captured only in this case
            sys.stdout.write(stdout)
            sys.stderr.write(stderr)
        sys.exit(p.returncode)

    # read result
    raxml_mapped_tree_fpath = os.path.join(
            self.working_dir_path, self.bipartitions_fname)
    if not os.path.exists(raxml_mapped_tree_fpath):
        self._send_error(
                "RAxML result not found: {}".format(raxml_mapped_tree_fpath))
        sys.exit(1)
    mapped_tree = dendropy.Tree.get_from_path(raxml_mapped_tree_fpath, "newick")

    # remap labels: read the instance-level map reset above; a separate,
    # never-populated local dict was used here before, which raised KeyError
    # for every taxon
    for taxon in mapped_tree.taxon_namespace:
        taxon.label = self.taxon_label_map[taxon.label]

    # clean-up
    self.files_to_clean.append(raxml_mapped_tree_fpath)
    self.files_to_clean.append(self.info_fname)
    self._postclean_working_dir()

    # return result
    return mapped_tree
def estimate_dec(self,
        newick_tree_filepath,
        geography_filepath,
        max_range_size,
        **kwargs
        ):
    """
    Renders ``R_TEMPLATE`` into a command file, runs it with R, and parses
    the resulting table of estimates.

    Parameters
    ----------
    newick_tree_filepath : string
        Substituted into the R script as ``tree_filepath``.
    geography_filepath : string
        Substituted into the R script as ``geography_filepath``.
    max_range_size : object
        Substituted into the R script as ``max_range_size``.
    **kwargs
        Optional parameter settings for each of the parameters "b", "e",
        "d", "j", "ysv", "y", "s" and "v":

        - ``fixed_<param>``: marks the parameter as ``"fixed"`` and uses the
          given value for its "min", "max", "init" and "est" aspects.
        - ``min_<param>``, ``max_<param>``, ``init_<param>``,
          ``est_<param>``: set the individual aspects; only consulted when
          ``fixed_<param>`` is absent.

    Returns
    -------
    results_table : collections.OrderedDict or None
        Maps the first column of each parsed result row to the float value
        of its sixth column. `None` if R exits with a non-0 return code and
        ``self.fail_on_estimation_error`` is not set.

    Raises
    ------
    Exception
        If R exits with a non-0 return code and
        ``self.fail_on_estimation_error`` is set.
    IndexError
        If a parsed row has fewer than six columns; the offending columns
        are the exception argument.
    """
    param_settings = []
    for param_name in ("b", "e", "d", "j", "ysv", "y", "s", "v"):
        fixed_key = "fixed_" + param_name
        if fixed_key in kwargs:
            param_settings.append(PARAM_SETTING_TEMPLATE.format(
                param_name=param_name,
                param_aspect="type",
                value='"fixed"'))
            # A fixed parameter has every aspect pinned to the same value.
            for param_aspect in ("min", "max", "init", "est"):
                param_settings.append(PARAM_SETTING_TEMPLATE.format(
                    param_name=param_name,
                    param_aspect=param_aspect,
                    value=kwargs[fixed_key]))
        else:
            for param_aspect in ("min_", "max_", "init_", "est_"):
                aspect_key = param_aspect + param_name
                if aspect_key in kwargs:
                    param_settings.append(PARAM_SETTING_TEMPLATE.format(
                        param_name=param_name,
                        # strip the trailing underscore of the keyword prefix
                        param_aspect=param_aspect[:-1],
                        value=kwargs[aspect_key]))
    param_settings = "\n".join(param_settings)
    rcmds = R_TEMPLATE.format(
        patch_code=self.patch_code,
        param_settings=param_settings,
        tree_filepath=newick_tree_filepath,
        geography_filepath=geography_filepath,
        max_range_size=max_range_size,
        results_filepath=self.results_file_name,
        )
    with open(self.commands_file_name, "w") as rfile:
        rfile.write(rcmds + "\n")
    shell_cmd = ["R",
            "--vanilla",
            "--no-save",
            "--slave",
            "--silent",
            "-f",
            self.commands_file_name]
    # In debug mode the R output is not captured, so it goes to the parent's
    # own streams.
    p = subprocess.Popen(
            shell_cmd,
            stdout=subprocess.PIPE if not self.debug_mode else None,
            stderr=subprocess.PIPE if not self.debug_mode else None,
            )
    stdout, stderr = processio.communicate(p)
    if p.returncode != 0:
        if self.fail_on_estimation_error:
            raise Exception("Non-zero return code: {}\n{}\n{}".format(
                    p.returncode,
                    stdout,
                    stderr,
                    ))
        else:
            return None
    with open(self.results_file_name, "r") as results_f:
        results_rows = results_f.read().split("\n")
    results_table = collections.OrderedDict()
    # The first row is skipped; at most the next 20 rows are considered.
    for row in results_rows[1:21]:
        cols = row.split()
        if not cols:
            # A blank row (e.g. after the final newline) carries no estimate.
            continue
        if cols[0] == "desc" or cols[0] == "note":
            break
        try:
            results_table[cols[0]] = float(cols[5])
        except IndexError:
            # Re-raise with the row contents to aid diagnosis.
            raise IndexError(cols)
    return results_table
def estimate_model(char_matrix,
                   tree_model=None,
                   num_states=6,
                   unequal_base_freqs=True,
                   gamma_rates=True,
                   prop_invar=True,
                   tree_est_criterion="likelihood",
                   tree_user_brlens=True,
                   paup_path='paup'):
    """
    Given a dataset, ``char_matrix``, uses client-supplied tree or estimates a
    tree, and character substitution model for the data.

    Parameters
    ----------
    char_matrix : character matrix
        Data written out in NEXUS format and executed by PAUP*.
    tree_model : tree or None
        If given, PAUP* loads this tree (it must share ``char_matrix``'s
        taxon namespace); otherwise a tree is estimated.
    num_states : int
        Value of the ``nst`` option of the ``lset`` command.
    unequal_base_freqs : bool
        ``basefreq=estimate`` if true, else ``basefreq=equal``.
    gamma_rates : bool
        ``rates=gamma`` if true, else ``rates=equal``.
    prop_invar : bool
        ``pinvar=estimate`` if true, else ``pinvar=0``.
    tree_est_criterion : string
        Only used when ``tree_model`` is `None`. "nj" and "upgma" are issued
        as commands of that name; any other value is set as the criterion
        for a heuristic search.
    tree_user_brlens : bool
        ``userbrlens=yes`` if true, else ``userbrlens=no``.
    paup_path : string
        Command used to start PAUP*; it is interpolated into a shell command
        line.

    Returns
    -------
    A tuple, consisting of the tree saved by PAUP* after scoring, and a
    dictionary with estimates of rates, kappa, base_frequencies, alpha,
    prop_invar, etc. as well as likelihood. Values not found in the PAUP*
    output are `None`; "likelihood" and "log_likelihood" both hold the
    negated "-ln L" value.
    """
    paup_args = {
        'nst': num_states,
        'basefreq': 'estimate' if unequal_base_freqs else 'equal',
        'rates': 'gamma' if gamma_rates else 'equal',
        'pinvar': 'estimate' if prop_invar else '0',
        'userbrlens': 'yes' if tree_user_brlens else 'no',
    }
    paup_template = """\
    set warnreset=no;
    exe %(datafile)s;
    set crit=like;
    lset tratio=estimate rmatrix=estimate nst=%(nst)s basefreq=%(basefreq)s rates=%(rates)s shape=estimate pinvar=%(pinvar)s userbrlens=%(userbrlens)s;
    %(tree)s;
    lscore 1 / userbrlens=%(userbrlens)s;
    savetrees file=%(est_tree_file)s format=nexus root=yes brlens=yes taxablk=yes maxdecimals=20;
    """
    # Matched with ``match()``, i.e. anchored at the start of each output line.
    patterns = {
        'likelihood' : re.compile(r'-ln L\s+([\d\.]+)'),
        'rAC' : re.compile(r' AC\s+([\d\.]+)'),
        'rAG' : re.compile(r' AG\s+([\d\.]+)'),
        'rAT' : re.compile(r' AT\s+([\d\.]+)'),
        'rCG' : re.compile(r' CG\s+([\d\.]+)'),
        'rCT' : re.compile(r' CT\s+([\d\.]+)'),
        'rGT' : re.compile(r' GT\s+([\d\.]+)'),
        'kappa': re.compile(r' kappa\s+([\d\.]+)'),
        'prop_invar' : re.compile(r'P_inv\s+([\d\.]+)'),
        'alpha' : re.compile(r'Shape\s+([\S]+)'),
        'pA' : re.compile(r' A\s+([\d\.]+)'),
        'pC' : re.compile(r' C\s+([\d\.]+)'),
        'pG' : re.compile(r' G\s+([\d\.]+)'),
        'pT' : re.compile(r' T\s+([\d\.]+)'),
    }

    # Every temporary file is tracked so that all of them are closed (and so
    # deleted) however the function is left.
    temp_files = []
    try:
        if tree_model is not None:
            assert tree_model.taxon_namespace is char_matrix.taxon_namespace
            tf = tempfile.NamedTemporaryFile("w", delete=True)
            temp_files.append(tf)
            tree_model.write_to_stream(tf, 'nexus')
            tf.flush()
            paup_args['tree'] = "gettrees file=%s storebrlens=yes;" % tf.name
        elif tree_est_criterion in ["nj", "upgma"]:
            paup_args['tree'] = tree_est_criterion
        else:
            paup_args['tree'] = "set crit=%s; hsearch; set crit=like;" % tree_est_criterion

        cf = tempfile.NamedTemporaryFile("w", delete=True)
        temp_files.append(cf)
        char_matrix.write_to_stream(cf, schema='nexus')
        cf.flush()
        paup_args['datafile'] = cf.name

        output_tree_file_handle = tempfile.NamedTemporaryFile("w+", delete=True)
        temp_files.append(output_tree_file_handle)
        output_tree_filepath = output_tree_file_handle.name
        paup_args['est_tree_file'] = output_tree_filepath

        paup_run = subprocess.Popen(['%s -n' % paup_path],
                                    shell=True,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE)
        stdout, stderr = processio.communicate(paup_run, paup_template % paup_args)

        results = {}
        for value_name in patterns:
            results[value_name] = None
        for line in stdout.split('\n'):
            for value_name, pattern in patterns.items():
                m = pattern.match(line)
                if m:
                    # a later matching line overrides an earlier one
                    results[value_name] = m.group(1)

        # Iterate over a snapshot of the keys: "log_likelihood" is added
        # inside the loop, and growing a dict while iterating over its live
        # key view raises RuntimeError on Python 3.
        for value_name in list(results):
            if value_name == 'likelihood':
                # NOTE(review): raises TypeError if the output held no
                # "-ln L" line, as the stored value is then still None.
                results[value_name] = -1 * float(results[value_name])
                results["log_likelihood"] = results[value_name]
            elif results[value_name] is not None:
                try:
                    results[value_name] = float(results[value_name])
                except ValueError:
                    # leave non-numeric captures as the matched text
                    pass

        t = dendropy.Tree.get_from_path(output_tree_filepath,
                "nexus",
                taxon_namespace=char_matrix.taxon_namespace)
    finally:
        for temp_file in temp_files:
            temp_file.close()
    return t, results
def estimate_tree(self, char_matrix, raxml_args=None):
    """
    Runs RAxML on ``char_matrix`` and returns the tree that RAxML writes to
    ``self.best_tree_fname``.

    Parameters
    ----------
    char_matrix : character matrix
        Written in PHYLIP format to ``self.input_seq_fname`` inside the
        working directory and passed to RAxML with ``-s``.
    raxml_args : iterable of strings or None
        Extra command-line arguments appended to the RAxML invocation.

    Returns
    -------
    best_tree : dendropy.Tree
        Tree read from the RAxML result file using ``char_matrix``'s taxon
        namespace, with taxon labels translated back through
        ``self.taxon_label_map``.

    Notes
    -----
    Exits the process (``sys.exit``) if RAxML returns a non-0 code or if
    the result file is missing.
    """
    # set up taxa
    taxa = char_matrix.taxon_namespace

    # create working directory
    self._create_working_dir()

    # remap taxon labels
    # (presumably _remap_taxon_labels fills self.taxon_label_map, which is
    # read back when the labels are restored below -- confirm)
    self.taxon_label_map = {}
    self._remap_taxon_labels(taxa)

    # clean working directory of previous runs
    self._preclean_working_dir()

    # write input sequences
    raxml_seqs_filepath = os.path.join(self.working_dir_path, self.input_seq_fname)
    with open(raxml_seqs_filepath, "w") as raxml_seqs_filepath_out:
        char_matrix.write_to_stream(raxml_seqs_filepath_out, "phylip")
    self.files_to_clean.append(raxml_seqs_filepath)
    self.files_to_clean.append(raxml_seqs_filepath + ".reduced")

    # run RAxML
    # list() lets any iterable of extra arguments be concatenated below
    raxml_args = [] if raxml_args is None else list(raxml_args)
    cmd = [self.raxml_path,
            '-m',
            'GTRCAT',
            '-s', raxml_seqs_filepath,
            '-n', self.name,
            '-p', str(random.randint(0, sys.maxsize))] + raxml_args
    if self.verbosity >= 2:
        # let RAxML write straight to the parent's streams
        stdout_pipe = None
        stderr_pipe = None
    else:
        stdout_pipe = subprocess.PIPE
        stderr_pipe = subprocess.PIPE
    p = subprocess.Popen(cmd,
        stdout=stdout_pipe,
        stderr=stderr_pipe,
        cwd=self.working_dir_path)
    stdout, stderr = processio.communicate(p)
    if p.returncode != 0:
        sys.stderr.write("[RAxML run failed]:\n\n%s\n\n" % (" ".join(cmd)))
        # Nothing is captured when the streams were not piped
        # (verbosity >= 2); writing None would raise TypeError and mask the
        # RAxML exit code.
        if stdout is not None:
            sys.stdout.write(stdout)
        if stderr is not None:
            sys.stderr.write(stderr)
        sys.exit(p.returncode)

    # read result
    raxml_best_tree_fpath = os.path.join(self.working_dir_path, self.best_tree_fname)
    if not os.path.exists(raxml_best_tree_fpath):
        self._send_error("RAxML result not found: {}".format(raxml_best_tree_fpath))
        sys.exit(1)
    best_tree = dendropy.Tree.get_from_path(raxml_best_tree_fpath,
            "newick",
            taxon_namespace=taxa)

    # remap labels
    for taxon in best_tree.taxon_namespace:
        taxon.label = self.taxon_label_map[taxon.label]

    # clean-up
    self._postclean_working_dir()

    # return result
    return best_tree
def map_bipartitions(self, target_tree_fpath, bootstrap_trees_fpaths):
    """
    Runs RAxML (``-f b``) with a target tree and a set of bootstrap trees,
    and returns the tree that RAxML writes to ``self.bipartitions_fname``.

    Parameters
    ----------
    target_tree_fpath : string
        Path to the file holding the target tree; only the first tree read
        from it is used.
    bootstrap_trees_fpaths : iterable of strings
        Paths to the files holding the bootstrap trees; all are pooled into
        a single list.

    Returns
    -------
    mapped_tree : dendropy.Tree
        Tree read from the RAxML result file, with taxon labels translated
        back through ``self.taxon_label_map``.

    Notes
    -----
    Exits the process (``sys.exit``) if an input file may not be
    overwritten, if RAxML returns a non-0 code, or if the result file is
    missing.
    """
    # set up taxa
    taxa = dendropy.TaxonNamespace()

    # read target tree
    target_tree_fpath = self._expand_path(target_tree_fpath)
    target_tree = self._get_trees(target_tree_fpath, taxon_namespace=taxa)[0]

    # read bootstrap trees
    boot_trees = dendropy.TreeList()
    for fpath in bootstrap_trees_fpaths:
        fpath = self._expand_path(fpath)
        self._get_trees(tree_filepath=fpath, tree_list=boot_trees, taxon_namespace=taxa)

    # create working directory
    self._create_working_dir()

    # remap taxon labels
    # (presumably _remap_taxon_labels fills self.taxon_label_map; it is the
    # mapping read back when the labels are restored below -- confirm)
    self.taxon_label_map = {}
    self._remap_taxon_labels(taxa)

    # write input target tree
    raxml_target_tree_filepath = os.path.join(self.working_dir_path, "{}.target_tree".format(self.name))
    if not self._check_overwrite(raxml_target_tree_filepath):
        sys.exit(0)
    target_tree.write_to_path(raxml_target_tree_filepath, "newick")
    self.files_to_clean.append(raxml_target_tree_filepath)

    # write input bootstrap trees
    raxml_bootstrap_trees_filepath = os.path.join(self.working_dir_path, "{}.boot_trees".format(self.name))
    if not self._check_overwrite(raxml_bootstrap_trees_filepath):
        sys.exit(0)
    boot_trees.write_to_path(raxml_bootstrap_trees_filepath, "newick")
    self.files_to_clean.append(raxml_bootstrap_trees_filepath)

    # write input (dummy) sequences
    raxml_seqs_filepath = os.path.join(self.working_dir_path, "{}.seqs".format(self.name))
    if not self._check_overwrite(raxml_seqs_filepath):
        sys.exit(0)
    with open(raxml_seqs_filepath, "w") as raxml_seqs_filepath_out:
        self._write_dummy_seqs(taxa, raxml_seqs_filepath_out)
    self.files_to_clean.append(raxml_seqs_filepath)

    # clean working directory of previous runs
    self._preclean_working_dir()

    # run RAxML; file arguments are base names because the process is
    # started inside the working directory
    cmd = [self.raxml_path,
            '-f', 'b',
            '-t', os.path.basename(raxml_target_tree_filepath),
            '-z', os.path.basename(raxml_bootstrap_trees_filepath),
            '-s', os.path.basename(raxml_seqs_filepath),
            '-m', 'GTRCAT',
            '-n', self.name]
    if self.verbosity >= 2:
        # let RAxML write straight to the parent's streams
        stdout_pipe = None
        stderr_pipe = None
    else:
        stdout_pipe = subprocess.PIPE
        stderr_pipe = subprocess.PIPE
    p = subprocess.Popen(cmd,
        stdout=stdout_pipe,
        stderr=stderr_pipe,
        cwd=self.working_dir_path)
    stdout, stderr = processio.communicate(p)
    if p.returncode != 0:
        self._send_error("RAxML run failed")
        if self.verbosity < 2:
            # output was captured only in this case
            sys.stdout.write(stdout)
            sys.stderr.write(stderr)
        sys.exit(p.returncode)

    # read result
    raxml_mapped_tree_fpath = os.path.join(self.working_dir_path, self.bipartitions_fname)
    if not os.path.exists(raxml_mapped_tree_fpath):
        self._send_error("RAxML result not found: {}".format(raxml_mapped_tree_fpath))
        sys.exit(1)
    mapped_tree = dendropy.Tree.get_from_path(raxml_mapped_tree_fpath, "newick")

    # remap labels: read the instance-level map reset above; a separate,
    # never-populated local dict was used here before, which raised KeyError
    # for every taxon
    for taxon in mapped_tree.taxon_namespace:
        taxon.label = self.taxon_label_map[taxon.label]

    # clean-up
    self.files_to_clean.append(raxml_mapped_tree_fpath)
    self.files_to_clean.append(self.info_fname)
    self._postclean_working_dir()

    # return result
    return mapped_tree