def coerce_string_to_nice_outfilename(p, reason, default): illegal_filename_pattern = re.compile(r'[^-_a-zA-Z0-9.]') j = "".join(illegal_filename_pattern.split(p)) if not j: j = default if j != p: MESSENGER.send_warning('%s name changed from "%s" to "%s" (a safer name for filepath)' % (reason, p, j)) return j
def killed_handler(n, frame):
    """Signal handler: kill the currently tracked job (if any), then exit.

    Reads the module-global _RunningJobs; always terminates the process
    via sys.exit().
    """
    global _RunningJobs
    if not _RunningJobs:
        MESSENGER.send_warning("signal killed_handler called with no jobs running. Exiting.\n")
    else:
        MESSENGER.send_warning("signal killed_handler called. Killing running jobs...\n")
        _RunningJobs.kill()
    sys.exit()
def coerce_string_to_nice_outfilename(p, reason, default): illegal_filename_pattern = re.compile(r'[^-_a-zA-Z0-9.]') j = "".join(illegal_filename_pattern.split(p)) if not j: j = default if j != p: MESSENGER.send_warning( '%s name changed from "%s" to "%s" (a safer name for filepath)' % (reason, p, j)) return j
def killed_handler(n, frame):
    """Signal handler installed while long-running jobs are active.

    Kills whatever _RunningJobs currently holds (a single job or a list
    of jobs), logging a warning per kill, then exits the process.
    """
    global _RunningJobs
    running = _RunningJobs
    if running:
        MESSENGER.send_warning("signal killed_handler called. Killing running jobs...\n")
        # _RunningJobs may hold either one job or a list of jobs;
        # normalize to a list so both cases share the kill loop.
        jobs = running if isinstance(running, list) else [running]
        for job in jobs:
            job.kill()
            MESSENGER.send_warning("kill called...\n")
    else:
        MESSENGER.send_warning("signal killed_handler called with no jobs running. Exiting.\n")
    sys.exit()
def killed_handler(n, frame):
    """Terminate any job(s) tracked in the module-global _RunningJobs and exit.

    Installed as a handler for SIGTERM/SIGABRT/SIGINT while jobs run.
    """
    global _RunningJobs
    if not _RunningJobs:
        # Nothing to clean up; report and leave immediately.
        MESSENGER.send_warning("signal killed_handler called with no jobs running. Exiting.\n")
        sys.exit()
    MESSENGER.send_warning("signal killed_handler called. Killing running jobs...\n")
    if isinstance(_RunningJobs, list):
        victims = _RunningJobs
    else:
        victims = [_RunningJobs]
    for victim in victims:
        victim.kill()
        MESSENGER.send_warning("kill called...\n")
    sys.exit()
def finish_sate_execution(sate_team, user_config, temporaries_dir, multilocus_dataset, sate_products):
    """Drive the main SATe run: obtain a starting tree, iterate SateJob, write results.

    Steps: save the effective config; optionally read a user-supplied starting
    tree from options.treefile (must happen before taxa are relabeled); relabel
    taxa; if no tree was given, optionally align the data and run an initial
    tree search; then run the SATe iteration (SateJob.run) and write the final
    alignment(s), tree, and likelihood score to the sate_products streams.

    Side effects: mutates the module-global _RunningJobs (so signal handlers
    can kill in-flight jobs), temporarily replaces the SIGTERM/SIGABRT/SIGINT
    handlers (restored in the finally block), relabels taxa in
    multilocus_dataset in place, and writes files under temporaries_dir.
    """
    global _RunningJobs
    # get the RAxML model
    #TODO: this should check for the tree_estimator. Currently we only support raxml, so this works...
    model = user_config.raxml.model
    options = user_config.commandline
    user_config.save_to_filepath(os.path.join(temporaries_dir, 'last_used.cfg'))
    if options.timesfile:
        # Touch the times file (creating intermediate directories) before
        # pointing the timing log at it.
        f = open_with_intermediates(options.timesfile, 'a')
        f.close()
        set_timing_log_filepath(options.timesfile)
    ############################################################################
    # We must read the incoming tree in before we call the get_sequences_for_sate
    # function that relabels that taxa in the dataset
    ######
    tree_file = options.treefile
    if tree_file:
        if not os.path.exists(tree_file):
            raise Exception('The tree file "%s" does not exist' % tree_file)
        tree_f = open(tree_file, 'rU')
        MESSENGER.send_info('Reading starting trees from "%s"...' % tree_file)
        tree_list = read_and_encode_splits(multilocus_dataset.dataset, tree_f)
        tree_f.close()
        if len(tree_list) > 1:
            MESSENGER.send_warning('%d starting trees found in "%s". The first tree will be used.'
                                   % (len(tree_list), tree_file))
        starting_tree = tree_list[0]
    # score stays None when the starting tree came from a file; it is set by
    # the initial tree search otherwise.
    score = None
    ############################################################################
    # This will relabel the taxa if they have problematic names
    #####
    multilocus_dataset.relabel_for_sate()
    options.aligned = all([i.is_aligned() for i in multilocus_dataset])
    ############################################################################
    # Launch threads to do work
    #####
    sate_config = user_config.get("sate")
    start_worker(sate_config.num_cpus)
    ############################################################################
    # Be prepared to kill any long running jobs
    #####
    prev_signals = []
    for sig in [signal.SIGTERM, signal.SIGABRT, signal.SIGINT]:  # signal.SIGABRT, signal.SIGBUS, signal.SIGINT, signal.SIGKILL, signal.SIGSTOP]:
        prev_handler = signal.signal(sig, killed_handler)
        prev_signals.append((sig, prev_handler))
    try:
        if tree_file:
            # getting the newick string here will allow us to get a string that is in terms of the correct taxon labels
            starting_tree_str = starting_tree.compose_newick()
        else:
            MESSENGER.send_info("Performing initial tree search to get starting tree...")
            if not options.aligned:
                MESSENGER.send_info("Performing initial alignment of the entire data matrix...")
                init_aln_dir = os.path.join(temporaries_dir, 'init_aln')
                init_aln_dir = sate_team.temp_fs.create_subdir(init_aln_dir)
                delete_aln_temps = not (options.keeptemp and options.keepalignmenttemps)
                new_alignment_list = []
                # Align each locus sequentially, tracking the live job in
                # _RunningJobs so the signal handler can kill it.
                for unaligned_seqs in multilocus_dataset:
                    job = sate_team.aligner.create_job(unaligned_seqs,
                                                       tmp_dir_par=init_aln_dir,
                                                       context_str="initalign",
                                                       delete_temps=delete_aln_temps)
                    _RunningJobs = job
                    jobq.put(job)
                    new_alignment = job.get_results()
                    _RunningJobs = None
                    new_alignment_list.append(new_alignment)
                for locus_index, new_alignment in enumerate(new_alignment_list):
                    multilocus_dataset[locus_index] = new_alignment
                if delete_aln_temps:
                    sate_team.temp_fs.remove_dir(init_aln_dir)
            else:
                MESSENGER.send_info("Input sequences assumed to be aligned (based on sequence lengths).")
            MESSENGER.send_info("Performing initial tree search to get starting tree...")
            init_tree_dir = os.path.join(temporaries_dir, 'init_tree')
            init_tree_dir = sate_team.temp_fs.create_subdir(init_tree_dir)
            delete_tree_temps = not options.keeptemp
            job = sate_team.tree_estimator.create_job(multilocus_dataset,
                                                      tmp_dir_par=init_tree_dir,
                                                      num_cpus=sate_config.num_cpus,
                                                      context_str="inittree",
                                                      delete_temps=delete_tree_temps)
            _RunningJobs = job
            jobq.put(job)
            score, starting_tree_str = job.get_results()
            _RunningJobs = None
            if delete_tree_temps:
                sate_team.temp_fs.remove_dir(init_tree_dir)
        _LOG.debug('We have the tree and whole_alignment, partitions...')
        sate_config_dict = sate_config.dict()
        if options.keeptemp:
            sate_config_dict['keep_iteration_temporaries'] = True
            if options.keepalignmenttemps:
                sate_config_dict['keep_realignment_temporaries'] = True
        job = SateJob(multilocus_dataset=multilocus_dataset,
                      sate_team=sate_team,
                      name=options.job,
                      status_messages=MESSENGER.send_info,
                      **sate_config_dict)
        job.tree_str = starting_tree_str
        if score is not None:
            # Seed the optimum with the initial-search result so the first
            # SATe iteration has a baseline to improve on.
            job.store_optimum_results(new_multilocus_dataset=multilocus_dataset,
                                      new_tree_str=starting_tree_str,
                                      new_score=score,
                                      curr_timestamp=time.time())
        _RunningJobs = job
        MESSENGER.send_info("Starting SATe algorithm on initial tree...")
        job.run(tmp_dir_par=temporaries_dir)
        _RunningJobs = None
        # Output uses the original (pre-relabeling) taxon names.
        job.multilocus_dataset.restore_taxon_names()
        assert len(sate_products.alignment_streams) == len(job.multilocus_dataset)
        for i, alignment in enumerate(job.multilocus_dataset):
            alignment_stream = sate_products.alignment_streams[i]
            MESSENGER.send_info("Writing final alignment to %s" % alignment_stream.name)
            alignment.write(alignment_stream, file_format="FASTA")
            alignment_stream.close()
        MESSENGER.send_info("Writing final tree to %s" % sate_products.tree_stream.name)
        tree_str = job.tree.compose_newick()
        sate_products.tree_stream.write("%s;\n" % tree_str)
        #outtree_fn = options.result
        #if outtree_fn is None:
        #    if options.multilocus:
        #        outtree_fn = os.path.join(seqdir, "combined_%s.tre" % options.job)
        #    else:
        #        outtree_fn = aln_filename + ".tre"
        #MESSENGER.send_info("Writing final tree to %s" % outtree_fn)
        #tree_str = job.tree.compose_newick()
        #sate_products.tree_stream.write("%s;\n" % tree_str)
        MESSENGER.send_info("Writing final likelihood score to %s" % sate_products.score_stream.name)
        sate_products.score_stream.write("%s\n" % job.score)
    finally:
        # Restore the signal handlers we replaced above.
        for el in prev_signals:
            sig, prev_handler = el
            if prev_handler is None:
                signal.signal(sig, signal.SIG_DFL)
            else:
                signal.signal(sig, prev_handler)
def finish_sate_execution(sate_team, user_config, temporaries_dir, multilocus_dataset, sate_products):
    """Drive the full SATe run (extended version with two-phase and RAxML post-search).

    Obtains a starting tree (from --treefile or via an initial alignment +
    tree search), runs SateJob.run unless --two-phase is set, optionally runs
    a post-processing RAxML search, and writes the resulting alignment(s),
    tree, and score to the sate_products streams. Also exports a safe-name /
    original-name translation file and handles RNA<->DNA conversion.

    Side effects: mutates the module-global _RunningJobs, temporarily swaps
    the SIGTERM/SIGABRT/SIGINT handlers (restored in the finally block),
    relabels taxa in multilocus_dataset in place, and writes files under
    temporaries_dir.
    """
    global _RunningJobs
    # get the RAxML model
    #TODO: this should check for the tree_estimator. Currently we only support raxml, so this works...
    model = user_config.raxml.model
    options = user_config.commandline
    user_config.save_to_filepath(os.path.join(temporaries_dir, 'last_used.cfg'))
    if options.timesfile:
        # Touch the times file (creating intermediate directories) before
        # pointing the timing log at it.
        f = open_with_intermediates(options.timesfile, 'a')
        f.close()
        set_timing_log_filepath(options.timesfile)
    ############################################################################
    # We must read the incoming tree in before we call the get_sequences_for_sate
    # function that relabels that taxa in the dataset
    ######
    alignment_as_tmp_filename_to_report = None
    tree_as_tmp_filename_to_report = None
    tree_file = options.treefile
    if tree_file:
        if not os.path.exists(tree_file):
            raise Exception('The tree file "%s" does not exist' % tree_file)
        tree_f = open(tree_file, 'rU')
        MESSENGER.send_info('Reading starting trees from "%s"...' % tree_file)
        try:
            tree_list = read_and_encode_splits(multilocus_dataset.dataset, tree_f, starting_tree=True)
        except KeyError:
            # Most likely cause: tree labels do not match sequence names.
            MESSENGER.send_error("Error in reading the treefile, probably due to a name in the tree that does not match the names in the input sequence files.\n")
            raise
        except:
            MESSENGER.send_error("Error in reading the treefile.\n")
            raise
        tree_f.close()
        if len(tree_list) > 1:
            MESSENGER.send_warning('%d starting trees found in "%s". The first tree will be used.'
                                   % (len(tree_list), tree_file))
        starting_tree = tree_list[0]
        tree_as_tmp_filename_to_report = tree_file
    # score stays None when the starting tree came from a file; it is set by
    # the initial tree search otherwise.
    score = None
    ############################################################################
    # This will relabel the taxa if they have problematic names
    #####
    multilocus_dataset.relabel_for_sate()
    ############################################################################
    # This ensures all nucleotide data is DNA internally
    #####
    restore_to_rna = False
    if user_config.commandline.datatype.upper() == 'RNA':
        multilocus_dataset.convert_rna_to_dna()
        user_config.commandline.datatype = 'DNA'
        restore_to_rna = True
    # Best-effort export of the safe-name -> original-name mapping.
    export_names = True
    if export_names:
        try:
            name_filename = sate_products.get_abs_path_for_tag('name_translation.txt')
            name_output = open(name_filename, 'w')
            safe2real = multilocus_dataset.safe_to_real_names
            safe_list = safe2real.keys()
            safe_list.sort()
            for safe in safe_list:
                orig = safe2real[safe][0]
                name_output.write("%s\n%s\n\n" % (safe, orig))
            name_output.close()
            MESSENGER.send_info("Name translation information saved to %s as safe name, original name, blank line format."
                                % name_filename)
        except:
            MESSENGER.send_info("Error exporting saving name translation to %s" % name_filename)
    if options.aligned:
        # Trust the flag only if every locus really looks aligned.
        options.aligned = all([i.is_aligned() for i in multilocus_dataset])
    ############################################################################
    # Launch threads to do work
    #####
    sate_config = user_config.get("sate")
    start_worker(sate_config.num_cpus)
    ############################################################################
    # Be prepared to kill any long running jobs
    #####
    prev_signals = []
    for sig in [signal.SIGTERM, signal.SIGABRT, signal.SIGINT]:  # signal.SIGABRT, signal.SIGBUS, signal.SIGINT, signal.SIGKILL, signal.SIGSTOP]:
        prev_handler = signal.signal(sig, killed_handler)
        prev_signals.append((sig, prev_handler))
    try:
        if (not options.two_phase) and tree_file:
            # getting the newick string here will allow us to get a string that is in terms of the correct taxon labels
            starting_tree_str = starting_tree.compose_newick()
        else:
            if not options.two_phase:
                MESSENGER.send_info("Creating a starting tree for the SATe algorithm...")
            if (options.two_phase) or (not options.aligned):
                MESSENGER.send_info("Performing initial alignment of the entire data matrix...")
                init_aln_dir = os.path.join(temporaries_dir, 'init_aln')
                init_aln_dir = sate_team.temp_fs.create_subdir(init_aln_dir)
                delete_aln_temps = not (options.keeptemp and options.keepalignmenttemps)
                new_alignment_list = []
                aln_job_list = []
                # Create all per-locus alignment jobs first, then queue them,
                # so they can run concurrently; _RunningJobs tracks the batch.
                for unaligned_seqs in multilocus_dataset:
                    job = sate_team.aligner.create_job(unaligned_seqs,
                                                       tmp_dir_par=init_aln_dir,
                                                       context_str="initalign",
                                                       delete_temps=delete_aln_temps)
                    aln_job_list.append(job)
                _RunningJobs = aln_job_list
                for job in aln_job_list:
                    jobq.put(job)
                for job in aln_job_list:
                    new_alignment = job.get_results()
                    new_alignment_list.append(new_alignment)
                _RunningJobs = None
                for locus_index, new_alignment in enumerate(new_alignment_list):
                    multilocus_dataset[locus_index] = new_alignment
                if delete_aln_temps:
                    sate_team.temp_fs.remove_dir(init_aln_dir)
            else:
                MESSENGER.send_info("Input sequences assumed to be aligned (based on sequence lengths).")
            MESSENGER.send_info("Performing initial tree search to get starting tree...")
            init_tree_dir = os.path.join(temporaries_dir, 'init_tree')
            init_tree_dir = sate_team.temp_fs.create_subdir(init_tree_dir)
            delete_tree_temps = not options.keeptemp
            job = sate_team.tree_estimator.create_job(multilocus_dataset,
                                                      tmp_dir_par=init_tree_dir,
                                                      num_cpus=sate_config.num_cpus,
                                                      context_str="inittree",
                                                      delete_temps=delete_tree_temps,
                                                      sate_products=sate_products,
                                                      step_num='initialsearch')
            _RunningJobs = job
            jobq.put(job)
            score, starting_tree_str = job.get_results()
            score = TransformScore(multilocus_dataset, score).execute()  # MAN: need to transform score (ml) to our composite 5 objective score: simg, simng, sp, gap, ml
            _RunningJobs = None
            alignment_as_tmp_filename_to_report = sate_products.get_abs_path_for_iter_output("initialsearch", TEMP_SEQ_ALIGNMENT_TAG, allow_existing=True)
            tree_as_tmp_filename_to_report = sate_products.get_abs_path_for_iter_output("initialsearch", TEMP_TREE_TAG, allow_existing=True)
            if delete_tree_temps:
                sate_team.temp_fs.remove_dir(init_tree_dir)
        _LOG.debug('We have the tree and whole_alignment, partitions...')
        sate_config_dict = sate_config.dict()
        if options.keeptemp:
            sate_config_dict['keep_iteration_temporaries'] = True
            if options.keepalignmenttemps:
                sate_config_dict['keep_realignment_temporaries'] = True
        job = SateJob(multilocus_dataset=multilocus_dataset,
                      sate_team=sate_team,
                      name=options.job,
                      status_messages=MESSENGER.send_info,
                      score=score,  # MAN: to init best_score
                      **sate_config_dict)
        job.tree_str = starting_tree_str
        job.curr_iter_align_tmp_filename = alignment_as_tmp_filename_to_report
        job.curr_iter_tree_tmp_filename = tree_as_tmp_filename_to_report
        if score is not None:
            # Seed the optimum with the initial-search result so the first
            # SATe iteration has a baseline to improve on.
            job.store_optimum_results(new_multilocus_dataset=multilocus_dataset,
                                      new_tree_str=starting_tree_str,
                                      new_score=score,
                                      curr_timestamp=time.time())
        if options.two_phase:
            MESSENGER.send_info("Exiting with the initial tree because the SATe algorithm is avoided when the --two-phase option is used.")
        else:
            _RunningJobs = job
            MESSENGER.send_info("Starting SATe algorithm on initial tree...")
            job.run(tmp_dir_par=temporaries_dir, sate_products=sate_products)
            _RunningJobs = None
            if job.return_final_tree_and_alignment:
                alignment_as_tmp_filename_to_report = job.curr_iter_align_tmp_filename
            else:
                alignment_as_tmp_filename_to_report = job.best_alignment_tmp_filename
        if user_config.commandline.raxml_search_after:
            raxml_model = user_config.raxml.model.strip()
            if not raxml_model:
                # Derive a RAxML model string from the FastTree settings.
                dt = user_config.commandline.datatype
                mf = sate_team.tree_estimator.model
                ms = fasttree_to_raxml_model_str(dt, mf)
                sate_team.raxml_tree_estimator.model = ms
            rte = sate_team.raxml_tree_estimator
            MESSENGER.send_info("Performing post-processing tree search in RAxML...")
            post_tree_dir = os.path.join(temporaries_dir, 'post_tree')
            post_tree_dir = sate_team.temp_fs.create_subdir(post_tree_dir)
            delete_tree_temps = not options.keeptemp
            starting_tree = None
            if user_config.sate.start_tree_search_from_current:
                starting_tree = job.tree
            post_job = rte.create_job(job.multilocus_dataset,
                                      starting_tree=starting_tree,
                                      num_cpus=sate_config.num_cpus,
                                      context_str="postraxtree",
                                      tmp_dir_par=post_tree_dir,
                                      delete_temps=delete_tree_temps,
                                      sate_products=sate_products,
                                      step_num="postraxtree")
            _RunningJobs = post_job
            jobq.put(post_job)
            post_score, post_tree = post_job.get_results()
            _RunningJobs = None
            tree_as_tmp_filename_to_report = sate_products.get_abs_path_for_iter_output("postraxtree", TEMP_TREE_TAG, allow_existing=True)
            if delete_tree_temps:
                sate_team.temp_fs.remove_dir(post_tree_dir)
            job.tree_str = post_tree
            job.score = post_score
            if post_score > job.best_score:
                job.best_tree_str = post_tree
                job.best_score = post_score
        else:
            if job.return_final_tree_and_alignment:
                tree_as_tmp_filename_to_report = job.curr_iter_tree_tmp_filename
            else:
                tree_as_tmp_filename_to_report = job.best_tree_tmp_filename
        #######################################################################
        # Restore original taxon names and RNA characters
        #####
        job.multilocus_dataset.restore_taxon_names()
        if restore_to_rna:
            job.multilocus_dataset.convert_dna_to_rna()
            user_config.commandline.datatype = 'RNA'
        assert len(sate_products.alignment_streams) == len(job.multilocus_dataset)
        for i, alignment in enumerate(job.multilocus_dataset):
            alignment_stream = sate_products.alignment_streams[i]
            MESSENGER.send_info("Writing resulting alignment to %s" % alignment_stream.name)
            alignment.write(alignment_stream, file_format="FASTA")
            alignment_stream.close()
        MESSENGER.send_info("Writing resulting tree to %s" % sate_products.tree_stream.name)
        tree_str = job.tree.compose_newick()
        sate_products.tree_stream.write("%s;\n" % tree_str)
        #outtree_fn = options.result
        #if outtree_fn is None:
        #    if options.multilocus:
        #        outtree_fn = os.path.join(seqdir, "combined_%s.tre" % options.job)
        #    else:
        #        outtree_fn = aln_filename + ".tre"
        #MESSENGER.send_info("Writing resulting tree to %s" % outtree_fn)
        #tree_str = job.tree.compose_newick()
        #sate_products.tree_stream.write("%s;\n" % tree_str)
        MESSENGER.send_info("Writing resulting likelihood score to %s" % sate_products.score_stream.name)
        sate_products.score_stream.write("%s\n" % job.score)
        if alignment_as_tmp_filename_to_report is not None:
            MESSENGER.send_info('The resulting alignment (with the names in a "safe" form) was first written as the file "%s"' % alignment_as_tmp_filename_to_report)
        if tree_as_tmp_filename_to_report is not None:
            MESSENGER.send_info('The resulting tree (with the names in a "safe" form) was first written as the file "%s"' % tree_as_tmp_filename_to_report)
    finally:
        # Restore the signal handlers we replaced above.
        for el in prev_signals:
            sig, prev_handler = el
            if prev_handler is None:
                signal.signal(sig, signal.SIG_DFL)
            else:
                signal.signal(sig, prev_handler)
def run(self, tmp_dir_par, sate_products=None):
    """Execute the SATe iteration loop.

    Each iteration: decompose the current tree per the break strategy,
    realign the subproblems (SateAlignerJob), infer a new tree on the new
    alignment, and accept/reject the result by score (or blind mode).
    On exit, self.multilocus_dataset / self.tree_str / self.score hold
    either the final-iteration or best-so-far results, depending on
    self.return_final_tree_and_alignment.

    Parameters:
        tmp_dir_par   -- existing directory for per-iteration temp subdirs
        sate_products -- product-path helper used for per-iteration output
                         filenames (assumed non-None when iterating — TODO confirm)
    Raises:
        RuntimeError when self.killed is set mid-run.
    """
    assert (os.path.exists(tmp_dir_par))
    self._reset_current_run_settings()
    self._reset_jobs()
    self.start_time = time.time()
    self.last_improvement_time = self.start_time
    num_non_update_iter = 0  # NOTE(review): never read in this method
    configuration = self.configuration()
    # Here we check if the max_subproblem_frac is more stringent than max_subproblem_size
    frac_max = int(math.ceil(self.max_subproblem_frac * self.tree.n_leaves))
    if frac_max > self.max_subproblem_size:
        configuration['max_subproblem_size'] = frac_max
        MESSENGER.send_info('Max subproblem set to {0}'.format(
            configuration['max_subproblem_size']))
    if configuration['max_subproblem_size'] >= self.tree.n_leaves:
        # NOTE(review): both placeholders below are {0}; the second was
        # presumably meant to be {1} (n_leaves) — confirm before changing.
        MESSENGER.send_warning('''\n
WARNING: you have specified a max subproblem ({0}) that is equal to or greater
than the number of taxa ({0}). Thus, the SATe algorithm will not be invoked
under the current configuration (i.e., no tree decomposition will occur). If
you did not intend for this behavior (which you probably did not since you
are using SATe) please adjust your settings for the max subproblem and try
running SATe again. If you intended to use SATe to align your data with the
specified aligner tool *without* any decomposition, you can ignore this
message.\n'''.format(configuration['max_subproblem_size'], self.tree.n_leaves))
    delete_iteration_temps = not self.keep_iteration_temporaries
    delete_realignment_temps = delete_iteration_temps or (not self.keep_realignment_temporaries)
    configuration['delete_temps'] = delete_realignment_temps
    while self._keep_iterating():
        record_timestamp(os.path.join(tmp_dir_par, 'start_sateiter_timestamp.txt'))
        # create a subdirectory for this iteration
        curr_iter_tmp_dir_par = os.path.join(tmp_dir_par, 'step' + str(self.current_iteration))
        curr_iter_tmp_dir_par = self.sate_team.temp_fs.create_subdir(curr_iter_tmp_dir_par)
        _LOG.debug('directory %s created' % curr_iter_tmp_dir_par)
        break_strategy_index = 0
        this_iter_score_improved = False
        # Try break strategies in order until one is accepted or we run out.
        while True:
            break_strategy = self._get_break_strategy(break_strategy_index)
            if not bool(break_strategy):
                break
            context_str = "iter%d-%s" % (self.current_iteration, break_strategy)
            # create a subdirectory for this iteration/break_strategy
            curr_tmp_dir_par = os.path.join(curr_iter_tmp_dir_par, break_strategy)
            curr_tmp_dir_par = self.sate_team.temp_fs.create_subdir(curr_tmp_dir_par)
            record_timestamp(os.path.join(curr_tmp_dir_par, 'start_align_timestamp.txt'))
            # Align (with decomposition...)
            self.status('Step %d. Realigning with decomposition strategy set to %s' % (self.current_iteration, break_strategy))
            if self.killed:
                raise RuntimeError("SATe Job killed")
            tree_for_aligner = self.get_tree_copy()
            tree_for_aligner = self.get_tree_copy()  # NOTE(review): duplicated call; second assignment is redundant
            aligner = SateAlignerJob(multilocus_dataset=self.multilocus_dataset,
                                     sate_team=self.sate_team,
                                     tree=tree_for_aligner,
                                     tmp_base_dir=curr_tmp_dir_par,
                                     reset_recursion_index=True,
                                     **configuration)
            self.sate_aligner_job = aligner
            aligner.launch_alignment(break_strategy=break_strategy,
                                     context_str=context_str)
            new_multilocus_dataset = aligner.get_results()
            self.sate_aligner_job = None
            del aligner
            record_timestamp(os.path.join(curr_tmp_dir_par, 'start_treeinference_timestamp.txt'))
            # Tree inference
            if self.start_tree_search_from_current:
                start_from = self.tree
            else:
                start_from = None
            self.status('Step %d. Alignment obtained. Tree inference beginning...' % (self.current_iteration))
            if self.killed:
                raise RuntimeError("SATe Job killed")
            tbj = self.sate_team.tree_estimator.create_job(new_multilocus_dataset,
                                                           starting_tree=start_from,
                                                           num_cpus=self.num_cpus,
                                                           context_str=context_str + " tree",
                                                           tmp_dir_par=curr_tmp_dir_par,
                                                           delete_temps=delete_iteration_temps,
                                                           sate_products=sate_products,
                                                           step_num=self.current_iteration)
            # Remember previous temp filenames so they can be restored if
            # this candidate is rejected.
            prev_curr_align = self.curr_iter_align_tmp_filename
            prev_curr_tree = self.curr_iter_tree_tmp_filename
            self.curr_iter_align_tmp_filename = sate_products.get_abs_path_for_iter_output(self.current_iteration, TEMP_SEQ_ALIGNMENT_TAG, allow_existing=True)
            self.curr_iter_tree_tmp_filename = sate_products.get_abs_path_for_iter_output(self.current_iteration, TEMP_TREE_TAG, allow_existing=True)
            self.tree_build_job = tbj
            jobq.put(tbj)
            new_score, new_tree_str = tbj.get_results()
            self.tree_build_job = None
            del tbj
            if self.killed:
                raise RuntimeError("SATe Job killed")
            record_timestamp(os.path.join(curr_tmp_dir_par, 'end_treeinference_timestamp.txt'))
            curr_timestamp = time.time()
            accept_iteration = False
            if self.score is None:
                self.score = new_score
            if self.best_score is None or new_score > self.best_score:
                self.store_optimum_results(new_multilocus_dataset,
                                           new_tree_str,
                                           new_score,
                                           curr_timestamp)
                this_iter_score_improved = True
                accept_iteration = True
            # Blind mode accepts the candidate even without score improvement.
            if self._get_accept_mode(new_score=new_score, break_strategy_index=break_strategy_index) == AcceptMode.BLIND_MODE:
                if self.blind_mode_is_final:
                    self.is_stuck_in_blind = True
                    if self.switch_to_blind_timestamp is None:
                        if self._blindmode_trigger:
                            _LOG.debug("Blind runmode trigger = %s" % self._blindmode_trigger)
                        self.switch_to_blind_iter = self.current_iteration
                        self.switch_to_blind_timestamp = curr_timestamp
                accept_iteration = True
            if accept_iteration:
                self.score = new_score
                self.multilocus_dataset = new_multilocus_dataset
                self.tree_str = new_tree_str
                if this_iter_score_improved:
                    self.status('realignment accepted and score improved.')
                else:
                    self.status('realignment accepted and despite the score not improving.')
                # we do not want to continue to try different breaking strategies for this iteration so we break
                self.status('current score: %s, best score: %s' % (self.score, self.best_score))
                break
            else:
                self.status('realignment NOT accepted.')
                self.curr_iter_align_tmp_filename = prev_curr_align
                self.curr_iter_tree_tmp_filename = prev_curr_tree
            break_strategy_index += 1
            # self.status('current score: %s, best score: %s' % (self.score, self.best_score) )
        if not this_iter_score_improved:
            self.num_iter_since_imp += 1
        self.current_iteration += 1
    if self._termination_trigger:
        _LOG.debug("Termination trigger = %s" % self._termination_trigger)
    record_timestamp(os.path.join(tmp_dir_par, 'end_sateiter_timestamp.txt'))
    ### TODO: if configuration is 'return_final_iter_TreeAndAlignpair', then skip the following three lines
    if not self.return_final_tree_and_alignment:
        # Copy the best-so-far results back into the "current" slots.
        self.multilocus_dataset = self.best_multilocus_dataset.new_with_shared_meta()
        for locus_alignment in self.best_multilocus_dataset:
            self.multilocus_dataset.append(copy.copy(locus_alignment))
        self.tree_str = self.best_tree_str
        self.score = self.best_score
    else:
        assert self.multilocus_dataset is not None
        assert self.tree_str is not None
        assert self.score is not None
def run(self, tmp_dir_par, sate_products=None):
    """Run the SATe realign/re-estimate loop until the stopping rule fires.

    Per iteration: pick a break strategy, decompose and realign
    (SateAlignerJob), infer a tree on the new alignment, then accept or
    reject by score (blind mode can force acceptance). Afterwards, either
    the final-iteration or the best results are left in
    self.multilocus_dataset / self.tree_str / self.score, per
    self.return_final_tree_and_alignment.

    Parameters:
        tmp_dir_par   -- existing directory for per-iteration temp subdirs
        sate_products -- product-path helper for iteration output names
                         (assumed non-None when iterating — TODO confirm)
    Raises:
        RuntimeError when self.killed is set mid-run.
    """
    assert(os.path.exists(tmp_dir_par))
    self._reset_current_run_settings()
    self._reset_jobs()
    self.start_time = time.time()
    self.last_improvement_time = self.start_time
    num_non_update_iter = 0  # NOTE(review): never read in this method
    configuration = self.configuration()
    # Here we check if the max_subproblem_frac is more stringent than max_subproblem_size
    frac_max = int(math.ceil(self.max_subproblem_frac*self.tree.n_leaves))
    if frac_max > self.max_subproblem_size:
        configuration['max_subproblem_size'] = frac_max
        MESSENGER.send_info('Max subproblem set to {0}'.format(
            configuration['max_subproblem_size']))
    if configuration['max_subproblem_size'] >= self.tree.n_leaves:
        # NOTE(review): both placeholders below are {0}; the second was
        # presumably meant to be {1} (n_leaves) — confirm before changing.
        MESSENGER.send_warning('''\n
WARNING: you have specified a max subproblem ({0}) that is equal to or greater
than the number of taxa ({0}). Thus, the SATe algorithm will not be invoked
under the current configuration (i.e., no tree decomposition will occur). If
you did not intend for this behavior (which you probably did not since you
are using SATe) please adjust your settings for the max subproblem and try
running SATe again. If you intended to use SATe to align your data with the
specified aligner tool *without* any decomposition, you can ignore this
message.\n'''.format(configuration['max_subproblem_size'], self.tree.n_leaves))
    delete_iteration_temps = not self.keep_iteration_temporaries
    delete_realignment_temps = delete_iteration_temps or (not self.keep_realignment_temporaries)
    configuration['delete_temps'] = delete_realignment_temps
    while self._keep_iterating():
        record_timestamp(os.path.join(tmp_dir_par, 'start_sateiter_timestamp.txt'))
        # create a subdirectory for this iteration
        curr_iter_tmp_dir_par = os.path.join(tmp_dir_par, 'step' + str(self.current_iteration))
        curr_iter_tmp_dir_par = self.sate_team.temp_fs.create_subdir(curr_iter_tmp_dir_par)
        _LOG.debug('directory %s created' % curr_iter_tmp_dir_par)
        break_strategy_index = 0
        this_iter_score_improved = False
        # Cycle through break strategies until one is accepted or exhausted.
        while True:
            break_strategy = self._get_break_strategy(break_strategy_index)
            if not bool(break_strategy):
                break
            context_str = "iter%d-%s" % (self.current_iteration, break_strategy)
            # create a subdirectory for this iteration/break_strategy
            curr_tmp_dir_par = os.path.join(curr_iter_tmp_dir_par, break_strategy)
            curr_tmp_dir_par = self.sate_team.temp_fs.create_subdir(curr_tmp_dir_par)
            record_timestamp(os.path.join(curr_tmp_dir_par, 'start_align_timestamp.txt'))
            # Align (with decomposition...)
            self.status('Step %d. Realigning with decomposition strategy set to %s' % (self.current_iteration, break_strategy))
            if self.killed:
                raise RuntimeError("SATe Job killed")
            tree_for_aligner = self.get_tree_copy()
            tree_for_aligner = self.get_tree_copy()  # NOTE(review): duplicated call; second assignment is redundant
            aligner = SateAlignerJob(multilocus_dataset=self.multilocus_dataset,
                                     sate_team=self.sate_team,
                                     tree=tree_for_aligner,
                                     tmp_base_dir=curr_tmp_dir_par,
                                     reset_recursion_index=True,
                                     **configuration)
            self.sate_aligner_job = aligner
            aligner.launch_alignment(break_strategy=break_strategy,
                                     context_str=context_str)
            new_multilocus_dataset = aligner.get_results()
            self.sate_aligner_job = None
            del aligner
            record_timestamp(os.path.join(curr_tmp_dir_par, 'start_treeinference_timestamp.txt'))
            # Tree inference
            if self.start_tree_search_from_current:
                start_from = self.tree
            else:
                start_from = None
            self.status('Step %d. Alignment obtained. Tree inference beginning...' % (self.current_iteration))
            if self.killed:
                raise RuntimeError("SATe Job killed")
            tbj = self.sate_team.tree_estimator.create_job(new_multilocus_dataset,
                                                           starting_tree=start_from,
                                                           num_cpus=self.num_cpus,
                                                           context_str=context_str + " tree",
                                                           tmp_dir_par=curr_tmp_dir_par,
                                                           delete_temps=delete_iteration_temps,
                                                           sate_products=sate_products,
                                                           step_num=self.current_iteration)
            # Save previous temp filenames so a rejected candidate can be
            # rolled back below.
            prev_curr_align = self.curr_iter_align_tmp_filename
            prev_curr_tree = self.curr_iter_tree_tmp_filename
            self.curr_iter_align_tmp_filename = sate_products.get_abs_path_for_iter_output(self.current_iteration, TEMP_SEQ_ALIGNMENT_TAG, allow_existing=True)
            self.curr_iter_tree_tmp_filename = sate_products.get_abs_path_for_iter_output(self.current_iteration, TEMP_TREE_TAG, allow_existing=True)
            self.tree_build_job = tbj
            jobq.put(tbj)
            new_score, new_tree_str = tbj.get_results()
            self.tree_build_job = None
            del tbj
            if self.killed:
                raise RuntimeError("SATe Job killed")
            record_timestamp(os.path.join(curr_tmp_dir_par, 'end_treeinference_timestamp.txt'))
            curr_timestamp = time.time()
            accept_iteration = False
            if self.score is None:
                self.score = new_score
            if self.best_score is None or new_score > self.best_score:
                self.store_optimum_results(new_multilocus_dataset,
                                           new_tree_str,
                                           new_score,
                                           curr_timestamp)
                this_iter_score_improved = True
                accept_iteration = True
            # Blind mode accepts the candidate even without score improvement.
            if self._get_accept_mode(new_score=new_score, break_strategy_index=break_strategy_index) == AcceptMode.BLIND_MODE:
                if self.blind_mode_is_final:
                    self.is_stuck_in_blind = True
                    if self.switch_to_blind_timestamp is None:
                        if self._blindmode_trigger:
                            _LOG.debug("Blind runmode trigger = %s" % self._blindmode_trigger)
                        self.switch_to_blind_iter = self.current_iteration
                        self.switch_to_blind_timestamp = curr_timestamp
                accept_iteration = True
            if accept_iteration:
                self.score = new_score
                self.multilocus_dataset = new_multilocus_dataset
                self.tree_str = new_tree_str
                if this_iter_score_improved:
                    self.status('realignment accepted and score improved.')
                else:
                    self.status('realignment accepted and despite the score not improving.')
                # we do not want to continue to try different breaking strategies for this iteration so we break
                self.status('current score: %s, best score: %s' % (self.score, self.best_score))
                break
            else:
                self.status('realignment NOT accepted.')
                self.curr_iter_align_tmp_filename = prev_curr_align
                self.curr_iter_tree_tmp_filename = prev_curr_tree
            break_strategy_index += 1
            # self.status('current score: %s, best score: %s' % (self.score, self.best_score) )
        if not this_iter_score_improved:
            self.num_iter_since_imp += 1
        self.current_iteration += 1
    if self._termination_trigger:
        _LOG.debug("Termination trigger = %s" % self._termination_trigger)
    record_timestamp(os.path.join(tmp_dir_par, 'end_sateiter_timestamp.txt'))
    ### TODO: if configuration is 'return_final_iter_TreeAndAlignpair', then skip the following three lines
    if not self.return_final_tree_and_alignment:
        # Copy the best-so-far results back into the "current" slots.
        self.multilocus_dataset = self.best_multilocus_dataset.new_with_shared_meta()
        for locus_alignment in self.best_multilocus_dataset:
            self.multilocus_dataset.append(copy.copy(locus_alignment))
        self.tree_str = self.best_tree_str
        self.score = self.best_score
    else:
        assert self.multilocus_dataset is not None
        assert self.tree_str is not None
        assert self.score is not None
def finish_sate_execution(sate_team,
                          user_config,
                          temporaries_dir,
                          multilocus_dataset,
                          sate_products):
    """Drive the full SATe run after the input data have been read.

    Steps, in order:
      1. optionally read a user-supplied starting tree (``options.treefile``),
      2. relabel taxa to tool-safe names (writing a name-translation file),
      3. build an initial alignment and/or starting tree if none was given,
      4. run the iterative SATe algorithm (skipped under ``--two-phase``),
      5. optionally run a post-processing RAxML search,
      6. restore original names/RNA characters and write the final
         alignment(s), tree, and likelihood score to ``sate_products`` streams.

    Side effects: spawns worker threads (``start_worker``), installs
    SIGTERM/SIGABRT/SIGINT handlers (restored in the ``finally`` block),
    mutates ``multilocus_dataset`` and ``user_config``, and publishes the
    active job in the module-global ``_RunningJobs`` so ``killed_handler``
    can kill it. All parameters are project objects; returns None.
    """
    global _RunningJobs
    #TODO: this should check for the tree_estimator. Currently we only support raxml, so this works...
    # NOTE(review): the unused local ``model = user_config.raxml.model`` was
    # removed; the RAxML model is re-read below when raxml_search_after is set.
    options = user_config.commandline
    # Record the exact configuration used for this run.
    user_config.save_to_filepath(os.path.join(temporaries_dir, 'last_used.cfg'))
    if options.timesfile:
        # Touch the timing log (creating intermediate directories) before use.
        f = open_with_intermediates(options.timesfile, 'a')
        f.close()
        set_timing_log_filepath(options.timesfile)
    ############################################################################
    # We must read the incoming tree in before we call the get_sequences_for_sate
    # function that relabels that taxa in the dataset
    ######
    alignment_as_tmp_filename_to_report = None
    tree_as_tmp_filename_to_report = None
    tree_file = options.treefile
    if tree_file:
        if not os.path.exists(tree_file):
            raise Exception('The tree file "%s" does not exist' % tree_file)
        # NOTE(review): 'rU' mode was removed in Python 3.11; kept here because
        # this codebase targets Python 2-style interpreters (see keys()/sort()
        # idioms elsewhere) -- confirm before porting.
        tree_f = open(tree_file, 'rU')
        MESSENGER.send_info('Reading starting trees from "%s"...' % tree_file)
        try:
            try:
                tree_list = read_and_encode_splits(multilocus_dataset.dataset,
                                                   tree_f,
                                                   starting_tree=True)
            except KeyError:
                MESSENGER.send_error("Error in reading the treefile, probably due to a name in the tree that does not match the names in the input sequence files.\n")
                raise
            except:
                # Deliberate log-and-reraise for any other parse failure.
                MESSENGER.send_error("Error in reading the treefile.\n")
                raise
        finally:
            # BUGFIX: close the handle even when reading fails; it previously
            # leaked on the re-raise paths above.
            tree_f.close()
        if len(tree_list) > 1:
            MESSENGER.send_warning('%d starting trees found in "%s". The first tree will be used.'
                                   % (len(tree_list), tree_file))
        starting_tree = tree_list[0]
        score = None
        tree_as_tmp_filename_to_report = tree_file
    ############################################################################
    # This will relabel the taxa if they have problematic names
    #####
    multilocus_dataset.relabel_for_sate()
    ############################################################################
    # This ensures all nucleotide data is DNA internally
    #####
    restore_to_rna = False
    if user_config.commandline.datatype.upper() == 'RNA':
        multilocus_dataset.convert_rna_to_dna()
        user_config.commandline.datatype = 'DNA'
        restore_to_rna = True

    # Export the safe-name -> original-name translation table (best effort:
    # a failure here must not abort the run).
    export_names = True
    if export_names:
        name_filename = None  # BUGFIX: defined up front so the handler below cannot NameError
        try:
            name_filename = sate_products.get_abs_path_for_tag('name_translation.txt')
            name_output = open(name_filename, 'w')
            try:
                safe2real = multilocus_dataset.safe_to_real_names
                for safe in sorted(safe2real.keys()):
                    orig = safe2real[safe][0]
                    name_output.write("%s\n%s\n\n" % (safe, orig))
            finally:
                # BUGFIX: close the file even if a write fails.
                name_output.close()
            MESSENGER.send_info("Name translation information saved to %s as safe name, original name, blank line format."
                                % name_filename)
        except:
            MESSENGER.send_info("Error exporting name translation to %s" % name_filename)

    if options.aligned:
        # --aligned is only honored when every locus really is aligned.
        options.aligned = all(i.is_aligned() for i in multilocus_dataset)
    ############################################################################
    # Launch threads to do work
    #####
    sate_config = user_config.get("sate")
    start_worker(sate_config.num_cpus)
    ############################################################################
    # Be prepared to kill any long running jobs
    #####
    prev_signals = []
    for sig in [signal.SIGTERM, signal.SIGABRT, signal.SIGINT]:
        prev_handler = signal.signal(sig, killed_handler)
        prev_signals.append((sig, prev_handler))
    try:
        if (not options.two_phase) and tree_file:
            # getting the newick string here will allow us to get a string
            # that is in terms of the correct taxon labels
            starting_tree_str = starting_tree.compose_newick()
        else:
            if not options.two_phase:
                MESSENGER.send_info("Creating a starting tree for the SATe algorithm...")
            if (options.two_phase) or (not options.aligned):
                MESSENGER.send_info("Performing initial alignment of the entire data matrix...")
                init_aln_dir = os.path.join(temporaries_dir, 'init_aln')
                init_aln_dir = sate_team.temp_fs.create_subdir(init_aln_dir)
                delete_aln_temps = not (options.keeptemp and options.keepalignmenttemps)
                new_alignment_list = []
                aln_job_list = []
                # One alignment job per locus: create all, queue all, then
                # collect results in the same order.
                for unaligned_seqs in multilocus_dataset:
                    job = sate_team.aligner.create_job(unaligned_seqs,
                                                       tmp_dir_par=init_aln_dir,
                                                       context_str="initalign",
                                                       delete_temps=delete_aln_temps)
                    aln_job_list.append(job)
                _RunningJobs = aln_job_list
                for job in aln_job_list:
                    jobq.put(job)
                for job in aln_job_list:
                    new_alignment = job.get_results()
                    new_alignment_list.append(new_alignment)
                _RunningJobs = None
                for locus_index, new_alignment in enumerate(new_alignment_list):
                    multilocus_dataset[locus_index] = new_alignment
                if delete_aln_temps:
                    sate_team.temp_fs.remove_dir(init_aln_dir)
            else:
                MESSENGER.send_info("Input sequences assumed to be aligned (based on sequence lengths).")
            MESSENGER.send_info("Performing initial tree search to get starting tree...")
            init_tree_dir = os.path.join(temporaries_dir, 'init_tree')
            init_tree_dir = sate_team.temp_fs.create_subdir(init_tree_dir)
            delete_tree_temps = not options.keeptemp
            job = sate_team.tree_estimator.create_job(multilocus_dataset,
                                                      tmp_dir_par=init_tree_dir,
                                                      num_cpus=sate_config.num_cpus,
                                                      context_str="inittree",
                                                      delete_temps=delete_tree_temps,
                                                      sate_products=sate_products,
                                                      step_num='initialsearch')
            _RunningJobs = job
            jobq.put(job)
            score, starting_tree_str = job.get_results()
            _RunningJobs = None
            alignment_as_tmp_filename_to_report = sate_products.get_abs_path_for_iter_output(
                    "initialsearch", TEMP_SEQ_ALIGNMENT_TAG, allow_existing=True)
            tree_as_tmp_filename_to_report = sate_products.get_abs_path_for_iter_output(
                    "initialsearch", TEMP_TREE_TAG, allow_existing=True)
            if delete_tree_temps:
                sate_team.temp_fs.remove_dir(init_tree_dir)
        _LOG.debug('We have the tree and whole_alignment, partitions...')

        sate_config_dict = sate_config.dict()
        if options.keeptemp:
            sate_config_dict['keep_iteration_temporaries'] = True
            if options.keepalignmenttemps:
                sate_config_dict['keep_realignment_temporaries'] = True

        job = SateJob(multilocus_dataset=multilocus_dataset,
                      sate_team=sate_team,
                      name=options.job,
                      status_messages=MESSENGER.send_info,
                      score=score,
                      **sate_config_dict)
        job.tree_str = starting_tree_str
        job.curr_iter_align_tmp_filename = alignment_as_tmp_filename_to_report
        job.curr_iter_tree_tmp_filename = tree_as_tmp_filename_to_report
        if score is not None:
            # A score is only known when an initial tree search was performed.
            job.store_optimum_results(new_multilocus_dataset=multilocus_dataset,
                                      new_tree_str=starting_tree_str,
                                      new_score=score,
                                      curr_timestamp=time.time())

        if options.two_phase:
            MESSENGER.send_info("Exiting with the initial tree because the SATe algorithm is avoided when the --two-phase option is used.")
        else:
            _RunningJobs = job
            MESSENGER.send_info("Starting SATe algorithm on initial tree...")
            job.run(tmp_dir_par=temporaries_dir, sate_products=sate_products)
            _RunningJobs = None
            if job.return_final_tree_and_alignment:
                alignment_as_tmp_filename_to_report = job.curr_iter_align_tmp_filename
            else:
                alignment_as_tmp_filename_to_report = job.best_alignment_tmp_filename
            if user_config.commandline.raxml_search_after:
                raxml_model = user_config.raxml.model.strip()
                if not raxml_model:
                    # No explicit RAxML model configured: derive one from the
                    # tree estimator's (FastTree) model.
                    dt = user_config.commandline.datatype
                    mf = sate_team.tree_estimator.model
                    ms = fasttree_to_raxml_model_str(dt, mf)
                    sate_team.raxml_tree_estimator.model = ms
                rte = sate_team.raxml_tree_estimator
                MESSENGER.send_info("Performing post-processing tree search in RAxML...")
                post_tree_dir = os.path.join(temporaries_dir, 'post_tree')
                post_tree_dir = sate_team.temp_fs.create_subdir(post_tree_dir)
                delete_tree_temps = not options.keeptemp
                starting_tree = None
                if user_config.sate.start_tree_search_from_current:
                    starting_tree = job.tree
                post_job = rte.create_job(job.multilocus_dataset,
                                          starting_tree=starting_tree,
                                          num_cpus=sate_config.num_cpus,
                                          context_str="postraxtree",
                                          tmp_dir_par=post_tree_dir,
                                          delete_temps=delete_tree_temps,
                                          sate_products=sate_products,
                                          step_num="postraxtree")
                _RunningJobs = post_job
                jobq.put(post_job)
                post_score, post_tree = post_job.get_results()
                _RunningJobs = None
                tree_as_tmp_filename_to_report = sate_products.get_abs_path_for_iter_output(
                        "postraxtree", TEMP_TREE_TAG, allow_existing=True)
                if delete_tree_temps:
                    sate_team.temp_fs.remove_dir(post_tree_dir)
                job.tree_str = post_tree
                job.score = post_score
                if post_score > job.best_score:
                    job.best_tree_str = post_tree
                    job.best_score = post_score
            else:
                if job.return_final_tree_and_alignment:
                    tree_as_tmp_filename_to_report = job.curr_iter_tree_tmp_filename
                else:
                    tree_as_tmp_filename_to_report = job.best_tree_tmp_filename

        #######################################################################
        # Restore original taxon names and RNA characters
        #####
        job.multilocus_dataset.restore_taxon_names()
        if restore_to_rna:
            job.multilocus_dataset.convert_dna_to_rna()
            user_config.commandline.datatype = 'RNA'

        assert len(sate_products.alignment_streams) == len(job.multilocus_dataset)
        for i, alignment in enumerate(job.multilocus_dataset):
            alignment_stream = sate_products.alignment_streams[i]
            MESSENGER.send_info("Writing resulting alignment to %s" % alignment_stream.name)
            alignment.write(alignment_stream, file_format="FASTA")
            alignment_stream.close()

        MESSENGER.send_info("Writing resulting tree to %s" % sate_products.tree_stream.name)
        tree_str = job.tree.compose_newick()
        sate_products.tree_stream.write("%s;\n" % tree_str)

        MESSENGER.send_info("Writing resulting likelihood score to %s" % sate_products.score_stream.name)
        sate_products.score_stream.write("%s\n" % job.score)

        if alignment_as_tmp_filename_to_report is not None:
            MESSENGER.send_info('The resulting alignment (with the names in a "safe" form) was first written as the file "%s"' % alignment_as_tmp_filename_to_report)
        if tree_as_tmp_filename_to_report is not None:
            MESSENGER.send_info('The resulting tree (with the names in a "safe" form) was first written as the file "%s"' % tree_as_tmp_filename_to_report)
    finally:
        # Always restore the signal handlers we replaced above.
        for sig, prev_handler in prev_signals:
            if prev_handler is None:
                signal.signal(sig, signal.SIG_DFL)
            else:
                signal.signal(sig, prev_handler)