def make_alignment(self, source_alignment_path):
    """Read the source alignment and reconcile it with the stored copy.

    The alignment read from ``source_alignment_path`` is kept on
    ``self.alignment``. A copy is expected at ``source.phy`` inside
    ``the_config.start_tree_path``; that location is kept on
    ``self.alignment_path``. If no copy exists yet, one is written.
    Otherwise the copy has to agree with the alignment just read.

    Raises:
        AnalysisError: if the stored copy fails ``same_as`` against the new
            alignment, or if the two list different species (compared as
            multisets, so order is irrelevant).
    """
    self.alignment = Alignment()
    self.alignment.read(source_alignment_path)

    # TODO REMOVE -- this should be part of the checking procedure
    copy_path = os.path.join(the_config.start_tree_path, 'source.phy')
    self.alignment_path = copy_path

    if not os.path.exists(copy_path):
        # Nothing stored yet: keep a copy to compare against on later runs.
        self.alignment.write(copy_path)
        return

    stored = Alignment()
    stored.read(copy_path)

    if not stored.same_as(self.alignment):
        log.error("""Alignment file has changed since previous run. You need to use the force-restart option.""")
        raise AnalysisError

    # Counter equality ignores ordering but still notices duplicates.
    stored_species = collections.Counter(stored.species)
    current_species = collections.Counter(self.alignment.species)
    if stored_species != current_species:
        log.error("""Species names in alignment have changed since previous run. You need to use the force-restart option.""")
        raise AnalysisError
def make_alignment(self, cfg, alignment):
    """Create this subset's alignment file, or verify the one already there.

    A ``SubsetAlignment`` is built from ``alignment`` and this subset. Its
    file is ``<self.name>.phy`` inside ``cfg.phyml_path``, and that path is
    remembered on ``self.alignment_path``. A missing file is written; an
    existing file must match the newly built subset alignment.

    Raises:
        SubsetError: if the file on disk fails ``same_as`` against the
            newly built subset alignment.
    """
    subset_alignment = SubsetAlignment(alignment, self)
    target = os.path.join(cfg.phyml_path, self.name + '.phy')

    # Keep the location on the subset so later steps can find the file.
    self.alignment_path = target

    if not os.path.exists(target):
        # Nothing on disk yet, so this is the first write.
        subset_alignment.write(target)
        return

    log.debug("Found existing alignment file %s", target)
    previous = Alignment()
    previous.read(target)
    if previous.same_as(subset_alignment):
        return

    # The stored file was produced from a different subset definition.
    log.error(
        "It looks like you have changed one or more of the "
        "data_blocks in the configuration file, "
        "so the new subset alignments "
        "don't match the ones stored for this analysis. "
        "You'll need to run the program with --force-restart")
    raise SubsetError
def make_alignment(self, source_alignment_path):
    """Load the source alignment and check it against the saved copy.

    ``self.alignment`` receives the alignment read from
    ``source_alignment_path``. ``self.alignment_path`` is set to
    ``source.phy`` under ``the_config.start_tree_path``. When that file is
    absent the alignment is written there; when it is present it is read
    back and compared with the new alignment.

    Raises:
        AnalysisError: if ``same_as`` reports a difference between the saved
            and the new alignment, or if their species differ when compared
            as multisets.
    """
    self.alignment = Alignment()
    self.alignment.read(source_alignment_path)

    # TODO REMOVE -- this should be part of the checking procedure
    saved_path = os.path.join(the_config.start_tree_path, 'source.phy')
    self.alignment_path = saved_path

    already_saved = os.path.exists(saved_path)
    if not already_saved:
        # First time through: store a copy for future comparisons.
        self.alignment.write(saved_path)
        return

    saved = Alignment()
    saved.read(saved_path)

    unchanged = saved.same_as(self.alignment)
    if not unchanged:
        log.error("""Alignment file has changed since previous run. You need to use the force-restart option.""")
        raise AnalysisError

    # Compare species as multisets: order does not matter, counts do.
    if collections.Counter(saved.species) != collections.Counter(
            self.alignment.species):
        log.error(
            """Species names in alignment have changed since previous run. You need to use the force-restart option.""")
        raise AnalysisError
def make_alignment(self, cfg, alignment):
    """Write the alignment file for this subset, or validate an existing one.

    The subset alignment is built with ``SubsetAlignment(alignment, self)``
    and belongs in ``<self.name>.phy`` under ``cfg.phylofiles_path``. That
    path is stored on ``self.alignment_path``. If the file exists it is read
    and compared with the new subset alignment; if not, it is written.

    Raises:
        SubsetError: if an existing file fails ``same_as`` against the new
            subset alignment.
    """
    fresh = SubsetAlignment(alignment, self)
    phy_path = os.path.join(cfg.phylofiles_path, self.name + '.phy')

    # Record the path on the subset so it stays available afterwards.
    self.alignment_path = phy_path

    file_present = os.path.exists(phy_path)
    if not file_present:
        fresh.write(phy_path)
        return

    log.debug("Found existing alignment file %s", phy_path)
    on_disk = Alignment()
    on_disk.read(phy_path)

    if not on_disk.same_as(fresh):
        # The stored file no longer corresponds to this subset's definition.
        log.error(
            "It looks like you have changed one or more of the "
            "data_blocks in the configuration file, "
            "so the new subset alignments "
            "don't match the ones stored for this analysis. "
            "You'll need to run the program with --force-restart")
        raise SubsetError
def make_alignment(self, source_alignment_path):
    """Read the source alignment and reconcile it with the stored copy.

    The alignment read from ``source_alignment_path`` is kept on
    ``self.alignment``. The copy lives at ``source.phy`` inside
    ``self.cfg.start_tree_path``, and that path is kept on
    ``self.alignment_path``. A missing copy is written; an existing copy
    must match the alignment just read.

    Raises:
        AnalysisError: if the stored copy fails ``same_as`` against the new
            alignment.
    """
    self.alignment = Alignment()
    self.alignment.read(source_alignment_path)

    copy_path = os.path.join(self.cfg.start_tree_path, 'source.phy')
    self.alignment_path = copy_path

    if not os.path.exists(copy_path):
        # No copy from a previous run: create it now.
        self.alignment.write(copy_path)
        return

    stored = Alignment()
    stored.read(copy_path)
    if stored.same_as(self.alignment):
        return

    log.error("Alignment file has changed since previous run. You need to use the force-restart option.")
    raise AnalysisError
def make_alignment(self, cfg, alignment):
    """Create this subset's alignment file, or verify the one already there.

    A ``SubsetAlignment`` is built from ``alignment`` and this subset. Its
    file is ``<self.name>.phy`` inside ``cfg.phylofiles_path``, and the path
    is remembered on ``self.alignment_path``. A missing file is written; an
    existing file must match the newly built subset alignment.

    Raises:
        SubsetError: if the file on disk fails ``same_as`` against the new
            subset alignment (``self.FORCE_RESTART_MESSAGE`` is logged
            first).
    """
    subset_alignment = SubsetAlignment(alignment, self)
    target = os.path.join(cfg.phylofiles_path, self.name + '.phy')

    # Keep the location on the subset so it can be found again later.
    self.alignment_path = target

    if not os.path.exists(target):
        subset_alignment.write(target)
        return

    log.debug("Found existing alignment file %s", target)
    previous = Alignment()
    previous.read(target)
    if previous.same_as(subset_alignment):
        return

    log.error(self.FORCE_RESTART_MESSAGE)
    raise SubsetError
def make_alignment(self, source_alignment_path):
    """Load the source alignment and check it against the saved copy.

    ``self.alignment`` receives the alignment read from
    ``source_alignment_path``. ``self.alignment_path`` is set to
    ``source.phy`` under ``self.cfg.start_tree_path``. When that file is
    absent the alignment is written there; when present it is read back and
    compared with the new alignment.

    Raises:
        AnalysisError: if ``same_as`` reports that the saved copy differs
            from the new alignment.
    """
    self.alignment = Alignment()
    self.alignment.read(source_alignment_path)

    saved_path = os.path.join(self.cfg.start_tree_path, 'source.phy')
    self.alignment_path = saved_path

    already_saved = os.path.exists(saved_path)
    if not already_saved:
        # First run: store a copy for later comparisons.
        self.alignment.write(saved_path)
        return

    saved = Alignment()
    saved.read(saved_path)

    unchanged = saved.same_as(self.alignment)
    if not unchanged:
        log.error(
            "Alignment file has changed since previous run. You need to use the force-restart option."
        )
        raise AnalysisError
def make_alignment(self, cfg, alignment):
    """Write the alignment file for this subset, or validate an existing one.

    The subset alignment is built with ``SubsetAlignment(alignment, self)``
    and belongs in ``<self.subset_id>.phy`` under ``cfg.phylofiles_path``.
    That path is stored on ``self.alignment_path``. If the file exists it is
    read and compared with the new subset alignment; otherwise it is written.

    Args:
        cfg: configuration object; only ``phylofiles_path`` is read here.
        alignment: the source alignment the subset alignment is built from.

    Raises:
        SubsetError: if an existing file fails ``same_as`` against the new
            subset alignment (``self.FORCE_RESTART_MESSAGE`` is logged
            first).
    """
    # Make an Alignment from the source, using this subset
    sub_alignment = SubsetAlignment(alignment, self)
    sub_path = os.path.join(cfg.phylofiles_path, self.subset_id + '.phy')

    # Add it into the sub, so we keep it around
    self.alignment_path = sub_path

    # Maybe it is there already?
    if not os.path.exists(sub_path):
        # We need to write it
        sub_alignment.write(sub_path)
        return

    # Pass the path as a logging argument so the message is only formatted
    # when debug output is actually enabled.
    log.debug("Found existing alignment file %s", sub_path)
    old_align = Alignment()
    old_align.read(sub_path)

    # It had better be the same!
    if not old_align.same_as(sub_alignment):
        log.error(self.FORCE_RESTART_MESSAGE)
        raise SubsetError
def analyse_subset(self, sub, models):
    """Analyse the subset using the models given.

    This is the core place where everything comes together. Results that
    are already on ``sub.results``, in the subset cache file, or picked up
    by ``self.parse_results`` are reused; only the remaining models are run
    through ``phyml.analyse``. Model selection is then performed on ``sub``
    using ``self.cfg.model_selection`` and ``self.cfg.models``.

    Args:
        sub: the subset to analyse; ``sub.name`` is used to build the cache
            file name and the alignment file name.
        models: iterable of model names (strings) to analyse for ``sub``.

    Raises:
        AnalysisError: if an alignment file already stored for this subset
            does not match the one built now, or if some models are still
            outstanding after they have been run.
    """
    log.debug("About to analyse %s using models %s", sub, ", ".join(models))

    # Keep people informed about what's going on. If we don't know the
    # total subset number, we can usually get it like this.
    if self.total_subset_num is None:
        self.total_subset_num = len(sub._cache)

    old_num_analysed = self.subsets_analysed
    self.subsets_analysed_set.add(sub.name)
    self.subsets_analysed = len(self.subsets_analysed_set)
    if self.subsets_analysed > old_num_analysed:
        # We've just started on a subset we haven't seen yet.
        percent_done = (
            float(self.subsets_analysed) * 100.0 / float(self.total_subset_num))
        log.info("Analysing subset %d/%d: %.2f%s done",
                 self.subsets_analysed, self.total_subset_num,
                 percent_done, r"%")

    subset_cache_path = os.path.join(self.cfg.subsets_path, sub.name + '.bin')

    # We might have already saved a bunch of results, try there first
    if not sub.results:
        log.debug("Reading in cached data from the subsets file")
        sub.read_cache(subset_cache_path)

    # First, see if we've already got the results loaded. Then we can
    # shortcut all the other checks
    models_done = set(sub.results.keys())
    log.debug("These models have already been done: %s", models_done)
    models_required = set(models)
    models_to_do = models_required - models_done
    log.debug("Which leaves these models still to analyse: %s", models_to_do)

    # Empty set means we're done
    if not models_to_do:
        log.debug("All models already done, so using just the cached "
                  "results for subset %s", sub)
        sub.model_selection(self.cfg.model_selection, self.cfg.models)
        return

    # Make an Alignment from the source, using this subset
    sub_alignment = SubsetAlignment(self.alignment, sub)
    sub_path = os.path.join(self.cfg.phyml_path, sub.name + '.phy')

    # Add it into the sub, so we keep it around
    sub.alignment_path = sub_path

    # Maybe it is there already?
    if os.path.exists(sub_path):
        log.debug("Found existing alignment file %s", sub_path)
        old_align = Alignment()
        old_align.read(sub_path)

        # It had better be the same!
        if not old_align.same_as(sub_alignment):
            # Each fragment ends with a space so the concatenated message
            # reads as separate words and sentences.
            log.error("It looks like you have changed one or more of the "
                      "data_blocks in the configuration file, "
                      "so the new subset alignments "
                      "don't match the ones stored for this analysis. "
                      "You'll need to run the program with --force-restart")
            raise AnalysisError
    else:
        # We need to write it
        sub_alignment.write(sub_path)

    # Try and read in some previous analyses.
    # NOTE(review): parse_results is expected to remove from models_to_do
    # every model it finds results for; the emptiness checks below rely on
    # that -- confirm against parse_results.
    log.debug("Checking for old results in the phyml folder")
    self.parse_results(sub, models_to_do)
    if not models_to_do:
        sub.model_selection(self.cfg.model_selection, self.cfg.models)
        return

    # What is left, we actually have to analyse...
    # For efficiency, we rank the models by their difficulty - most
    # difficult first. sorted() is used rather than zip(...).sort() because
    # zip() returns an iterator (no .sort method) on Python 3.
    ranked = sorted(
        ((get_model_difficulty(m), m) for m in models_to_do), reverse=True)
    sorted_models_to_do = tuple(m for _, m in ranked)
    log.debug("About to analyse these models, in this order: %s",
              sorted_models_to_do)

    tasks = [
        (phyml.analyse, (m, sub_path, self.tree_path, self.cfg.branchlengths))
        for m in sorted_models_to_do
    ]

    if self.threads == 1:
        self.run_models_concurrent(tasks)
    else:
        self.run_models_threaded(tasks)

    # Now parse the models we've just done
    self.parse_results(sub, models_to_do)

    # This should be empty NOW!
    if models_to_do:
        log.error("Failed to run models %s; not sure why",
                  ", ".join(models_to_do))
        raise AnalysisError

    # Now we have analysed all models for this subset, we do model selection
    # but ONLY on the models specified in the cfg file.
    sub.model_selection(self.cfg.model_selection, self.cfg.models)

    # If we made it to here, we should write out the new summary
    self.rpt.write_subset_summary(sub)
    # We also need to update this
    sub.write_cache(subset_cache_path)