def create_scheme(cfg, scheme_name, scheme_description):
    """
    Generate a single scheme given a list of numbers that represent the
    indexes of the partitions e.g. [0,1,2,3,4,5,6,7]
    """
    subset_count = len(cfg.user_subsets)

    # Check that the correct number of items are in the list
    if len(scheme_description) != subset_count:
        log.error("There's a problem with the description of scheme %s" %
                  scheme_name)
        raise SchemeError

    # Now generate the pattern
    subs = {}
    # We use the numbers returned to group the different subsets
    for sub_index, grouping in enumerate(scheme_description):
        insub = subs.setdefault(grouping, [])
        insub.append(sub_index)

    # We now have what we need to create a subset. Each entry will have a
    # set of values which are the index for the partition
    created_subsets = []
    for sub_indexes in subs.values():
        subs_to_merge = [cfg.user_subsets[i] for i in sub_indexes]
        sub = subset_ops.merge_subsets(subs_to_merge)
        created_subsets.append(sub)

    return Scheme(cfg, str(scheme_name), created_subsets,
                  description=scheme_description)

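# Illustrative sketch (not part of the module): how a scheme_description
# groups partition indexes. With five user subsets, for example, the
# description [0, 0, 1, 1, 2] merges partitions 0+1 and 2+3 and leaves 4 on
# its own; this mirrors the setdefault loop inside create_scheme above.
# _group_description is a stand-alone helper written only for this example.
def _group_description(scheme_description):
    groups = {}
    for sub_index, grouping in enumerate(scheme_description):
        groups.setdefault(grouping, []).append(sub_index)
    return sorted(groups.values())

assert _group_description([0, 0, 1, 1, 2]) == [[0, 1], [2, 3], [4]]
assert _group_description([0, 1, 2, 3]) == [[0], [1], [2], [3]]
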
def generate_all_schemes(cfg):
    """
    Convert the abstract schema given by the algorithm into subsets
    """
    log.info("Generating all possible schemes for the partitions...")

    subset_count = len(cfg.user_subsets)

    # Now generate the pattern for this many partitions
    all_schemes = submodels.get_submodels(subset_count)

    scheme_name = 1
    scheme_list = []
    for scheme in all_schemes:
        subs = {}
        # We use the numbers returned to group the different subsets
        for sub_index, grouping in enumerate(scheme):
            insub = subs.setdefault(grouping, [])
            insub.append(sub_index)

        # We now have what we need to create a subset. Each entry will have a
        # set of values which are the index for the partition
        created_subsets = []
        for sub_indexes in subs.values():
            sub = subset_ops.merge_subsets(
                [cfg.user_subsets[i] for i in sub_indexes])
            created_subsets.append(sub)

        scheme_list.append(Scheme(cfg, str(scheme_name), created_subsets))

        log.debug("Created scheme %d of %d" % (scheme_name, len(all_schemes)))
        scheme_name += 1

    return scheme_list

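# Illustrative sketch (not part of the module), assuming get_submodels()
# enumerates every possible grouping (set partition) of the user subsets:
# the number of candidate schemes generate_all_schemes builds is then the
# Bell number of the partition count, which grows very quickly.
def _bell_number(n):
    # Bell triangle recurrence; B(n) is the last entry of the n-th row
    row = [1]
    for _ in range(n - 1):
        new_row = [row[-1]]
        for value in row:
            new_row.append(new_row[-1] + value)
        row = new_row
    return row[-1]

assert [_bell_number(n) for n in range(1, 6)] == [1, 2, 5, 15, 52]
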
def clean_scheme(self, start_scheme):
    # Here we look for and fix up subsets that are too small or don't have
    # all states
    keep_going = 1
    merges = 0

    if keep_going == 1:
        with logtools.indented(
                log,
                "*** Checking subsets from scheme '%s' meet --min-subset-size and --all_states settings ***" %
                start_scheme.name):
            while keep_going > 0:
                subsets = [s for s in start_scheme.subsets]

                # sort the subsets, to keep results consistent over re-runs
                subsets.sort(key=lambda x: 1.0 / float(len(x.columns)))

                # run through all subsets
                for i, sub in enumerate(subsets):
                    found = 0
                    state_problems = self.alignment.check_state_probs(
                        sub, the_config)
                    if (len(sub.columns) < the_config.min_subset_size or
                            state_problems == True):
                        # merge that subset with nearest neighbour
                        new_pair = neighbour.get_closest_subset(
                            sub, subsets, the_config)
                        log.info("Subset '%s' will be merged with subset '%s'" %
                                 (new_pair[0].name, new_pair[1].name))
                        new_pair_merged = subset_ops.merge_subsets(new_pair)
                        start_scheme = neighbour.make_clustered_scheme(
                            start_scheme, "cleaned_scheme", new_pair,
                            new_pair_merged, the_config)
                        the_config.progress.begin(1, 1)
                        self.analyse_scheme(start_scheme)
                        subsets = [s for s in start_scheme.subsets]
                        merges = merges + 1
                        found = 1
                        break

                # if we got to here, there were no subsets to merge
                if found == 0:
                    keep_going = 0

                if len(subsets) == 1:
                    log.error(
                        "The settings you have used for --all-states and/or --min-subset-size mean that all of your subsets have been merged into one prior to any analysis. Thus, no analysis is necessary. Please check and try again")
                    raise AnalysisError

    log.info("%d subsets merged because of --min-subset-size and/or --all-states settings" % merges)

    return start_scheme

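# Illustrative sketch (not part of the module): the sort key
# 1.0 / len(columns) in clean_scheme above visits the largest subsets first,
# which gives a consistent order between re-runs (plain set iteration order
# does not). The subset sizes below are invented for the example.
subset_sizes = {"sub_a": 120, "sub_b": 6, "sub_c": 54}
ordered = sorted(subset_sizes, key=lambda name: 1.0 / subset_sizes[name])
assert ordered == ["sub_a", "sub_c", "sub_b"]
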
def define_subset_grouping(self, text, loc, subset_def):
    """These define initial groupings that users think are useful
    """
    try:
        # Get the partitions from the names
        subsets = [self.cfg.user_subsets_by_name[nm] for nm in subset_def[0]]

        # Keep a running list of these till we define the schema below
        self.current_subsets.append(subset_ops.merge_subsets(subsets))
    except subset.SubsetError:
        raise ParserError(text, loc, "Error creating subset...")

def make_clustered_scheme(start_scheme, scheme_name, subsets_to_cluster, cfg):
    # 1. Create a new subset that merges the subsets_to_cluster
    merged_sub = subset_ops.merge_subsets(subsets_to_cluster)

    # 2. Then we define a new scheme with those merged subsets
    new_subsets = start_scheme.subsets - set(subsets_to_cluster)
    new_subsets.add(merged_sub)

    # 3. Create the clustered scheme
    final_scheme = scheme.Scheme(cfg, str(scheme_name), new_subsets)

    return final_scheme

def make_tree(self, user_path):
    # Begin by making a filtered alignment, containing ONLY those columns
    # that are defined in the subsets
    subset_with_everything = subset_ops.merge_subsets(self.cfg.user_subsets)
    self.filtered_alignment = SubsetAlignment(
        self.alignment, subset_with_everything)
    self.filtered_alignment_path = os.path.join(
        self.cfg.start_tree_path, 'filtered_source.phy')
    self.filtered_alignment.write(self.filtered_alignment_path)

    # Now we've written this alignment, we need to lock everything in
    # place, no more adding partitions, or changing them from now on.
    # TODO: This checking should still be done...
    # self.cfg.partitions.check_against_alignment(self.alignment)
    # self.cfg.partitions.finalise()

    # We start by copying the alignment
    self.alignment_path = os.path.join(
        self.cfg.start_tree_path, 'source.phy')

    # Now check for the tree
    tree_path = self.cfg.processor.make_tree_path(
        self.filtered_alignment_path)

    if self.need_new_tree(tree_path):
        log.debug("Estimating new starting tree, no old tree found")

        # If we have a user tree, then use that, otherwise, create a topology
        util.clean_out_folder(self.cfg.start_tree_path,
                              keep=["filtered_source.phy", "source.phy"])

        if user_path is not None and user_path != "":
            # Copy it into the start tree folder
            log.info("Using user supplied topology at %s", user_path)
            topology_path = os.path.join(self.cfg.start_tree_path,
                                         'user_topology.phy')
            self.cfg.processor.dupfile(user_path, topology_path)
        else:
            log.debug("didn't find tree at %s, making a new one" % tree_path)
            topology_path = self.cfg.processor.make_topology(
                self.filtered_alignment_path, self.cfg.datatype,
                self.cfg.cmdline_extras)

        # Now estimate branch lengths
        tree_path = self.cfg.processor.make_branch_lengths(
            self.filtered_alignment_path,
            topology_path,
            self.cfg.datatype,
            self.cfg.cmdline_extras)

    self.tree_path = tree_path
    log.info("Starting tree with branch lengths is here: %s", self.tree_path)

def model_to_scheme(model, scheme_name, cfg):
    """Turn a model definition e.g. [0, 1, 2, 3, 4] into a scheme"""
    subs = {}
    # We use the numbers returned to group the different subsets
    for sub_index, grouping in enumerate(model):
        insub = subs.setdefault(grouping, [])
        insub.append(sub_index)

    # We now have what we need to create a subset. Each entry will have a
    # set of values which are the index for the partition
    created_subsets = []
    for sub_indexes in subs.values():
        subs_to_merge = [cfg.user_subsets[i] for i in sub_indexes]
        sub = subset_ops.merge_subsets(subs_to_merge)
        created_subsets.append(sub)

    return Scheme(cfg, str(scheme_name), created_subsets)

def get_nearest_neighbour_scheme(start_scheme, scheme_name, cfg):
    """
    The idea here is to take a scheme, and perform some analyses to find a
    neighbouring scheme, where the neighbour has one less subset than the
    current scheme. Really this is just progressive clustering, but
    specified to work well with PartitionFinder
    """

    # we use [0] because the function returns a ranked list of lists of length 1
    subsets = [s for s in start_scheme.subsets]
    closest_subsets = get_N_closest_subsets(subsets, cfg, 1)[0]

    merged_sub = subset_ops.merge_subsets(closest_subsets)

    scheme = make_clustered_scheme(
        start_scheme, scheme_name, closest_subsets, merged_sub, cfg)

    return scheme

def reassign_invariant_sites(self, subsets):
    # TODO add a skip:
    # if(len(subsets)==1):
    #     return(subsets)

    # get entropies for whole alignment for this subset
    onesub = subset_ops.merge_subsets(subsets)
    entropies = entropy.sitewise_entropies(
        SubsetAlignment(self.alignment, onesub))

    # find nearest site for each invariant site
    # replacements is a dict of: key: invariant col; value: replacement col,
    # e.g.
    # {512: 513, 514: 513, 515: 513, 516: 517}
    replacements = entropy.get_replacement_sites(entropies, onesub.columns)

    # now make a dict of the CURRENT subsets: key: site; value: subset
    sch_dict = {}
    for i, sub in enumerate(subsets):
        for site in sub.columns:
            sch_dict[site] = i

    # then reassign the sites as necessary based on replacements
    for r in replacements:
        sch_dict[r] = sch_dict[replacements[r]]

    # now build subsets according to the new sites
    sub_dict = {}  # this gives us the subsets to build
    for k, v in sch_dict.iteritems():
        sub_dict.setdefault(v, []).append(k)

    new_subsets = []
    for s in sub_dict:
        n = Subset(the_config, set(sub_dict[s]))
        new_subsets.append(n)

    return new_subsets

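# Illustrative sketch (not part of the module): how the replacements dict in
# reassign_invariant_sites moves invariant columns. The site numbers and
# subset memberships below are invented, and the mapping is assumed to come
# from entropy.get_replacement_sites() as described in the comments above.
replacements = {512: 513, 514: 513, 516: 517}    # invariant col -> nearest variable col
site_to_subset = {512: 0, 514: 0, 516: 0,        # subset 0
                  513: 1, 517: 1}                # subset 1
for invariant_site, nearest_site in replacements.items():
    site_to_subset[invariant_site] = site_to_subset[nearest_site]
# the invariant columns now sit in the subset of the column they most resemble
assert site_to_subset == {512: 1, 513: 1, 514: 1, 516: 1, 517: 1}
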
def make_tree(self, user_path):
    # Begin by making a filtered alignment, containing ONLY those columns
    # that are defined in the subsets
    subset_with_everything = subset_ops.merge_subsets(the_config.user_subsets)
    self.filtered_alignment = SubsetAlignment(
        self.alignment, subset_with_everything)
    self.filtered_alignment_path = os.path.join(
        the_config.start_tree_path, 'filtered_source.phy')
    self.filtered_alignment.write(self.filtered_alignment_path)

    # Check the full subset against the alignment
    subset_ops.check_against_alignment(subset_with_everything,
                                       self.alignment, the_config)

    # We start by copying the alignment
    self.alignment_path = os.path.join(
        the_config.start_tree_path, 'source.phy')

    # Now check for the tree
    tree_path = the_config.processor.make_tree_path(
        self.filtered_alignment_path)

    if self.need_new_tree(tree_path):
        log.debug("Estimating new starting tree, no old tree found")

        # If we have a user tree, then use that, otherwise, create a topology
        util.clean_out_folder(the_config.start_tree_path,
                              keep=["filtered_source.phy", "source.phy"])

        if user_path is not None and user_path != "":
            # Copy it into the start tree folder
            log.info("Using user supplied topology at %s" % user_path)
            topology_path = os.path.join(the_config.start_tree_path,
                                         'user_topology.phy')
            util.dupfile(user_path, topology_path)
            need_bl = True
        elif the_config.no_ml_tree == True:
            log.debug("didn't find tree at %s, making a new one" % tree_path)
            topology_path = the_config.processor.make_topology(
                self.filtered_alignment_path, the_config.datatype,
                the_config.cmdline_extras)
            need_bl = True
        elif the_config.no_ml_tree == False:
            log.debug("didn't find tree at %s, making an ML tree with RAxML" %
                      tree_path)

            tree_scheme = scheme.create_scheme(
                the_config, "tree_scheme",
                range(len(the_config.user_subsets)))

            topology_path = raxml.make_ml_topology(
                self.filtered_alignment_path, the_config.datatype,
                the_config.cmdline_extras, tree_scheme, self.threads)

            # here we copy the ML tree topology so it can be used with PhyML
            # too
            # TODO: this is a hack, and it would be better to decide on a
            # universal name for the different types of tree we might have.
            phyml_tree = os.path.join(
                os.path.dirname(topology_path),
                "filtered_source.phy_phyml_tree.txt")
            copyfile(topology_path, phyml_tree)

            need_bl = False

        if need_bl == True:
            # Now estimate branch lengths
            tree_path = the_config.processor.make_branch_lengths(
                self.filtered_alignment_path,
                topology_path,
                the_config.datatype,
                the_config.cmdline_extras)

    self.tree_path = tree_path
    log.debug("Starting tree with branch lengths is here: %s" %
              self.tree_path)

def finalise_fabrication(self, start_subsets, step):

    fabricated_subsets = []
    for s in start_subsets:

        # here we put a sensible lower limit on the size of subsets
        if len(s.columns) < the_config.min_subset_size:
            s.fabricated = True
            log.debug("Subset %s with only %d sites found" %
                      (s.subset_id, len(s.columns)))

        # here we can test if the alignment has all states:
        state_probs = self.alignment.check_state_probs(s, the_config)
        if state_probs:
            s.fabricated = True
            log.debug("Subset %s does not have all states in the alignment",
                      s.subset_id)

        if s.fabricated:
            fabricated_subsets.append(s)
            log.debug("added %s to fabricated subset", s.name)

    if fabricated_subsets:
        with logtools.indented(log, "Finalising partitioning scheme"):
            log.debug("There are %d/%d fabricated subsets" %
                      (len(fabricated_subsets), len(start_subsets)))

            i = 1
            while fabricated_subsets:

                all_subs = start_subsets

                # occasionally subsets with all values == 0.0 are given a
                # centroid of None by scikit-learn. The true entropy here
                # is 0.0 for all sites, so the true centroid is 0.0
                for s in all_subs:
                    if s.centroid == None:
                        s.centroid = [0.0]
                        log.debug("Fixed a subset with a centroid of None")
                        log.debug("The subset has %d columns" % len(s.columns))

                s = fabricated_subsets.pop(0)

                log.debug("Working on fabricated subset %s with %d sites" %
                          (s.subset_id, len(s.columns)))
                log.info("Finalising subset %d", i)
                i = i + 1

                all_subs.remove(s)

                centroid = s.centroid

                best_match = None

                # get closest subset to s
                for sub in all_subs:
                    centroid_array = [sub.centroid, centroid]
                    euclid_dist = spatial.distance.pdist(centroid_array)
                    if euclid_dist < best_match or best_match is None:
                        best_match = euclid_dist
                        closest_sub = sub

                # join s with closest_sub to make joined_sub
                merged_sub = subset_ops.merge_subsets([s, closest_sub])

                # remove closest sub
                all_subs.remove(closest_sub)

                # and if closest_sub was fabricated too, we remove it here
                if fabricated_subsets.count(closest_sub):
                    fabricated_subsets.remove(closest_sub)

                # analyse joined sub
                self.analyse_list_of_subsets([merged_sub])

                # here we put a sensible lower limit on the size of subsets
                if len(merged_sub.columns) < the_config.min_subset_size:
                    merged_sub.fabricated = True

                # if joined has to be fabricated, add to fabricated list
                if merged_sub.fabricated:
                    fabricated_subsets.append(merged_sub)

                all_subs.append(merged_sub)
    else:
        all_subs = start_subsets

    # now build a scheme from start_subs, and it should work
    final_scheme = scheme.Scheme(the_config, "final_scheme", all_subs)

    # return final scheme
    return final_scheme

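# Illustrative sketch (not part of the module): finalise_fabrication finds the
# nearest subset by comparing centroids with scipy.spatial.distance.pdist,
# which returns a one-element condensed distance array for a pair of points.
# The centroid values below are invented.
from scipy import spatial

fabricated_centroid = [0.10, 0.20]
candidate_centroids = {"subA": [0.50, 0.90], "subB": [0.15, 0.25]}

best_match, closest_name = None, None
for name, centroid in candidate_centroids.items():
    euclid_dist = spatial.distance.pdist([centroid, fabricated_centroid])[0]
    if best_match is None or euclid_dist < best_match:
        best_match, closest_name = euclid_dist, name

assert closest_name == "subB"   # the centroid closest in Euclidean distance
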
def do_analysis(self):
    '''A greedy algorithm for heuristic partitioning searches'''

    partnum = len(the_config.user_subsets)
    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)

    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = [s for s in start_scheme.subsets]

    step = 1
    while len(set(start_scheme.subsets)) > 1:
        with logtools.indented(log, "***Greedy algorithm step %d***" % step):

            name_prefix = "step_%d" % (step)

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)

            # this is a fake distance matrix, so that the greedy algorithm
            # can use all the tricks of the relaxed clustering algorithm
            dim = len(subsets)
            d_matrix = np.zeros((((dim * dim) - dim)) / 2)
            d_matrix[:] = np.inf

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            cutoff = max_schemes  # this defines the greedy algorithm: we look at all schemes
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(
                closest_pairs, c_matrix, subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix and
                #    find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                scheme_name = "step_%d" % (step)
                for t in sub_tuples:
                    pair_merged = t[0]
                    pair = t[1]
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged,
                        the_config)
                    r = self.analyse_scheme(new_scheme)
                    diff = r.score - start_score
                    diffs.append(diff)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based on
            #    that. Note that this matrix includes diagonals, which will
            #    all be zero since this is equivalent to comparing a scheme
            #    to itself. So we need to be careful to only proceed if we
            #    have a negative change, which indicates an improvement in
            #    the score
            best_change = np.amin(c_matrix)

            log.debug("Biggest improvement in info score: %s",
                      str(best_change))

            if best_change >= 0:
                log.info("Found no schemes that improve the score, stopping")
                break

            best_pair = neighbour.get_best_pair(c_matrix, best_change,
                                                subsets)
            best_merged = subset_ops.merge_subsets(best_pair)
            best_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, best_pair, best_merged,
                the_config)
            best_result = self.analyse_scheme(best_scheme)

            # the best change can get updated a fraction at this point
            # because calculating the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_change = self.results.best_score - start_score

            log.info("Best scheme combines subsets: '%s' and '%s'" %
                     (best_pair[0].name, best_pair[1].name))

            log.info("The best scheme improves the %s score by %.2f to %.1f",
                     the_config.model_selection,
                     np.abs(best_change),
                     self.results.best_score)

            start_scheme = best_scheme
            start_score = best_result.score

            log.debug("Best pair: %s", str([s.name for s in best_pair]))
            log.debug("Merged into: %s", str([best_merged.name]))

            # 5. reset_c_matrix and the subset list
            c_matrix = neighbour.reset_c_matrix(
                c_matrix, list(best_pair), [best_merged], subsets)

            # we updated the subset list in a special way, which matches how
            # we update the c matrix:
            subsets = neighbour.reset_subsets(
                subsets, list(best_pair), [best_merged])

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

        step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
             (self.results.best_scheme.name,
              the_config.model_selection,
              self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)

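# Illustrative sketch (not part of the module): the "fake" distance matrix in
# the greedy search above is a condensed (upper-triangle) vector of length
# n*(n-1)/2, the shape scipy.spatial.distance.squareform converts to and from,
# so the greedy algorithm can reuse the relaxed-clustering machinery unchanged.
import numpy as np
from scipy import spatial

dim = 4                                     # number of current subsets
d_matrix = np.zeros((dim * dim - dim) // 2)
d_matrix[:] = np.inf                        # every pair looks equally "close"

c_matrix = spatial.distance.squareform(d_matrix)
assert d_matrix.shape == (6,)               # 4*3/2 pairwise entries
assert c_matrix.shape == (4, 4)             # full matrix with a zero diagonal
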
def do_analysis(self):
    # initialisation steps
    model_selection = the_config.model_selection
    partnum = len(the_config.user_subsets)

    if the_config.cluster_max == -987654321:
        the_config.cluster_max = max([1000, (10 * len(the_config.user_subsets))])
        log.info("Set rcluster-max to %d" % the_config.cluster_max)

    scheme_count = submodels.count_relaxed_clustering_schemes(
        partnum, the_config.cluster_percent, the_config.cluster_max)
    subset_count = submodels.count_relaxed_clustering_subsets(
        partnum, the_config.cluster_percent, the_config.cluster_max)

    log.info("PartitionFinder will have to analyse %d subsets to"
             " complete this analysis" % subset_count)
    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = [s for s in start_scheme.subsets]
    partnum = len(subsets)

    step = 1
    while True:
        with logtools.indented(
                log,
                "*** Relaxed clustering algorithm step %d of up to %d ***" %
                (step, partnum - 1)):

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)
            log.info("Measuring the similarity of %d subset pairs" %
                     max_schemes)
            d_matrix = neighbour.get_distance_matrix(
                subsets, the_config.cluster_weights)

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            cutoff = int(math.ceil(max_schemes *
                                   (the_config.cluster_percent * 0.01)))
            if cutoff <= 0:
                cutoff = 1
            if the_config.cluster_max != None and cutoff > the_config.cluster_max:
                cutoff = the_config.cluster_max
            log.info("Choosing the %d most similar subset pairs" % cutoff)
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse K subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(
                closest_pairs, c_matrix, subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix and
                #    find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                scheme_name = "step_%d" % (step)
                for t in sub_tuples:
                    pair_merged = t[0]
                    pair = t[1]
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged,
                        the_config)
                    r = self.analyse_scheme(new_scheme)
                    diff = r.score - start_score
                    diffs.append(diff)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based on
            #    that. Note that this matrix includes diagonals, which will
            #    all be zero since this is equivalent to comparing a scheme
            #    to itself. So we need to be careful to only proceed if we
            #    have a negative change, which indicates an improvement in
            #    the score
            best_change = np.amin(c_matrix)
            best_scheme = start_scheme

            if best_change >= 0:
                log.info("Found no schemes that improve the score, stopping")
                break

            median_improvement = np.median(c_matrix[c_matrix < 0])

            while best_change <= median_improvement:

                best_pair = neighbour.get_best_pair(
                    c_matrix, best_change, subsets)
                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged,
                    the_config)
                start_scheme = best_scheme

                log.info("Combining subsets: '%s' and '%s'" %
                         (best_pair[0].name, best_pair[1].name))
                log.debug("This improves the %s score by: %s",
                          the_config.model_selection, str(abs(best_change)))

                # reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(
                    c_matrix, list(best_pair), [best_merged], subsets)

                # we update the subset list in a way that means its structure
                # tracks the c-matrix
                subsets = neighbour.reset_subsets(
                    subsets, list(best_pair), [best_merged])

                best_change = np.amin(c_matrix)

                if the_config.search == 'rcluster':
                    break
                # otherwise we are using rclusterf, which continues in this
                # loop; i.e. with rcluster we just take the single best change

            # the best change can get updated a fraction at this point
            # because calculating the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_result = self.analyse_scheme(best_scheme)
            best_change = self.results.best_score - start_score

            log.info("The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
                     len(best_scheme.subsets),
                     the_config.model_selection,
                     np.abs(best_change),
                     self.results.best_score)
            start_scheme = best_scheme
            start_score = best_result.score

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            if len(set(start_scheme.subsets)) == 1:
                break

        step += 1

    log.info("Relaxed clustering algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
             (self.results.best_scheme.name, model_selection,
              self.results.best_score))

    if the_config.min_subset_size or the_config.all_states:
        best_scheme = self.clean_scheme(self.results.best_scheme)
        best_result = self.analyse_scheme(best_scheme)

        # scores after cleaning can be worse, so we reset these trackers...
        self.results.best_result = best_result
        self.results.best_score = best_result.score
        self.results.best_scheme = best_scheme

        log.info("Best scoring scheme after cleaning is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, model_selection,
                  self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)

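# Illustrative sketch (not part of the module): how the rcluster cutoff above
# is derived. The numbers are invented: with 50 subsets there are
# 50*49/2 = 1225 candidate pairs, a cluster_percent of 10 keeps only the 123
# most similar pairs, and cluster_max caps that further when it is set.
import math

n_subsets = 50
max_schemes = n_subsets * (n_subsets - 1) // 2    # C(50, 2) = 1225 pairs
cluster_percent, cluster_max = 10.0, 1000

cutoff = int(math.ceil(max_schemes * (cluster_percent * 0.01)))
cutoff = max(cutoff, 1)
if cluster_max is not None and cutoff > cluster_max:
    cutoff = cluster_max

assert cutoff == 123
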