def do_analysis(self):
    """Greedy heuristic search over partitioning schemes.

    The search starts from the scheme built from the description
    ``range(number of partitions)``. Each round analyses every lumping of
    the current description. A round that changes
    ``self.results.best_score`` renames the best scheme to ``step_<n>``,
    writes its summary through the reporter and continues from that
    scheme's description. The search stops when a round leaves the best
    score unchanged, or when the description holds a single distinct value.
    The best scheme is finally written out together with a header line.
    """
    log.info("Performing greedy analysis")

    partition_total = len(self.cfg.partitions)
    max_schemes = submodels.count_greedy_schemes(partition_total)
    max_subsets = submodels.count_greedy_subsets(partition_total)
    self.cfg.progress.begin(max_schemes, max_subsets)

    # The most partitioned scheme is where the search begins
    description = range(len(self.cfg.partitions))
    first_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", description)

    log.info("Analysing starting scheme (scheme %s)" % first_scheme.name)
    self.analyse_scheme(first_scheme)

    step = 1
    next_scheme_id = 2  # numeric name given to each lumped scheme

    searching = True
    while searching:
        log.info("***Greedy algorithm step %d***" % step)

        # Every possible lumping of the description we are standing on
        candidates = algorithm.lumpings(description)

        # Remember the score so we can tell whether this round helped
        score_before = self.results.best_score

        for candidate in candidates:
            trial = scheme.create_scheme(self.cfg, next_scheme_id, candidate)
            next_scheme_id += 1
            # Analysing is enough: the results object keeps track of the
            # best scheme seen so far
            self.analyse_scheme(trial)

        if self.results.best_score == score_before:
            # Nothing in this round beat the previous best
            searching = False
        else:
            # Carry on from the scheme that just became the best one
            description = self.results.best_result.scheme.description

            # Rename and record the best scheme for this step
            self.results.best_scheme.name = "step_%d" % step
            self.cfg.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

            if len(set(description)) == 1:
                # Everything has been lumped together
                searching = False
            else:
                step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Highest scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                self.cfg.model_selection,
                self.results.best_score))

    txt = ("Best scheme according to Greedy algorithm, analysed with %s"
           % self.cfg.model_selection)
    self.cfg.reporter.write_best_scheme(txt, self.results)
def do_analysis(self):
    """A greedy algorithm for heuristic partitioning searches.

    Starts from the scheme with one subset per user subset and, at each
    step, considers merging every pair of current subsets:

    1. All pairs are selected (the cutoff equals the number of pairs).
    2. Merged subsets for pairs not yet scored in ``c_matrix`` are analysed.
    3. A scheme is analysed for each new pair and the score change relative
       to the current scheme is stored in ``c_matrix``.
    4. The pair with the most negative change is merged; if no change is
       negative the search stops.
    5. ``c_matrix`` and the subset list are reset for the merged pair.

    The best scheme is written through the reporter at the end; per-step
    summaries are skipped when ``the_config.quick`` is set.
    """
    partnum = len(the_config.user_subsets)
    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)

    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = list(start_scheme.subsets)

    step = 1
    while len(set(start_scheme.subsets)) > 1:
        with logtools.indented(log, "***Greedy algorithm step %d***" % step):
            # Bound once per step so it is always current when the best
            # scheme is built below, even if no new pairs were analysed.
            scheme_name = "step_%d" % step

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)

            # this is a fake distance matrix, so that the greedy algorithm
            # can use all the tricks of the relaxed clustering algorithm.
            # Integer division keeps the shape an int on Python 2 and 3.
            dim = len(subsets)
            d_matrix = np.zeros(((dim * dim) - dim) // 2)
            d_matrix[:] = np.inf

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            # this defines the greedy algorithm: we look at all schemes
            cutoff = max_schemes

            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(
                closest_pairs, c_matrix, subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                sub_tuples = [(subset_ops.merge_subsets(pair), pair)
                              for pair in pairs_todo]
                new_subs = [merged for merged, _ in sub_tuples]

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix and
                # find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                for pair_merged, pair in sub_tuples:
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged,
                        the_config)
                    r = self.analyse_scheme(new_scheme)
                    diffs.append(r.score - start_score)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based on
            # that. Note that this matrix includes diagonals, which will all
            # be zero since this is equivalent to comparing a scheme to
            # itself, so we need to be careful to only proceed if we have a
            # negative change, which indicates an improvement in the score
            best_change = np.amin(c_matrix)

            log.debug("Biggest improvement in info score: %s",
                      str(best_change))

            if best_change >= 0:
                log.info("Found no schemes that improve the score, stopping")
                break

            best_pair = neighbour.get_best_pair(
                c_matrix, best_change, subsets)

            best_merged = subset_ops.merge_subsets(best_pair)
            best_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, best_pair, best_merged,
                the_config)
            best_result = self.analyse_scheme(best_scheme)

            # the best change can get updated a fraction at this point
            # because calculating the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_change = self.results.best_score - start_score

            log.info("Best scheme combines subsets: '%s' and '%s'"
                     % (best_pair[0].name, best_pair[1].name))

            log.info("The best scheme improves the %s score by %.2f to %.1f",
                     the_config.model_selection,
                     np.abs(best_change),
                     self.results.best_score)
            start_scheme = best_scheme
            start_score = best_result.score

            log.debug("Best pair: %s", str([s.name for s in best_pair]))
            log.debug("Merged into: %s", str([best_merged.name]))

            # 5. reset_c_matrix and the subset list
            c_matrix = neighbour.reset_c_matrix(
                c_matrix, list(best_pair), [best_merged], subsets)

            # we updated the subset list in a special way, which matches
            # how we update the c matrix:
            subsets = neighbour.reset_subsets(
                subsets, list(best_pair), [best_merged])

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                the_config.model_selection,
                self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """A greedy algorithm for heuristic partitioning searches.

    The search begins with one subset per user subset. Every step scores
    the merge of each pair of current subsets, keeps the score changes in
    the square matrix ``c_matrix`` (so already-scored pairs are not
    analysed again), merges the pair with the most negative change and
    continues from the resulting scheme. It stops when no merge gives a
    negative change or when a single distinct subset remains, and then
    writes the best scheme through the reporter. Per-step summaries are
    only written when ``the_config.quick`` is false.
    """
    partnum = len(the_config.user_subsets)
    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)

    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                            range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = list(start_scheme.subsets)

    step = 1
    while len(set(start_scheme.subsets)) > 1:
        with logtools.indented(log, "***Greedy algorithm step %d***" % step):
            # Name shared by every scheme built in this step. Defined up
            # front so it can never be stale or unbound further down.
            scheme_name = "step_%d" % step

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)

            # this is a fake distance matrix, so that the greedy algorithm
            # can use all the tricks of the relaxed clustering algorithm
            dim = len(subsets)
            # Floor division: the array length must be an int, and "/"
            # yields a float under Python 3.
            d_matrix = np.zeros(((dim * dim) - dim) // 2)
            d_matrix[:] = np.inf

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            # this defines the greedy algorithm: we look at all schemes
            cutoff = max_schemes

            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix,
                                                  subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix and
                # find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                for pair_merged, pair in sub_tuples:
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged,
                        the_config)
                    r = self.analyse_scheme(new_scheme)
                    diffs.append(r.score - start_score)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based on
            # that. Note that this matrix includes diagonals, which will all
            # be zero since this is equivalent to comparing a scheme to
            # itself, so we need to be careful to only proceed if we have a
            # negative change, which indicates an improvement in the score
            best_change = np.amin(c_matrix)

            log.debug("Biggest improvement in info score: %s",
                      str(best_change))

            if best_change >= 0:
                log.info(
                    "Found no schemes that improve the score, stopping")
                break

            best_pair = neighbour.get_best_pair(c_matrix, best_change,
                                                subsets)

            best_merged = subset_ops.merge_subsets(best_pair)
            best_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, best_pair, best_merged,
                the_config)
            best_result = self.analyse_scheme(best_scheme)

            # the best change can get updated a fraction at this point
            # because calculating the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_change = self.results.best_score - start_score

            log.info("Best scheme combines subsets: '%s' and '%s'" %
                     (best_pair[0].name, best_pair[1].name))

            log.info(
                "The best scheme improves the %s score by %.2f to %.1f",
                the_config.model_selection,
                np.abs(best_change),
                self.results.best_score)
            start_scheme = best_scheme
            start_score = best_result.score

            log.debug("Best pair: %s", str([s.name for s in best_pair]))
            log.debug("Merged into: %s", str([best_merged.name]))

            # 5. reset_c_matrix and the subset list
            c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair),
                                                [best_merged], subsets)

            # we updated the subset list in a special way, which matches
            # how we update the c matrix:
            subsets = neighbour.reset_subsets(subsets, list(best_pair),
                                              [best_merged])

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
             (self.results.best_scheme.name,
              the_config.model_selection,
              self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """A greedy algorithm for heuristic partitioning searches.

    Analyses the fully partitioned scheme, then repeatedly analyses every
    lumping of the current description. The lowest-scoring lumping of a
    round becomes the new starting point if it beats the best score so far;
    otherwise the search stops. It also stops once the chosen description
    holds a single distinct value, or when a round has no lumpings at all.
    The winning result is stored in ``self.best_result``.

    Raises:
        AnalysisError: if ``self.cfg.model_selection`` is not one of
            ``"aic"``, ``"aicc"`` or ``"bic"``.
    """
    log.info("Performing greedy analysis")

    models = self.cfg.models
    model_selection = self.cfg.model_selection
    partnum = len(self.cfg.partitions)

    self.total_scheme_num = submodels.count_greedy_schemes(partnum)
    log.info("This will result in a maximum of %s schemes being created",
             self.total_scheme_num)

    self.total_subset_num = submodels.count_greedy_subsets(partnum)
    log.info(
        "PartitionFinder will have to analyse a maximum of %d subsets of "
        "sites to complete this analysis" % (self.total_subset_num)
    )

    if self.total_subset_num > 10000:
        log.warning(
            "%d is a lot of subsets, this might take a long time to analyse",
            self.total_subset_num)
        log.warning(
            "Perhaps consider using a different search scheme instead "
            "(see Manual)")

    # clear any schemes that are currently loaded
    # TODO Not sure we need this...
    self.cfg.schemes.clear_schemes()

    # start with the most partitioned scheme
    start_description = range(len(self.cfg.partitions))
    start_scheme = scheme.create_scheme(self.cfg, 1, start_description)
    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    result = self.analyse_scheme(start_scheme, models)

    def get_score(my_result):
        """Return the score of my_result under the configured criterion."""
        # TODO: this is bad. Should use self.cfg.model_selection, or write
        # a new model_selection for scheme.py
        if model_selection not in ("aic", "aicc", "bic"):
            # Report the offending setting itself; the score does not exist
            # on this path.
            log.error(
                "Unrecognised model_selection variable '%s', please check"
                % (model_selection))
            raise AnalysisError
        return getattr(my_result, model_selection)

    best_result = result
    best_score = get_score(result)

    step = 1
    cur_s = 2

    # now we try out all lumpings of the current scheme, to see if we can
    # find a better one and if we do, we just keep going
    while True:
        log.info("***Greedy algorithm step %d***" % step)

        # get a list of all possible lumpings of the best_scheme
        lumpings = algorithm.lumpings(start_description)

        # we reset the counters as we go, for better user information
        self.total_scheme_num = len(lumpings)
        self.schemes_analysed = 0

        best_lumping_score = None
        for lumped_description in lumpings:
            lumped_scheme = scheme.create_scheme(
                self.cfg, cur_s, lumped_description)
            cur_s += 1
            result = self.analyse_scheme(lumped_scheme, models)
            new_score = get_score(result)

            if best_lumping_score is None or new_score < best_lumping_score:
                best_lumping_score = new_score
                best_lumping_result = result
                best_lumping_desc = lumped_description

        # No lumpings at all (e.g. a single partition) means there is
        # nothing to compare, so the search is over.
        if best_lumping_score is None:
            break

        if not best_lumping_score < best_score:
            break

        best_score = best_lumping_score
        best_result = best_lumping_result
        start_description = best_lumping_desc
        if len(set(best_lumping_desc)) == 1:
            # then it's the scheme with everything equal, so quit
            break
        step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info(
        "Highest scoring scheme is scheme %s, with %s score of %.3f"
        % (best_result.scheme.name, model_selection, best_score)
    )

    self.best_result = best_result
def do_analysis(self):
    """Search for a good partitioning scheme with a greedy heuristic.

    The fully partitioned scheme is analysed first. Each later round
    analyses all lumpings of the current description and checks whether
    ``self.results.best_score`` moved. If it did, the best scheme is named
    ``step_<n>``, summarised by the reporter, and its description is used
    for the next round. The search ends when the score did not move, or
    when the description has only one distinct value left. The best scheme
    is then handed to the reporter.
    """
    log.info("Performing greedy analysis")

    n_partitions = len(self.cfg.partitions)
    total_schemes = submodels.count_greedy_schemes(n_partitions)
    total_subsets = submodels.count_greedy_subsets(n_partitions)
    self.cfg.progress.begin(total_schemes, total_subsets)

    # Begin from the scheme in which nothing has been lumped yet
    current_description = range(len(self.cfg.partitions))
    initial_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", current_description)

    log.info("Analysing starting scheme (scheme %s)" % initial_scheme.name)
    self.analyse_scheme(initial_scheme)

    step = 1
    scheme_number = 2  # running name for the lumped schemes

    while True:
        log.info("***Greedy algorithm step %d***" % step)

        # All lumpings reachable from the current description
        lumped_descriptions = algorithm.lumpings(current_description)

        # Snapshot of the best score before this round
        previous_best = self.results.best_score

        for lumped in lumped_descriptions:
            trial_scheme = scheme.create_scheme(
                self.cfg, scheme_number, lumped)
            scheme_number += 1
            # The results object remembers the scheme if it is an
            # improvement; it is written out later
            self.analyse_scheme(trial_scheme)

        # The best score only ever improves, so equality means no progress
        if self.results.best_score == previous_best:
            break

        # The scheme that just became best is the next starting point
        current_description = self.results.best_result.scheme.description

        # Rename and record the best scheme for this step
        self.results.best_scheme.name = "step_%d" % step
        self.cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)

        # One distinct value means everything has been lumped together
        if len(set(current_description)) == 1:
            break

        step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Highest scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                self.cfg.model_selection,
                self.results.best_score))

    self.cfg.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Split subsets with k-means, then run a greedy merge over the result.

    Phase one walks the subsets of the starting scheme. Each subset is
    passed to ``kmeans.kmeans_split_subset``. If the scheme containing the
    split pieces scores lower than the current best scheme, the pieces
    replace the subset and the same index is examined again. The walk moves
    on to the next subset when the split does not help, when the splitter
    returns ``1``, or when a ``PhylogenyProgramError`` is raised.

    Phase two repeatedly analyses the schemes built by
    ``neighbour.make_clustered_scheme`` for every pair of subsets in the
    current scheme. It continues from ``self.results.best_scheme`` for as
    long as ``self.results.best_score`` keeps changing, and finally writes
    the best scheme through the reporter.
    """
    # Copied and pasted from greedy analysis
    user_subset_total = len(self.cfg.user_subsets)
    max_schemes = submodels.count_greedy_schemes(user_subset_total)
    max_subsets = submodels.count_greedy_subsets(user_subset_total)
    self.cfg.progress.begin(max_schemes, max_subsets)

    # Start with the most partitioned scheme
    start_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", range(user_subset_total))

    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    self.analyse_scheme(start_scheme)

    # ---- Phase one: k-means splitting ----
    best_scheme = start_scheme
    position = 0
    all_subsets = list(best_scheme.subsets)

    tree_path = self.cfg.processor.make_tree_path(
        self.filtered_alignment_path)

    while position < len(all_subsets):
        current_subset = all_subsets[position]
        pieces = kmeans.kmeans_split_subset(
            self.cfg, self.alignment, current_subset, tree_path)

        if pieces == 1:
            # The splitter signalled that there is nothing to split
            position += 1
            continue

        # Build a new list in which the current subset is swapped for its
        # pieces. Growing the list this way lets the loop revisit the
        # pieces without any recursion.
        candidate_subsets = (all_subsets[:position]
                             + list(pieces)
                             + all_subsets[position + 1:])
        candidate_scheme = scheme.Scheme(
            self.cfg, "Current Scheme", candidate_subsets)

        try:
            best_result = self.analyse_scheme(best_scheme)
            new_result = self.analyse_scheme(candidate_scheme)

            log.info("Current best score is: " + str(best_result))
            log.info("Current new score is: " + str(new_result))

            if new_result.score < best_result.score:
                log.info("New score " + str(position) +
                         " is better and will be set to best score")
                best_scheme = candidate_scheme

                # Adopt the list with the split pieces in it. The index is
                # left alone, so it now points at a NEW subset, one that
                # came out of the split
                all_subsets = candidate_subsets
            else:
                # Move to the next subset in the all_subsets list
                position += 1

        # In PhyML or RAxML, it is likely because of no alignment patterns,
        # catch that and move to the next subset without splitting.
        except PhylogenyProgramError:
            log.info("Phylogeny program generated an error so this subset "
                     "was not split, see error above")
            position += 1

    # ---- Phase two: greedy merging ----
    # Now start the Greedy Analysis: need to figure out how to make it go
    # through more than one scheme...
    start_scheme = best_scheme
    split_total = len(start_scheme.subsets)
    max_schemes = submodels.count_greedy_schemes(split_total)
    max_subsets = submodels.count_greedy_subsets(split_total)
    self.cfg.progress.begin(max_schemes, max_subsets)

    step = 1
    next_scheme_id = 2

    # Try all pairwise lumpings of the current scheme; keep going for as
    # long as one of them improves the score
    while True:
        log.info("***Greedy algorithm step %d***" % step)

        score_before = self.results.best_score

        # Every pair of subsets in the scheme we are standing on
        for subset_pair in itertools.combinations(start_scheme.subsets, 2):
            lumped_scheme = neighbour.make_clustered_scheme(
                start_scheme, next_scheme_id, subset_pair, self.cfg)

            new_result = self.analyse_scheme(lumped_scheme)

            log.debug("Difference in %s: %.1f",
                      self.cfg.model_selection,
                      (new_result.score - score_before))

            next_scheme_id += 1

        if self.results.best_score == score_before:
            log.info("Analysed all schemes for this step and found no schemes "
                     "that improve the score, stopping")
            break

        log.info("Analysed all schemes for this step. The best "
                 "scheme changed the %s score by %.1f units.",
                 self.cfg.model_selection,
                 (self.results.best_score - score_before))

        self.results.best_scheme.name = "step_%d" % step
        self.cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)

        # The best lumping known for this step seeds the next one
        start_scheme = self.results.best_scheme

        # We're done if it's the scheme with everything together
        if len(set(lumped_scheme.subsets)) == 1:
            break

        step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                self.cfg.model_selection,
                self.results.best_score))

    self.cfg.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Partition the alignment by repeatedly splitting subsets with k-means.

    Steps, in order:

    1. Analyse the fully partitioned starting scheme and a scheme built
       from ``kmeans.kmeans_wrapper`` applied to each of its subsets; keep
       whichever scores lower as the current best scheme *and* result.
    2. Walk the best scheme's subsets. Each subset with more than one
       column that is not flagged ``fabricated`` is split with
       ``kmeans.kmeans_split_subset``; the split is kept when the scheme
       containing it scores lower than the current best result.
    3. Merge every fabricated subset with the subset whose centroid is
       nearest (Euclidean distance), re-analysing after each merge.
    4. Force ``self.results`` to the final scheme and write it out.
    """
    # Copied and pasted from greedy analysis
    partnum = len(self.cfg.user_subsets)
    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)
    self.cfg.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", start_description)

    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    self.analyse_scheme(start_scheme)

    # Get first scheme
    best_scheme = start_scheme
    subset_index = 0

    processor = self.cfg.processor
    alignment_path = self.filtered_alignment_path
    tree_path = processor.make_tree_path(alignment_path)

    split_subsets = []
    for a_subset in start_scheme:
        how_many = kmeans.kmeans_wrapper(self.cfg, self.alignment,
                                         a_subset, tree_path)
        split_subsets += how_many
    split_scheme = scheme.Scheme(self.cfg, "split_scheme", split_subsets)
    best_result = self.analyse_scheme(best_scheme)
    split_score = self.analyse_scheme(split_scheme)
    if split_score.score < best_result.score:
        best_scheme = split_scheme
        # Keep the result in step with the scheme; otherwise every later
        # comparison would be made against the starting scheme's score.
        best_result = split_score
        log.info("Initial splits generated superior scheme")
    all_subsets = list(best_scheme.subsets)

    fabricated_subsets = []
    step = 1

    while subset_index < len(all_subsets):
        log.info("Best scheme has %s score of %.2f and %d subset(s)"
                 % (self.cfg.model_selection.upper(), best_result.score,
                    len(best_scheme.subsets)))

        log.info("***Kmeans algorithm step %d***" % step)
        step += 1

        current_subset = all_subsets[subset_index]

        log.info("Analysing subset of %d sites", len(current_subset.columns))

        # First check if the subset is large enough to split, if it isn't,
        # move to the next subset
        if len(current_subset.columns) == 1:
            log.info("This subset cannot be split further")
            subset_index += 1
            continue

        if current_subset.fabricated:
            log.info("This subset cannot be split further because %s cannot analyse it",
                     self.cfg.phylogeny_program)
            subset_index += 1
            fabricated_subsets.append(current_subset)
            continue

        split_subsets = kmeans.kmeans_split_subset(
            self.cfg, self.alignment, current_subset, tree_path)

        # kmeans_split_subset will return a 1 and flag the subset as
        # fabricated if for some reason it raises a PhylogenyProgramError,
        # this is to catch those fabricated subsets
        if split_subsets == 1:
            subset_index += 1
            fabricated_subsets.append(current_subset)
            continue

        for each_subset in split_subsets:
            log.info("Subset resulting from split is %d sites long",
                     len(each_subset.columns))

        # Take a copy
        updated_subsets = all_subsets[:]

        # Replace the current one with the split one (slice assignment).
        # This list is the key to avoiding recursion. It expands to contain
        # all of the split subsets by replacing them with the split ones
        updated_subsets[subset_index:subset_index + 1] = split_subsets

        test_scheme = scheme.Scheme(self.cfg, str(step - 1), updated_subsets)

        new_result = self.analyse_scheme(test_scheme)

        if new_result.score < best_result.score:
            best_scheme = test_scheme
            best_result = new_result

            # Change this to the one with split subsets in it. Note that
            # the subset_index now points a NEW subset, one that was split
            all_subsets = updated_subsets

            # record each scheme that's an improvement
            self.cfg.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

            if len(split_subsets) == 2:
                log.info("Splitting subset into %d:%d sites improved the %s score"
                         % (len(split_subsets[0].columns),
                            len(split_subsets[1].columns),
                            self.cfg.model_selection))

                for s in split_subsets:
                    # Column index modulo 3; the log message reports the
                    # shares as 1st, 2nd and 3rd (presumably codon
                    # positions - TODO confirm)
                    remainders = [x % 3 for x in s.columns]
                    site_total = float(len(s.columns))
                    props = [(float(remainders.count(1)) / site_total),
                             (float(remainders.count(2)) / site_total),
                             (float(remainders.count(0)) / site_total)]
                    log.info("%d subset has 1st, 2nd, 3rd props: %s"
                             % (len(s.columns), str(props)))
        else:
            log.info("Splitting this subset did not improve the %s score",
                     self.cfg.model_selection.upper())
            # Move to the next subset in the all_subsets list
            subset_index += 1

    log.info("Best scheme has %s score of %.2f and %d subset(s)"
             % (self.cfg.model_selection.upper(), best_result.score,
                len(best_scheme.subsets)))

    if fabricated_subsets:
        log.info("Finalising partitioning scheme")
        log.info("This involves cleaning up small subsets which %s "
                 "can't analyse", self.cfg.phylogeny_program)

    # Now join the fabricated subsets back up with other subsets
    while fabricated_subsets:
        log.info("***Kmeans algorithm step %d***" % step)
        step += 1

        # Take the first subset in the list (to be "popped" off later)
        s = fabricated_subsets[0]
        centroid = s.centroid

        best_match = None

        # Take a list copy of the best scheme
        scheme_list = list(best_scheme)
        scheme_list.remove(s)
        # Loop through the subsets in the best scheme and find the one
        # with the nearest centroid
        for sub in scheme_list:
            centroid_array = [sub.centroid, centroid]
            # euclid_dist = abs(sub.centroid[0] - centroid[0])
            warnings.simplefilter('ignore', DeprecationWarning)
            euclid_dist = spatial.distance.pdist(centroid_array)
            # Test for None first: ordering comparisons against None fail
            # on Python 3 and "== None" on an array is elementwise.
            if best_match is None or euclid_dist < best_match:
                best_match = euclid_dist
                closest_sub = sub

        # Now merge those subsets
        merged_sub = subset_ops.merge_fabricated_subsets([s, closest_sub])

        # Remove the offending subset from the fabricated subset list
        fabricated_subsets.pop(0)
        # If the closest subset happens to be "fabricated" as well, take
        # it out of the fabricated_subsets list
        if closest_sub in fabricated_subsets:
            fabricated_subsets.remove(closest_sub)

        # Get rid of the two subsets that were merged from the best_scheme
        scheme_list.remove(closest_sub)

        # Now add the new subset to the scheme and see if the new subset
        # can be analyzed
        scheme_list.append(merged_sub)
        merged_scheme = scheme.Scheme(self.cfg, str(step - 1), scheme_list)

        merged_result = self.analyse_scheme(merged_scheme)
        # If it can be analyzed, move the algorithm forward, if it can't
        # be analyzed add it to the list of fabricated_subsets
        for new_subs in merged_scheme:
            if new_subs.fabricated and new_subs not in fabricated_subsets:
                fabricated_subsets.append(new_subs)
        best_scheme = merged_scheme
        best_result = merged_result

    # Since the AIC will likely be better before we dealt with the
    # fabricated subsets, we need to set the best scheme and best result
    # to those from the last merged_scheme. TODO: add a variable to scheme
    # to take care of this problem so that the best AND analysable scheme
    # is the one that gets automatically flagged as the best scheme
    self.results.best_scheme = best_scheme
    self.results.best_result = best_result

    self.cfg.reporter.write_scheme_summary(
        self.results.best_scheme, self.results.best_result)

    log.info("** Kmeans algorithm finished after %d steps **" % (step - 1))
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                self.cfg.model_selection,
                self.results.best_score))

    self.cfg.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Greedy search that merges one pair of subsets per step.

    The scheme with one subset per user subset is analysed first. Each
    step then analyses a clustered scheme for every pair of subsets in the
    current scheme. When ``self.results.best_score`` changes, the best
    scheme is renamed ``step_<n>``, summarised through the reporter and
    used as the next starting scheme. The search stops when the score does
    not change, or when the last scheme analysed in a step has a single
    distinct subset. The best scheme is written out at the end.
    """
    log.info("Performing greedy analysis")

    user_subset_total = len(self.cfg.user_subsets)
    max_schemes = submodels.count_greedy_schemes(user_subset_total)
    max_subsets = submodels.count_greedy_subsets(user_subset_total)
    self.cfg.progress.begin(max_schemes, max_subsets)

    # The search begins from the most partitioned scheme
    initial_description = range(user_subset_total)
    current_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", initial_description)

    log.info("Analysing starting scheme (scheme %s)" % current_scheme.name)
    self.analyse_scheme(current_scheme)

    step = 1
    next_scheme_id = 2  # numeric name given to each clustered scheme

    # Keep lumping pairs for as long as doing so improves the score
    while True:
        log.info("***Greedy algorithm step %d***" % step)

        score_before = self.results.best_score

        # Lazily enumerate every pair of subsets of the current scheme
        subset_pairs = itertools.combinations(current_scheme.subsets, 2)

        for subset_pair in subset_pairs:
            lumped_scheme = neighbour.make_clustered_scheme(
                current_scheme, next_scheme_id, subset_pair, self.cfg)

            pair_result = self.analyse_scheme(lumped_scheme)

            log.debug("Difference in %s: %.1f",
                      self.cfg.model_selection,
                      (pair_result.score - score_before))

            next_scheme_id += 1

        if self.results.best_score == score_before:
            log.info("Analysed all schemes for this step and found no schemes "
                     "that improve the score, stopping")
            break

        log.info("Analysed all schemes for this step. The best "
                 "scheme changed the %s score by %.1f units.",
                 self.cfg.model_selection,
                 (self.results.best_score - score_before))

        self.results.best_scheme.name = "step_%d" % step
        self.cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)

        # The best lumping known for this step seeds the next one
        current_scheme = self.results.best_scheme

        # We're done if it's the scheme with everything together
        if len(set(lumped_scheme.subsets)) == 1:
            break

        step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                self.cfg.model_selection,
                self.results.best_score))

    self.cfg.reporter.write_best_scheme(self.results)