def do_analysis(self):
        '''Run the greedy heuristic search for a good partitioning scheme.

        The search starts from the description ``range(partnum)`` (the most
        partitioned scheme). At every step each lumping returned by
        ``algorithm.lumpings`` for the current description is turned into a
        scheme and analysed. When a step changes the best score held in
        ``self.results``, the best scheme is renamed after the step, written
        out through the reporter, and its description seeds the next step.
        The search ends when a step leaves the best score unchanged or when
        the best description contains a single group.
        '''
        log.info("Performing greedy analysis")

        partnum = len(self.cfg.partitions)
        self.cfg.progress.begin(
            submodels.count_greedy_schemes(partnum),
            submodels.count_greedy_subsets(partnum))

        # The most partitioned scheme is the starting point of the search
        current_description = range(partnum)
        first_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", current_description)

        log.info("Analysing starting scheme (scheme %s)", first_scheme.name)
        self.analyse_scheme(first_scheme)

        step = 1
        next_scheme_id = 2  # numeric name handed to the next lumped scheme

        while True:
            log.info("***Greedy algorithm step %d***", step)

            # Every possible lumping of the current best description
            candidates = algorithm.lumpings(current_description)

            # Remember the best score so we can tell whether this step
            # improved on it
            score_before = self.results.best_score
            for candidate in candidates:
                # Analysing a scheme is enough: self.results keeps track of
                # the best one, which is written out after the loop
                self.analyse_scheme(
                    scheme.create_scheme(self.cfg, next_scheme_id, candidate))
                next_scheme_id += 1

            # The best score only ever gets better (see results.py), so an
            # unchanged score means no lumping helped and we are done
            if self.results.best_score == score_before:
                break

            # Continue from the scheme that just became the best one
            current_description = self.results.best_result.scheme.description

            # Name the winner after this step and record it
            self.results.best_scheme.name = "step_%d" % step
            self.cfg.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

            # Everything is lumped together: nothing left to try
            if len(set(current_description)) == 1:
                break

            step += 1

        log.info("Greedy algorithm finished after %d steps", step)
        log.info("Highest scoring scheme is scheme %s, with %s score of %.3f",
                 self.results.best_scheme.name, self.cfg.model_selection,
                 self.results.best_score)

        title = "Best scheme according to Greedy algorithm, analysed with %s" % self.cfg.model_selection
        self.cfg.reporter.write_best_scheme(title, self.results)
示例#2
0
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches.

        The search starts from the scheme built from ``range(partnum)`` (the
        most partitioned scheme). On every step all subset pairs that have
        not been analysed yet are merged and scored, the change in score of
        each resulting scheme relative to the current scheme is stored in
        ``c_matrix``, and the pair with the most negative change becomes the
        basis of the next step. The loop ends when the current scheme has a
        single subset or when no pair gives a negative change. The best
        scheme held in ``self.results`` is then written by the reporter.
        '''

        partnum = len(the_config.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(
                the_config, "start_scheme", range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = list(start_scheme.subsets)

        step = 1
        while len(set(start_scheme.subsets)) > 1:
            with logtools.indented(log, "***Greedy algorithm step %d***" % step):
                # Every scheme built during this step carries this name. It
                # is bound here rather than inside the "pairs_todo" branch so
                # that it is always defined, and always refers to the current
                # step, when the best scheme is built further down.
                scheme_name = "step_%d" % step

                # number of subset pairs in the current scheme
                max_schemes = comb(len(start_scheme.subsets), 2)

                # this is a fake distance matrix, so that the greedy algorithm
                # can use all the tricks of the relaxed clustering algorithm.
                # It is a condensed vector with one entry per subset pair;
                # floor division keeps its length an integer under both
                # classic and true division (dim*dim - dim is always even).
                dim = len(subsets)
                d_matrix = np.zeros((dim * dim - dim) // 2)
                d_matrix[:] = np.inf

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                # this defines the greedy algorithm: we look at all schemes
                cutoff = max_schemes

                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(
                    closest_pairs, c_matrix, subsets)
                if len(pairs_todo) > 0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and
                    # find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    for pair_merged, pair in sub_tuples:
                        new_scheme = neighbour.make_clustered_scheme(
                            start_scheme, scheme_name, pair, pair_merged,
                            the_config)
                        r = self.analyse_scheme(new_scheme)
                        # negative values mean the merged scheme scores lower
                        # than the current one
                        diffs.append(r.score - start_score)

                    c_matrix = neighbour.update_c_matrix(
                        c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)

                log.debug("Biggest improvement in info score: %s", str(best_change))

                if best_change >= 0:
                    log.info("Found no schemes that improve the score, stopping")
                    break

                best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)

                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged, the_config)
                best_result = self.analyse_scheme(best_scheme)

                # the best change can get updated a fraction at this point
                # because calculating the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_change = self.results.best_score - start_score

                log.info("Best scheme combines subsets: '%s' and '%s'" %(best_pair[0].name, best_pair[1].name))

                log.info("The best scheme improves the %s score by %.2f to %.1f",
                    the_config.model_selection,
                    np.abs(best_change),
                    self.results.best_score)

                # the next step starts from the scheme we just picked
                start_scheme = best_scheme
                start_score = best_result.score

                log.debug("Best pair: %s", str([s.name for s in best_pair]))
                log.debug("Merged into: %s", str([best_merged.name]))

                # 5. reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

                # we updated the subset list in a special way, which matches how we update the c matrix:
                subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, the_config.model_selection,
                    self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)
示例#3
0
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches.

        The search starts from the scheme built from ``range(partnum)`` (the
        most partitioned scheme). On every step all subset pairs that have
        not been analysed yet are merged and scored, the change in score of
        each resulting scheme relative to the current scheme is stored in
        ``c_matrix``, and the pair with the most negative change becomes the
        basis of the next step. The loop ends when the current scheme has a
        single subset or when no pair gives a negative change. The best
        scheme held in ``self.results`` is then written by the reporter.
        '''

        partnum = len(the_config.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                                range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = list(start_scheme.subsets)

        step = 1
        while len(set(start_scheme.subsets)) > 1:
            with logtools.indented(log,
                                   "***Greedy algorithm step %d***" % step):
                # Every scheme built during this step carries this name. It
                # is bound here rather than inside the "pairs_todo" branch so
                # that it is always defined, and always refers to the current
                # step, when the best scheme is built further down.
                scheme_name = "step_%d" % step

                # number of subset pairs in the current scheme
                max_schemes = comb(len(start_scheme.subsets), 2)

                # this is a fake distance matrix, so that the greedy algorithm
                # can use all the tricks of the relaxed clustering algorithm.
                # It is a condensed vector with one entry per subset pair;
                # floor division keeps its length an integer under both
                # classic and true division (dim*dim - dim is always even).
                dim = len(subsets)
                d_matrix = np.zeros((dim * dim - dim) // 2)
                d_matrix[:] = np.inf

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                # this defines the greedy algorithm: we look at all schemes
                cutoff = max_schemes

                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix,
                                                      subsets)
                if len(pairs_todo) > 0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and
                    # find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    for pair_merged, pair in sub_tuples:
                        new_scheme = neighbour.make_clustered_scheme(
                            start_scheme, scheme_name, pair, pair_merged,
                            the_config)
                        r = self.analyse_scheme(new_scheme)
                        # negative values mean the merged scheme scores lower
                        # than the current one
                        diffs.append(r.score - start_score)

                    c_matrix = neighbour.update_c_matrix(
                        c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)

                log.debug("Biggest improvement in info score: %s",
                          str(best_change))

                if best_change >= 0:
                    log.info(
                        "Found no schemes that improve the score, stopping")
                    break

                best_pair = neighbour.get_best_pair(c_matrix, best_change,
                                                    subsets)

                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged,
                    the_config)
                best_result = self.analyse_scheme(best_scheme)

                # the best change can get updated a fraction at this point
                # because calculating the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_change = self.results.best_score - start_score

                log.info("Best scheme combines subsets: '%s' and '%s'" %
                         (best_pair[0].name, best_pair[1].name))

                log.info(
                    "The best scheme improves the %s score by %.2f to %.1f",
                    the_config.model_selection, np.abs(best_change),
                    self.results.best_score)

                # the next step starts from the scheme we just picked
                start_scheme = best_scheme
                start_score = best_result.score

                log.debug("Best pair: %s", str([s.name for s in best_pair]))
                log.debug("Merged into: %s", str([best_merged.name]))

                # 5. reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair),
                                                    [best_merged], subsets)

                # we updated the subset list in a special way, which matches how we update the c matrix:
                subsets = neighbour.reset_subsets(subsets, list(best_pair),
                                                  [best_merged])

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, the_config.model_selection,
                  self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)
    def do_analysis(self):
        """A greedy algorithm for heuristic partitioning searches.

        Starting from the most partitioned scheme, every step analyses all
        lumpings of the current description and keeps the lowest-scoring one.
        The search continues from that lumping for as long as it scores lower
        than the best scheme found so far, and stops once everything has been
        lumped into a single group. The winning result is stored in
        ``self.best_result``.

        Raises AnalysisError when ``self.cfg.model_selection`` is not one of
        "aic", "aicc" or "bic".
        """
        log.info("Performing greedy analysis")
        models = self.cfg.models
        model_selection = self.cfg.model_selection
        partnum = len(self.cfg.partitions)

        self.total_scheme_num = submodels.count_greedy_schemes(partnum)
        log.info("This will result in a maximum of %s schemes being created", self.total_scheme_num)

        self.total_subset_num = submodels.count_greedy_subsets(partnum)
        log.info(
            "PartitionFinder will have to analyse a maximum of %d subsets of sites to complete this analysis"
            % (self.total_subset_num)
        )

        if self.total_subset_num > 10000:
            log.warning("%d is a lot of subsets, this might take a long time to analyse", self.total_subset_num)
            log.warning("Perhaps consider using a different search scheme instead (see Manual)")

        # clear any schemes that are currently loaded
        # TODO Not sure we need this...
        self.cfg.schemes.clear_schemes()

        # start with the most partitioned scheme
        start_description = range(len(self.cfg.partitions))
        start_scheme = scheme.create_scheme(self.cfg, 1, start_description)
        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        result = self.analyse_scheme(start_scheme, models)

        def get_score(my_result):
            """Return the score of my_result under the configured criterion."""
            # TODO: this is bad. Should use self.cfg.model_selection, or write
            # a new model_selection for scheme.py
            if model_selection not in ("aic", "aicc", "bic"):
                # Report the offending setting itself; no score exists yet
                # on this path.
                log.error("Unrecognised model_selection variable '%s', please check" % (model_selection))
                raise AnalysisError
            # the result attributes are named after the criteria
            return getattr(my_result, model_selection)

        best_result = result
        best_score = get_score(result)

        step = 1
        cur_s = 2

        # now we try out all lumpings of the current scheme, to see if we can find a better one
        # and if we do, we just keep going
        while True:
            log.info("***Greedy algorithm step %d***" % step)

            # get a list of all possible lumpings of the best_scheme
            lumpings = algorithm.lumpings(start_description)

            # we reset the counters as we go, for better user information
            self.total_scheme_num = len(lumpings)
            self.schemes_analysed = 0

            best_lumping_score = None
            for lumped_description in lumpings:
                lumped_scheme = scheme.create_scheme(self.cfg, cur_s, lumped_description)
                cur_s += 1
                result = self.analyse_scheme(lumped_scheme, models)
                new_score = get_score(result)

                if best_lumping_score is None or new_score < best_lumping_score:
                    best_lumping_score = new_score
                    best_lumping_result = result
                    best_lumping_desc = lumped_description

            if best_lumping_score is None:
                # There was nothing to lump (e.g. a single partition), so the
                # per-step "best lumping" names were never bound: stop here.
                break

            if best_lumping_score < best_score:
                best_score = best_lumping_score
                best_result = best_lumping_result
                start_description = best_lumping_desc
                if len(set(best_lumping_desc)) == 1:  # then it's the scheme with everything equal, so quit
                    break
                step += 1

            else:
                break

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info(
            "Highest scoring scheme is scheme %s, with %s score of %.3f"
            % (best_result.scheme.name, model_selection, best_score)
        )

        self.best_result = best_result
示例#5
0
    def do_analysis(self):
        '''Run the greedy heuristic search for a good partitioning scheme.

        Beginning with the most partitioned scheme, each step analyses every
        lumping that ``algorithm.lumpings`` returns for the current
        description. If the best score in ``self.results`` changed during the
        step, the best scheme is renamed "step_N", summarised by the reporter
        and used as the start of the next step; otherwise the search stops.
        It also stops once the best description has a single group. Finally
        the best scheme is written by the reporter.
        '''
        log.info("Performing greedy analysis")

        partnum = len(self.cfg.partitions)
        self.cfg.progress.begin(
            submodels.count_greedy_schemes(partnum),
            submodels.count_greedy_subsets(partnum))

        # Begin the search at the most partitioned scheme
        description = range(partnum)
        initial_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", description)

        log.info("Analysing starting scheme (scheme %s)", initial_scheme.name)
        self.analyse_scheme(initial_scheme)

        step = 1
        scheme_number = 2  # numeric name given to the next lumped scheme

        while True:
            log.info("***Greedy algorithm step %d***", step)

            # All lumpings of the description we are currently working from
            lumped_descriptions = algorithm.lumpings(description)

            # Snapshot of the best score before this round of lumpings
            previous_best = self.results.best_score
            for lumped in lumped_descriptions:
                # We only need to analyse each scheme; self.results remembers
                # the best one so it can be reported after the loop
                candidate_scheme = scheme.create_scheme(
                    self.cfg, scheme_number, lumped)
                scheme_number += 1
                self.analyse_scheme(candidate_scheme)

            # The best score can only improve (see results.py); if it is
            # unchanged, none of the lumpings was better and we stop
            if self.results.best_score == previous_best:
                break

            # Carry on from the best scheme, which is the one found during
            # the round of lumpings that just finished
            description = self.results.best_result.scheme.description

            # Label the best scheme with this step and write its summary
            self.results.best_scheme.name = "step_%d" % step
            self.cfg.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

            # A description with a single group cannot be lumped any further
            if len(set(description)) == 1:
                break

            step += 1

        log.info("Greedy algorithm finished after %d steps", step)
        log.info("Highest scoring scheme is scheme %s, with %s score of %.3f",
                 self.results.best_scheme.name, self.cfg.model_selection,
                 self.results.best_score)

        self.cfg.reporter.write_best_scheme(self.results)
    def do_analysis(self):
        '''Split subsets with k-means, then run a greedy search on the result.

        Phase one walks over the subsets of the starting scheme and asks
        ``kmeans.kmeans_split_subset`` to split each one. A split is kept
        when the scheme containing it scores lower than the current best
        scheme; otherwise the walk moves on to the next subset.

        Phase two repeatedly analyses every scheme obtained by combining a
        pair of subsets of the current scheme, and continues from the best
        scheme recorded in ``self.results`` until a step leaves the best
        score unchanged or a single subset remains. The best scheme is then
        written by the reporter.
        '''
        # Copied and pasted from greedy analysis
        partnum = len(self.cfg.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", start_description)

        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        self.analyse_scheme(start_scheme)

        # Get first scheme
        best_scheme = start_scheme
        subset_index = 0
        all_subsets = list(best_scheme.subsets)
        processor = self.cfg.processor
        alignment_path = self.filtered_alignment_path
        tree_path = processor.make_tree_path(alignment_path)

        while subset_index < len(all_subsets):
            current_subset = all_subsets[subset_index]
            split_subsets = kmeans.kmeans_split_subset(self.cfg, self.alignment, current_subset, tree_path)

            # a return value of 1 means the subset was not split
            if split_subsets == 1:
                subset_index += 1

            else:
                # Take a copy
                updated_subsets = all_subsets[:]

                # Replace the current one with the split one
                # Google "slice assignments"
                # This list is the key to avoiding recursion. It expands to contain
                # all of the split subsets by replacing them with the split ones
                updated_subsets[subset_index:subset_index+1] = split_subsets

                test_scheme = scheme.Scheme(self.cfg, "Current Scheme", updated_subsets)

                try:
                    best_result = self.analyse_scheme(best_scheme)
                    new_result = self.analyse_scheme(test_scheme)

                    log.info("Current best score is: " + str(best_result))
                    log.info("Current new score is: " + str(new_result))
                    if new_result.score < best_result.score:
                        log.info("New score " + str(subset_index) + " is better and will be set to best score")
                        best_scheme = test_scheme

                        # Change this to the one with split subsets in it. Note that
                        # the subset_index now points a NEW subset, one that was split
                        all_subsets = updated_subsets
                    else:
                        # Move to the next subset in the all_subsets list
                        subset_index += 1

                # In PhyML or RAxML, it is likely because of no alignment patterns,
                # catch that and move to the next subset without splitting.
                except PhylogenyProgramError:
                    log.info("Phylogeny program generated an error so this subset was not split, see error above")
                    subset_index += 1
        # Now start the Greedy Analysis: need to figure out how to make it go through more
        # than one scheme...

        start_scheme = best_scheme
        partnum = len(start_scheme.subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)
        self.cfg.progress.begin(scheme_count, subset_count)

        step = 1
        cur_s = 2  # numeric name given to the next lumped scheme

        # Now we try out all lumpings of the current scheme, to see if we can
        # find a better one and if we do, we just keep going
        while True:
            log.info("***Greedy algorithm step %d***" % step)

            old_best_score = self.results.best_score

            # Get an iterable of all possible pairs of subsets in best_scheme
            lumped_subsets = itertools.combinations(start_scheme.subsets, 2)

            for subset_grouping in lumped_subsets:
                lumped_scheme = neighbour.make_clustered_scheme(
                    start_scheme, cur_s, subset_grouping, self.cfg)

                new_result = self.analyse_scheme(lumped_scheme)

                log.debug("Difference in %s: %.1f",
                          self.cfg.model_selection,
                          (new_result.score-old_best_score))

                cur_s += 1

            if self.results.best_score != old_best_score:
                log.info("Analysed all schemes for this step. The best "
                         "scheme changed the %s score by %.1f units.",
                         self.cfg.model_selection,
                         (self.results.best_score - old_best_score))

                self.results.best_scheme.name = "step_%d" % step
                self.cfg.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

                # Now we find out which is the best lumping we know of for this step
                start_scheme = self.results.best_scheme
            else:
                log.info("Analysed all schemes for this step and found no schemes "
                         "that improve the score, stopping")
                break

            # We're done if it's the scheme with everything together. The
            # check looks at the scheme we just adopted rather than at a
            # variable left over from the loop above.
            if len(set(start_scheme.subsets)) == 1:
                break

            step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, self.cfg.model_selection, self.results.best_score))

        self.cfg.reporter.write_best_scheme(self.results)
    def do_analysis(self):
        '''Search for a better scheme by splitting subsets with k-means.

        Every subset of the starting scheme is first passed through
        ``kmeans.kmeans_wrapper`` and the resulting "split_scheme" is adopted
        if it scores lower than the starting scheme. The method then walks
        over the subsets of the best scheme and tries to split each one with
        ``kmeans.kmeans_split_subset``; a split is kept when the scheme
        containing it scores lower than the best result so far, otherwise the
        walk moves on to the next subset. Subsets with a single column and
        subsets flagged as fabricated are skipped.
        '''
        # Copied and pasted from greedy analysis
        partnum = len(self.cfg.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", start_description)

        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        # NOTE(review): old_score is not read in the visible part of this method
        old_score = self.analyse_scheme(start_scheme)

        # Get first scheme
        best_scheme = start_scheme
        subset_index = 0

        processor = self.cfg.processor
        alignment_path = self.filtered_alignment_path
        tree_path = processor.make_tree_path(alignment_path)

        # Build a scheme from the k-means output for every starting subset
        split_subsets = []
        for a_subset in start_scheme:
            how_many = kmeans.kmeans_wrapper(self.cfg, self.alignment,
                a_subset, tree_path)
            split_subsets += how_many
        split_scheme = scheme.Scheme(self.cfg, "split_scheme", split_subsets)
        best_result = self.analyse_scheme(best_scheme)
        split_score = self.analyse_scheme(split_scheme)
        if split_score.score < best_result.score:
            # Keep the scheme and its result in step: every later comparison
            # and log message pairs best_result with best_scheme.
            best_scheme = split_scheme
            best_result = split_score
            log.info("Initial splits generated superior scheme")
        all_subsets = list(best_scheme.subsets)

        # NOTE(review): collected below but not read in the visible part of
        # this method
        fabricated_subsets = []
        step = 1

        while subset_index < len(all_subsets):
            log.info("Best scheme has %s score of %.2f and %d subset(s)"
                     %(self.cfg.model_selection.upper(), best_result.score, len(best_scheme.subsets)))

            log.info("***Kmeans algorithm step %d***" % step)
            step += 1

            current_subset = all_subsets[subset_index]

            log.info("Analysing subset of %d sites", len(current_subset.columns))

            # First check if the subset is large enough to split, if it isn't,
            # move to the next subset
            if len(current_subset.columns) == 1:
                log.info("This subset cannot be split further")
                subset_index += 1
                continue

            if current_subset.fabricated:
                log.info("This subset cannot be split further because %s cannot analyse it",
                        self.cfg.phylogeny_program)
                subset_index += 1
                fabricated_subsets.append(current_subset)
                continue

            split_subsets = kmeans.kmeans_split_subset(self.cfg,
                self.alignment, current_subset, tree_path)

            # kmeans_split_subset will return a 1 and flag the subset as
            # fabricated if for some reason it raises a PhylogenyProgramError,
            # this is to catch those fabricated subsets
            if split_subsets == 1:
                subset_index += 1
                fabricated_subsets.append(current_subset)
                continue

            for each_subset in split_subsets:
                log.info("Subset resulting from split is %d sites long", len(each_subset.columns))

            # Take a copy
            updated_subsets = all_subsets[:]

            # Replace the current one with the split one
            # Google "slice assignments"
            # This list is the key to avoiding recursion. It expands to contain
            # all of the split subsets by replacing them with the split ones
            updated_subsets[subset_index:subset_index+1] = split_subsets

            # step was already incremented above, so step-1 is this step
            test_scheme = scheme.Scheme(self.cfg, str(step-1),
                updated_subsets)

            new_result = self.analyse_scheme(test_scheme)

            if new_result.score < best_result.score:
                best_scheme = test_scheme
                best_result = new_result

                # Change this to the one with split subsets in it. Note that
                # the subset_index now points a NEW subset, one that was split
                all_subsets = updated_subsets

                # record each scheme that's an improvement
                self.cfg.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

                if len(split_subsets)==2:
                    log.info("Splitting subset into %d:%d sites improved the %s score"
                              %(len(split_subsets[0].columns),
                                len(split_subsets[1].columns),
                                self.cfg.model_selection))

                    for s in split_subsets:
                        # Proportion of columns whose index leaves remainder
                        # 1, 2 and 0 modulo 3 (presumably the three codon
                        # positions -- confirm against how columns are
                        # numbered)
                        m = [x%3 for x in s.columns]
                        l = float(len(s.columns))
                        props = [(float(m.count(1))/l), (float(m.count(2))/l), (float(m.count(0))/l)]
                        log.info("%d subset has 1st, 2nd, 3rd props: %s" %(len(s.columns), str(props)))

            else:
                log.info("Splitting this subset did not improve the %s score",
                         self.cfg.model_selection.upper())
                # Move to the next subset in the all_subsets list
                subset_index += 1

        log.info("Best scheme has %s score of %.2f and %d subset(s)"
                 %(self.cfg.model_selection.upper(), best_result.score, len(best_scheme.subsets)))

        if fabricated_subsets:
            log.info("Finalising partitioning scheme")
            log.info("This involves cleaning up small subsets which %s "
                     "can't analyse", self.cfg.phylogeny_program)

        # Now join the fabricated subsets back up with other subsets
        while fabricated_subsets:
            log.info("***Kmeans algorithm step %d***" % step)
            step += 1

            # Take the first subset in the list (to be "popped" off later)
            s = fabricated_subsets[0]
            centroid = s.centroid

            best_match = None

            # Take a list copy of the best scheme
            scheme_list = list(best_scheme)
            scheme_list.remove(s)
            # Loop through the subsets in the best scheme and find the one
            # with the nearest centroid
            for sub in scheme_list:
                centroid_array = [sub.centroid, centroid]
                # euclid_dist = abs(sub.centroid[0] - centroid[0])
                warnings.simplefilter('ignore', DeprecationWarning)
                euclid_dist = spatial.distance.pdist(centroid_array)
                if euclid_dist < best_match or best_match == None:
                    best_match = euclid_dist
                    closest_sub = sub

            # Now merge those subsets
            merged_sub = subset_ops.merge_fabricated_subsets([s, closest_sub])

            # Remove the offending subset from the fabricated subset list
            fabricated_subsets.pop(0)
            # If the closest subset happens to be "fabricated" as well, take
            # it out of the fabricated_subsets list
            if closest_sub in fabricated_subsets:
                fabricated_subsets.remove(closest_sub)

            # Get rid of the two subsets that were merged from the best_scheme
            scheme_list.remove(closest_sub)

            # Now add the new subset to the scheme and see if the new subset
            # can be analyzed
            scheme_list.append(merged_sub)
            merged_scheme = scheme.Scheme(self.cfg, str(step-1), scheme_list)

            merged_result = self.analyse_scheme(merged_scheme)
            # If it can be analyzed, move the algorithm forward, if it can't
            # be analyzed add it to the list of fabricated_subsets
            for new_subs in merged_scheme:
                if new_subs.fabricated and new_subs not in fabricated_subsets:
                    fabricated_subsets.append(new_subs)
            best_scheme = merged_scheme
            best_result = merged_result

        # Since the AIC will likely be better before we dealt with the
        # fabricated subsets, we need to set the best scheme and best result
        # to those from the last merged_scheme. TODO: add a variable to scheme
        # to take care of this problem so that the best AND analysable scheme
        # is the one that gets automatically flagged as the best scheme
        self.results.best_scheme = best_scheme
        self.results.best_result = best_result

        self.cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)

        log.info("** Kmeans algorithm finished after %d steps **" % (step - 1))
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, self.cfg.model_selection, self.results.best_score))

        self.cfg.reporter.write_best_scheme(self.results)
    def do_analysis(self):
        '''Run the greedy heuristic search over partitioning schemes.

        The search starts from the scheme built from one entry per user
        subset. At every step a candidate scheme is built and analysed for
        each pair of subsets in the current starting scheme. If the best
        score held in ``self.results`` changed during the step, the best
        scheme is renamed ``step_<n>``, summarised through the reporter and
        used as the starting point of the next step. The search stops when a
        step leaves the best score unchanged, or when the last candidate
        built contains a single distinct subset. The best scheme is written
        out through the reporter at the end.
        '''

        log.info("Performing greedy analysis")

        n_parts = len(self.cfg.user_subsets)
        expected_schemes = submodels.count_greedy_schemes(n_parts)
        expected_subsets = submodels.count_greedy_subsets(n_parts)
        self.cfg.progress.begin(expected_schemes, expected_subsets)

        # The initial scheme keeps every user subset separate
        start_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", range(n_parts))

        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        self.analyse_scheme(start_scheme)

        # Candidate schemes are named with consecutive integers from 2 on,
        # and the numbering carries on across steps
        scheme_ids = itertools.count(2)

        for step in itertools.count(1):
            log.info("***Greedy algorithm step %d***" % step)

            # Remember the score going into this step so that we can tell
            # afterwards whether any candidate changed it
            score_before = self.results.best_score

            # Build and analyse one candidate per pair of subsets
            for pair in itertools.combinations(start_scheme.subsets, 2):
                candidate = neighbour.make_clustered_scheme(
                    start_scheme, next(scheme_ids), pair, self.cfg)

                candidate_result = self.analyse_scheme(candidate)

                log.debug("Difference in %s: %.1f",
                          self.cfg.model_selection,
                          (candidate_result.score - score_before))

            if self.results.best_score == score_before:
                log.info("Analysed all schemes for this step and found no schemes "
                         "that improve the score, stopping")
                break

            log.info("Analysed all schemes for this step. The best "
                     "scheme changed the %s score by %.1f units.",
                     self.cfg.model_selection,
                     (self.results.best_score - score_before))

            self.results.best_scheme.name = "step_%d" % step
            self.cfg.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

            # The next step starts from the best scheme found so far
            start_scheme = self.results.best_scheme

            # Stop once the last candidate built has everything together
            if len(set(candidate.subsets)) == 1:
                break

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, self.cfg.model_selection, self.results.best_score))

        self.cfg.reporter.write_best_scheme(self.results)