Example #1
    def install_base(self):
        db_conn = psycopg2.connect(
            host=pg_conf['host'],
            port=pg_conf['port'],
            user=pg_conf['login'],
            password=pg_conf['password'],
            database=pg_conf['db_name'],
            cursor_factory=DictCursor
        )
        create_scheme(db_conn, self.options['current_folder']+'/scheme/scheme.sql')
        parsers = [
            #AddressObjectType,
            #CenterStatus,
            #CurrentStatus,
            #OperationStatus,
            #ActualStatus,
            #IntervalStatus,
            #StructureStatus,
            #HouseStateStatus,
            #EstateStatus,
            #NormativeDocumentType,
            #NormativeDocument,
            #AddressObject,
            #House,
            #HouseInterval,
            # Room,
            Stead,
            Landmark,

        ]

        for parser in parsers:
            obj_parser = parser(db_connection=db_conn, archive=self.options['current_folder']+'/fias_xml.rar')
            obj_parser.on('start_element:handled', change_state)
            state['install'][parser.__name__] = {
                'status': 'started',
                'started_at': str(datetime.now()),
                'stopped_at': None,
                'records_added': obj_parser.records_counter
            }
            try:
                obj_parser.parse()
            except Exception as e:
                state['install'][parser.__name__].update({
                    'status': 'failed',
                    'reason': str(e),
                    'stopped_at': str(datetime.now()),
                    'records_added': obj_parser.records_counter
                })
            else:
                # parse() raised nothing, so this parser can be marked complete
                state['install'][parser.__name__].update({
                    'status': 'complete',
                    'stopped_at': str(datetime.now()),
                    'records_added': obj_parser.records_counter
                })
            db_conn.commit()
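
A note on the pattern above: the completion record must only be written when parse() raised nothing, which is what the try/except/else achieves. Here is a minimal, self-contained sketch of that bookkeeping (the state dict and the task callables are hypothetical stand-ins, not part of the example's codebase):

from datetime import datetime

state = {'install': {}}

def run_task(name, task):
    # Record that the task has started.
    state['install'][name] = {
        'status': 'started',
        'started_at': str(datetime.now()),
        'stopped_at': None,
    }
    try:
        task()
    except Exception as e:
        # A failure is recorded and must not be overwritten later.
        state['install'][name].update({
            'status': 'failed',
            'reason': str(e),
            'stopped_at': str(datetime.now()),
        })
    else:
        # Reached only when task() raised nothing.
        state['install'][name].update({
            'status': 'complete',
            'stopped_at': str(datetime.now()),
        })

run_task('ok', lambda: None)
run_task('broken', lambda: 1 / 0)
print(state['install']['ok']['status'])      # complete
print(state['install']['broken']['status'])  # failed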
Example #2
    def setup(self):

        log.warning(
            "Warning as of April 2016: We have noticed that the kmeans \
            algorithm does not perform well on some simulated datasets. \
            We are working on investigating and addressing this \
            but in the meantime we suggest being very cautious about using \
            this algorithm. At the very least, you should try other approaches \
            (e.g. partitioning by locus), and investigate your answers carefully \
            (both the trees and the partitioning schemes). If you have any \
            questions, please get in touch on the google group. Note that this \
            warning does not apply to cases where you are using models that have \
            an ascertainment bias for datasets that include only variable sites \
            as is often the case with morphological analyses.")

        # set the default subset size to 100 for kmeans analyses
        if the_config.min_subset_size == False:
            the_config.min_subset_size = 100

        partnum = len(the_config.user_subsets)
        the_config.progress.begin(1, 1)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                            start_description)

        site_max = sum([len(s.columns) for s in start_scheme.subsets])

        if the_config.min_subset_size > site_max:
            log.error("The minimum subset size must be smaller than the \
                total number of sites you want to analyse. Your minimum \
                subset size is %d, and your alignment is %d sites. Please \
                check and try again." % (the_config.min_subset_size, site_max))
            raise AnalysisError

        with logtools.indented(
                log, "**Analysing starting scheme (scheme %s)**" %
                start_scheme.name):
            start_result = self.analyse_scheme(start_scheme)

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    start_scheme, start_result)

            tree_path = the_config.processor.make_tree_path(
                self.filtered_alignment_path)

        if the_config.kmeans == 'tiger' and the_config.datatype != 'morphology':
            log.error("You have selected kmeans and tiger \
                rates. This is an unsupported option for anything except \
                morphological data. The kmeans algorithm \
                now works with entropies, not TIGER rates.")
            raise AnalysisError

        return start_result, start_scheme, tree_path
Example #3
    def do_analysis(self):
        log.info("Performing clustering analysis")

        partnum = len(self.cfg.partitions)
        subset_count = 2 * partnum - 1
        scheme_count = partnum
        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme
        start_description = range(len(self.cfg.partitions))
        start_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", start_description)

        # Analyse our first scheme
        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        self.analyse_scheme(start_scheme)

        # Current scheme number
        cur_s = 2

        # Now we try out all clusterings of the first scheme, to see if we can
        # find a better one
        while True:
            log.info("***Clustering algorithm step %d of %d***" %
                     (cur_s - 1, partnum - 1))

            # Calculate the subsets which are most similar
            # e.g. combined rank ordering of euclidean distances
            # Could combine average site-rates, q matrices, and frequencies
            scheme_name = "step_%d" % (cur_s - 1)
            clustered_scheme = neighbour.get_nearest_neighbour_scheme(
                start_scheme, scheme_name, self.cfg)

            # Now analyse that new scheme
            cur_s += 1
            self.analyse_scheme(clustered_scheme)

            # Stop when we've analysed the scheme with all subsets combined
            if len(set(clustered_scheme.subsets)) == 1:  # then it's the scheme with everything together
                break
            else:
                start_scheme = clustered_scheme

        self.cfg.progress.end()

        txt = "Best scheme using Clustering Analysis, analysed with %s" % self.cfg.model_selection
        self.cfg.reporter.write_best_scheme(txt, self.results)
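
The progress bookkeeping in these clustering examples (subset_count = 2 * partnum - 1, scheme_count = partnum) follows from the merge loop: each step fuses two subsets into one, so a fully agglomerated run analyses the partnum starting subsets plus one new subset per merge. A quick self-contained check of that arithmetic:

partnum = 4
subsets_analysed = partnum      # the starting scheme's subsets
steps = 0
remaining = partnum
while remaining > 1:
    remaining -= 1              # one merge per step
    subsets_analysed += 1       # each merge creates one new subset
    steps += 1
print(steps)                    # 3, i.e. partnum - 1 steps
print(subsets_analysed)         # 7, i.e. 2 * partnum - 1 subsets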
Example #4
    def do_analysis(self):
        log.info("Performing strict clustering analysis")

        partnum = len(the_config.user_subsets)
        subset_count = 2 * partnum - 1
        scheme_count = partnum
        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                            start_description)

        # Analyse our first scheme
        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        self.analyse_scheme(start_scheme)

        # Current scheme number
        cur_s = 2

        # Now we try out all clusterings of the first scheme, to see if we can
        # find a better one
        while True:
            log.info("***Strict clustering algorithm step %d of %d***" %
                     (cur_s - 1, partnum - 1))

            # Calculate the subsets which are most similar
            # e.g. combined rank ordering of euclidean distances
            # Could combine average site-rates, q matrices, and frequencies
            scheme_name = "step_%d" % (cur_s - 1)
            clustered_scheme = neighbour.get_nearest_neighbour_scheme(
                start_scheme, scheme_name, the_config)

            # Now analyse that new scheme
            cur_s += 1
            self.analyse_scheme(clustered_scheme)

            # Stop when we've analysed the scheme with all subsets combined...
            if len(set(clustered_scheme.subsets)) == 1:
                # ... then it's the scheme with everything together
                break
            else:
                # We keep going
                start_scheme = clustered_scheme

        the_config.progress.end()
        the_config.reporter.write_best_scheme(self.results)
Example #5
    def do_analysis(self):
        partnum = len(self.cfg.user_subsets)
        start_description = range(partnum)
        log.info("Performing subset splitting using kmeans")
        # Create the first scheme
        start_scheme = scheme.create_scheme(self.cfg, "start_scheme", start_description)
        for i in start_scheme:
            # Save the alignment path
            i.make_alignment(self.cfg, self.alignment)
            phylip_file = i.alignment_path
            print(phylip_file)
            # Add option to output likelihoods; the *raxml version takes more
            # modifying of the commands in the analyse function
            phyml.analyse("GTR", str(phylip_file), "./analysis/start_tree/filtered_source.phy_phyml_tree.txt", "unlinked", "--print_site_lnl")
            phyml_lk_file = str(phylip_file) + "_phyml_lk_GTR.txt"
            likelihood_dictionary = kmeans.phyml_likelihood_parser(phyml_lk_file)
            kmeans.kmeans(likelihood_dictionary)
        print(start_scheme)
Example #6
    def setup(self):

        if the_config.datatype != 'morphology':
            log.warning("METHOD DISCONTINUED: \
                There is increasing evidence that the kmeans \
                algorithm can lead to poor inferences, so we have \
                discontinued its use for most data types. \
                You should instead use other approaches \
                (e.g. partitioning by locus and codon position). If you have any \
                questions, please get in touch on the google group. More \
                information on the empirical issues \
                can be found in this paper: \
                http://www.sciencedirect.com/science/article/pii/S1055790316302780."
                )
            raise AnalysisError
        else:
            log.warning("USE CAUTION: \
                There is increasing evidence that the kmeans \
                algorithm can lead to poor inferences, so we have \
                discontinued its use for most data types \
                (i.e. amino acid and nucleotide data). \
                More information on the empirical issues \
                can be found in this paper: \
                http://www.sciencedirect.com/science/article/pii/S1055790316302780. \
                We have kept the method available for morphological \
                data, but warn users that the method is: experimental, \
                untested on morphological data (either empirical or \
                simulated), and may give incorrect topologies and branch \
                lengths (see link to paper above)." 
                )


        # set the default subset size to 100 for kmeans analyses
        if the_config.min_subset_size == False:
            the_config.min_subset_size = 100

        partnum = len(the_config.user_subsets)
        the_config.progress.begin(1, 1)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", start_description)

        site_max = sum([len(s.columns) for s in start_scheme.subsets])

        if the_config.min_subset_size > site_max:
            log.error("The minimum subset size must be smaller than the \
                total number of sites you want to analyse. Your minimum \
                subset size is %d, and your alignment is %d sites. Please \
                check and try again." %(the_config.min_subset_size, site_max)
                )
            raise AnalysisError


        with logtools.indented(log, "**Analysing starting scheme (scheme %s)**" % start_scheme.name):
            start_result = self.analyse_scheme(start_scheme)

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(start_scheme, start_result)

            tree_path = the_config.processor.make_tree_path(
                self.filtered_alignment_path)

        if the_config.kmeans == 'tiger' and the_config.datatype != 'morphology':
            log.error("You have selected kmeans and tiger \
                rates. This is an unsupported option for anything except \
                morphological data. The kmeans algorithm \
                now works with entropies, not TIGER rates.")
            raise AnalysisError

        return start_result, start_scheme, tree_path
Example #7
    def do_analysis(self):

        # initialisation steps
        model_selection = the_config.model_selection
        partnum = len(the_config.user_subsets)

        if the_config.cluster_max == -987654321:
            the_config.cluster_max = max([1000, (10 * len(the_config.user_subsets))])
            log.info("Set rcluster-max to %d" %the_config.cluster_max) 

        scheme_count = submodels.count_relaxed_clustering_schemes(
            partnum, the_config.cluster_percent, the_config.cluster_max)
        subset_count = submodels.count_relaxed_clustering_subsets(
            partnum, the_config.cluster_percent, the_config.cluster_max)

        log.info("PartitionFinder will have to analyse %d subsets to"
                 " complete this analyses" % subset_count)
        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(
                the_config, "start_scheme", range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)



        subsets = [s for s in start_scheme.subsets]
        partnum = len(subsets)
        step = 1
        while True:
            with logtools.indented(log, "*** Relaxed clustering algorithm step %d of up to %d ***"
                % (step, partnum - 1)):

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)
                log.info("Measuring the similarity of %d subset pairs" % max_schemes)
                d_matrix = neighbour.get_distance_matrix(subsets,
                    the_config.cluster_weights)

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = int(math.ceil(max_schemes * (the_config.cluster_percent * 0.01)))
                if cutoff <= 0:
                    cutoff = 1
                if the_config.cluster_max is not None and cutoff > the_config.cluster_max:
                    cutoff = the_config.cluster_max
                log.info("Choosing the %d most similar subset pairs" % cutoff)
                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse K subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix, subsets)
                if len(pairs_todo)>0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" %(step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                                start_scheme, scheme_name, pair, pair_merged, the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)
                best_scheme = start_scheme

                if best_change>=0:
                    log.info("Found no schemes that improve the score, stopping")
                    break

                median_improvement = np.median(c_matrix[c_matrix<0])

                while best_change <= median_improvement:

                    best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)
                    best_merged = subset_ops.merge_subsets(best_pair)
                    best_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, best_pair, best_merged, the_config)
                    start_scheme = best_scheme

                    log.info("Combining subsets: '%s' and '%s'" %(best_pair[0].name, best_pair[1].name))
                    log.debug("This improves the %s score by: %s", the_config.model_selection, str(abs(best_change)))

                    # reset_c_matrix and the subset list
                    c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

                    # we update the subset list in a way that means its structure tracks the c-matrix
                    subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

                    best_change = np.amin(c_matrix)

                    if the_config.search == 'rcluster':
                        # with rcluster we take only the single best change;
                        # rclusterf continues around this inner loop instead
                        break


                # the best change can shift by a fraction at this point,
                # because calculating the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_result = self.analyse_scheme(best_scheme)
                best_change = self.results.best_score - start_score


                log.info("The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
                    len(best_scheme.subsets),
                    the_config.model_selection,
                    np.abs(best_change),
                    self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score


                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)


                if len(set(start_scheme.subsets)) == 1:
                    break

                step += 1

        log.info("Relaxed clustering algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, model_selection,
                    self.results.best_score))

        if the_config.min_subset_size or the_config.all_states:
            best_scheme = self.clean_scheme(self.results.best_scheme)
            best_result = self.analyse_scheme(best_scheme)

            # scores after cleaning can be worse, so we reset these trackers...
            self.results.best_result = best_result
            self.results.best_score = best_result.score
            self.results.best_scheme = best_scheme
            log.info("Best scoring scheme after cleaning is scheme %s, with %s score of %.3f"
                     % (self.results.best_scheme.name, model_selection,
                        self.results.best_score))


        the_config.reporter.write_best_scheme(self.results)
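
The c_matrix handling above leans on SciPy's squareform, which expands a condensed vector of n * (n - 1) // 2 pairwise entries into a full n x n symmetric matrix with a zero diagonal; that zero diagonal is exactly why the code only acts on strictly negative entries. A minimal sketch of the round trip (the values are arbitrary):

import numpy as np
from scipy import spatial

n = 4
# Condensed form: one entry per subset pair, n * (n - 1) // 2 of them.
condensed = np.full(n * (n - 1) // 2, np.inf)
# Expand to the full symmetric matrix; the diagonal is filled with zeros.
c_matrix = spatial.distance.squareform(condensed)
print(c_matrix.shape)       # (4, 4)
print(c_matrix[0, 0])       # 0.0 on the diagonal
print(np.amin(c_matrix))    # 0.0, so "improvements" must be negative to count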
Example #8
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches'''

        partnum = len(the_config.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(
                the_config, "start_scheme", range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]

        step = 1
        while len(set(start_scheme.subsets)) > 1:
            with logtools.indented(log, "***Greedy algorithm step %d***" % step):
                name_prefix = "step_%d" % (step)

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)

                # this is a fake distance matrix, so that the greedy algorithm
                # can use all the tricks of the relaxed clustering algorithm
                dim = len(subsets)
                d_matrix = np.zeros((dim * dim - dim) // 2)
                d_matrix[:] = np.inf

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = max_schemes # this defines the greedy algorithm: we look at all schemes

                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix, subsets)
                if len(pairs_todo)>0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" %(step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                                start_scheme, scheme_name, pair, pair_merged, the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(c_matrix, sub_tuples, subsets, diffs)


                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)

                log.debug("Biggest improvement in info score: %s", str(best_change))

                if best_change>=0:
                    log.info("Found no schemes that improve the score, stopping")
                    break

                best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)

                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged, the_config)
                best_result = self.analyse_scheme(best_scheme)

                # the best change can shift by a fraction at this point,
                # because calculating the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_change = self.results.best_score - start_score


                log.info("Best scheme combines subsets: '%s' and '%s'" %(best_pair[0].name, best_pair[1].name))


                log.info("The best scheme improves the %s score by %.2f to %.1f",
                    the_config.model_selection,
                    np.abs(best_change),
                    self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score

                log.debug("Best pair: %s", str([s.name for s in best_pair]))
                log.debug("Merged into: %s", str([best_merged.name]))

                # 5. reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

                # we updated the subset list in a special way, which matches how we update the c matrix:
                subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, the_config.model_selection,
                    self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)
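
max_schemes above is simply the number of unordered subset pairs, i.e. the binomial coefficient C(n, 2) (assuming comb here is the usual binomial function, e.g. scipy.special.comb or math.comb):

from math import comb

n_subsets = 5
max_schemes = comb(n_subsets, 2)
print(max_schemes)  # 10 candidate pairs to compare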
Example #9
groups = grouped(matrix_out.tolist())
print('\nGroups:')
for i in groups.items():
	#print('\t', i[0], 'group:', i[1],  set([k for j in i[1] for k in matrix_opr[j]]))
	print("\t {0:<1d} group: {1!s:<30s} -> {2!s:<100s}".format(i[0], i[1], set([k for j in i[1] for k in matrix_opr[j]])))

specify_group1 = gks3(groups, matrix_opr)  # there is a bug in this function
refin_groups = gks3(groups, matrix_opr)  # it must be called twice before it works correctly
print('\nRefined groups:')
for i in refin_groups.items():
	#print('\t', i[0], 'group:', i[1], set([k for j in i[1] for k in matrix_opr[j]]))
	print("\t {0:<1d} group: {1!s:<35s} -> {2!s:<100s}".format(i[0], i[1], set([k for j in i[1] for k in matrix_opr[j]])))
    	
module = {}
for i in refin_groups.keys():
	module[i] = draw_graph_create_module(matrix_opr, refin_groups[i], 'group'+str(i))
print('\nModules:')
for i in module.items():
	print('\t', i[0], 'group:')
	for j in i[1].items():
		print('\t', j[0], j[1])
	print('\n')

un_mod = unique_module(module)
print('\nRefined modules:')
for i in un_mod.items():
	print('\t', i[0], ':', i[1])
	
create_scheme(matrix_opr, un_mod)
input("\nTo exit, press any key")
Example #10
    def do_analysis(self):
        # Copied and pasted from greedy analysis
        partnum = len(self.cfg.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", start_description)


        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        old_score = self.analyse_scheme(start_scheme)

        # Get first scheme
        best_scheme = start_scheme
        subset_index = 0

        processor = self.cfg.processor
        alignment_path = self.filtered_alignment_path
        tree_path = processor.make_tree_path(alignment_path)

        split_subsets = []
        for a_subset in start_scheme:
            how_many = kmeans.kmeans_wrapper(self.cfg, self.alignment,
                a_subset, tree_path)
            split_subsets += how_many
        split_scheme = scheme.Scheme(self.cfg, "split_scheme", split_subsets)
        best_result = self.analyse_scheme(best_scheme)
        split_score = self.analyse_scheme(split_scheme)
        if split_score.score < best_result.score:
            best_scheme = split_scheme
            log.info("Initial splits generated superior scheme")
        all_subsets = list(best_scheme.subsets)

        fabricated_subsets =[]
        step = 1


        while subset_index < len(all_subsets):
            log.info("Best scheme has %s score of %.2f and %d subset(s)"
                     %(self.cfg.model_selection.upper(), best_result.score, len(best_scheme.subsets)))

            log.info("***Kmeans algorithm step %d***" % step)
            step += 1

            current_subset = all_subsets[subset_index]

            log.info("Analysing subset of %d sites", len(current_subset.columns))

            # First check if the subset is large enough to split, if it isn't,
            # move to the next subset
            if len(current_subset.columns) == 1:
                log.info("This subset cannot be split further")
                subset_index += 1
                continue

            if current_subset.fabricated:
                log.info("This subset cannot be split further because %s cannot analyse it",
                        self.cfg.phylogeny_program)
                subset_index += 1
                fabricated_subsets.append(current_subset)
                continue

            split_subsets = kmeans.kmeans_split_subset(self.cfg,
                self.alignment, current_subset, tree_path)


            # kmeans_split_subset will return a 1 and flag the subset as
            # fabricated if for some reason it raises a PhylogenyProgramError,
            # this is to catch those fabricated subsets
            if split_subsets == 1:
                subset_index += 1
                fabricated_subsets.append(current_subset)
                continue

            for each_subset in split_subsets:
                log.info("Subset resulting from split is %d sites long", len(each_subset.columns))

            # Take a copy
            updated_subsets = all_subsets[:]


            # Replace the current one with the split one
            # Google "slice assignments"
            # This list is the key to avoiding recursion. It expands to contain
            # all of the split subsets by replacing them with the split ones
            updated_subsets[subset_index:subset_index+1] = split_subsets

            test_scheme = scheme.Scheme(self.cfg, str(step-1),
                updated_subsets)

            new_result = self.analyse_scheme(test_scheme)

            if new_result.score < best_result.score:
                best_scheme = test_scheme
                best_result = new_result

                # Change this to the one with split subsets in it. Note that
                # the subset_index now points a NEW subset, one that was split
                all_subsets = updated_subsets

                # record each scheme that's an improvement
                self.cfg.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

                if len(split_subsets)==2:
                    log.info("Splitting subset into %d:%d sites improved the %s score"
                              %(len(split_subsets[0].columns),
                                len(split_subsets[1].columns),
                                self.cfg.model_selection))

                    for s in split_subsets:
                        m = [x % 3 for x in s.columns]
                        l = float(len(s.columns))
                        props = [(float(m.count(1)) / l), (float(m.count(2)) / l), (float(m.count(0)) / l)]
                        log.info("%d subset has 1st, 2nd, 3rd props: %s" % (len(s.columns), str(props)))

            else:
                log.info("Splitting this subset did not improve the %s score",
                         self.cfg.model_selection.upper())
                # Move to the next subset in the all_subsets list
                subset_index += 1

        log.info("Best scheme has %s score of %.2f and %d subset(s)"
                 %(self.cfg.model_selection.upper(), best_result.score, len(best_scheme.subsets)))

        if fabricated_subsets:
            log.info("Finalising partitioning scheme")
            log.info("This involves cleaning up small subsets which %s "
                     "can't analyse", self.cfg.phylogeny_program)

        # Now join the fabricated subsets back up with other subsets
        while fabricated_subsets:
            log.info("***Kmeans algorithm step %d***" % step)
            step += 1

            # Take the first subset in the list (to be "popped" off later)
            s = fabricated_subsets[0]
            centroid = s.centroid

            best_match = None

            # Take a list copy of the best scheme
            scheme_list = list(best_scheme)
            scheme_list.remove(s)
            # Loop through the subsets in the best scheme and find the one
            # with the nearest centroid
            for sub in scheme_list:
                centroid_array = [sub.centroid, centroid]
                # euclid_dist = abs(sub.centroid[0] - centroid[0])
                warnings.simplefilter('ignore', DeprecationWarning)
                euclid_dist = spatial.distance.pdist(centroid_array)
                if best_match is None or euclid_dist < best_match:
                    best_match = euclid_dist
                    closest_sub = sub

            # Now merge those subsets
            merged_sub = subset_ops.merge_fabricated_subsets([s, closest_sub])

            # Remove the offending subset from the fabricated subset list
            fabricated_subsets.pop(0)
            # If the closest subset happens to be "fabricated" as well, take
            # it out of the fabricated_subsets list
            if closest_sub in fabricated_subsets:
                fabricated_subsets.remove(closest_sub)

            # Get rid of the two subsets that were merged from the best_scheme
            scheme_list.remove(closest_sub)

            # Now add the new subset to the scheme and see if the new subset
            # can be analyzed
            scheme_list.append(merged_sub)
            merged_scheme = scheme.Scheme(self.cfg, str(step-1), scheme_list)

            merged_result = self.analyse_scheme(merged_scheme)
            # If it can be analyzed, move the algorithm forward, if it can't
            # be analyzed add it to the list of fabricated_subsets
            for new_subs in merged_scheme:
                if new_subs.fabricated and new_subs not in fabricated_subsets:
                    fabricated_subsets.append(new_subs)
            best_scheme = merged_scheme
            best_result = merged_result

        # Since the AIC will likely be better before we dealt with the
        # fabricated subsets, we need to set the best scheme and best result
        # to those from the last merged_scheme. TODO: add a variable to scheme
        # to take care of this problem so that the best AND analysable scheme
        # is the one that gets automatically flagged as the best scheme
        self.results.best_scheme = best_scheme
        self.results.best_result = best_result

        self.cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)

        log.info("** Kmeans algorithm finished after %d steps **" % (step - 1))
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, self.cfg.model_selection, self.results.best_score))

        self.cfg.reporter.write_best_scheme(self.results)
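
The "slice assignment" trick mentioned in the comments above replaces a single list element with several elements in place, which is how all_subsets grows to hold the split subsets without any recursion. A tiny sketch:

all_subsets = ['a', 'b', 'c']
subset_index = 1
split_subsets = ['b1', 'b2']

# Replace the one element at subset_index with the pieces it split into.
all_subsets[subset_index:subset_index + 1] = split_subsets
print(all_subsets)  # ['a', 'b1', 'b2', 'c']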
Example #11
    def do_analysis(self):
        log.info("Performing greediest analysis")

        stop_at = self.cfg.greediest_percent * 0.01

        model_selection = self.cfg.model_selection
        partnum = len(self.cfg.partitions)

        scheme_count = submodels.count_greediest_schemes(partnum, self.cfg.greediest_percent)
        subset_count = submodels.count_greediest_subsets(partnum, self.cfg.greediest_percent)

        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        start_description = range(len(self.cfg.partitions))
        start_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", start_description)
        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        self.analyse_scheme(start_scheme)
        self.cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)


        # Start by remembering that we analysed the starting scheme
        subset_counter = 1
        step = 1
        while True:

            log.info("***Greediest algorithm step %d of %d***" % (step, partnum - 1))
            name_prefix = "step_%d" % (step)

            # Get a list of all possible lumpings of the best_scheme, ordered
            # according to the clustering weights
            lumped_subsets = neighbour.get_ranked_clustered_subsets(
                start_scheme, self.cfg)

            # reduce the lumped subsets list to the top greediest_percent
            cutoff = int(math.ceil(len(lumped_subsets) * stop_at))  # round up so the cutoff is never zero
            lumped_subsets = lumped_subsets[:cutoff]

            # Now analyse the lumped schemes
            lumpings_done = 0
            old_best_score = self.results.best_score

            for subset_grouping in lumped_subsets:
                scheme_name = "%s_%d" % (name_prefix, lumpings_done + 1)
                lumped_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, subset_grouping, self.cfg)

                new_result = self.analyse_scheme(lumped_scheme)
                
                log.info("Difference in %s: %.1f", self.cfg.model_selection, (new_result.score-old_best_score))
                
                lumpings_done += 1

            
            if self.results.best_score != old_best_score:
                log.info("Analysed %.1f percent of the schemes for this step. The best "
                         "scheme changed the %s score by %.1f units.", 
                         self.cfg.greediest_percent, self.cfg.model_selection,
                         (self.results.best_score - old_best_score))

                #write out the best scheme
                self.results.best_scheme.name = "step_%d" % step
                self.cfg.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)


                # Now we find out which is the best lumping we know of for this step
                start_scheme = self.results.best_scheme
            else:
                log.info("Analysed %.1f percent of the schemes for this step and found no schemes "
                         "that improve the score, stopping" , self.cfg.greediest_percent)
                break

            # We're done if it's the scheme with everything together
            if len(set(lumped_scheme.subsets)) == 1:
                break
    
            step += 1



        log.info("Greediest algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, model_selection, self.results.best_score))

        txt = "Best scheme according to Greediest algorithm, analysed with %s" % self.cfg.model_selection
        self.cfg.reporter.write_best_scheme(txt, self.results)
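
The cutoff used by the greediest and relaxed-clustering searches rounds up precisely so that a small percentage never yields an empty candidate list. A worked check of that arithmetic:

import math

lumped_count = 45        # number of candidate lumpings at this step
percent = 1.0            # analyse the top 1% of them

cutoff = int(math.ceil(lumped_count * percent * 0.01))
print(cutoff)            # 1, not 0: math.ceil rounds 0.45 up

# Slicing with the cutoff therefore keeps at least one candidate.
print(list(range(lumped_count))[:cutoff])  # [0]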
Example #12
    def make_tree(self, user_path):
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset_ops.merge_subsets(
            the_config.user_subsets)
        self.filtered_alignment = SubsetAlignment(self.alignment,
                                                  subset_with_everything)
        self.filtered_alignment_path = os.path.join(the_config.start_tree_path,
                                                    'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Check the full subset against the alignment
        subset_ops.check_against_alignment(subset_with_everything,
                                           self.alignment, the_config)

        # We start by copying the alignment
        self.alignment_path = os.path.join(the_config.start_tree_path,
                                           'source.phy')

        # Now check for the tree
        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, then use that, otherwise, create a topology
            util.clean_out_folder(the_config.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s" % user_path)
                topology_path = os.path.join(the_config.start_tree_path,
                                             'user_topology.phy')
                util.dupfile(user_path, topology_path)
                need_bl = True
            elif the_config.no_ml_tree == True:
                log.debug("didn't find tree at %s, making a new one" %
                          tree_path)
                topology_path = the_config.processor.make_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras)
                need_bl = True
            elif the_config.no_ml_tree == False:
                log.debug(
                    "didn't find tree at %s, making an ML tree with RAxML" %
                    tree_path)

                tree_scheme = scheme.create_scheme(
                    the_config, "tree_scheme",
                    range(len(the_config.user_subsets)))

                topology_path = raxml.make_ml_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras, tree_scheme, self.threads)

                # here we copy the ML tree topology so it can be used with PhyML too
                # TODO: this is a hack, and it would be better to decide on a universal
                # name for the different types of tree we might have.
                phyml_tree = os.path.join(
                    os.path.dirname(topology_path),
                    "filtered_source.phy_phyml_tree.txt")
                copyfile(topology_path, phyml_tree)

                need_bl = False

            if need_bl == True:
                # Now estimate branch lengths
                tree_path = the_config.processor.make_branch_lengths(
                    self.filtered_alignment_path, topology_path,
                    the_config.datatype, the_config.cmdline_extras)

        self.tree_path = tree_path
        log.debug("Starting tree with branch lengths is here: %s" %
                  self.tree_path)
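
The branching in make_tree reduces to a small decision table: a user-supplied topology or a quick (non-ML) topology still needs branch lengths estimated afterwards, while the RAxML ML tree already comes with them. A schematic sketch of just that decision (the function and return values are illustrative placeholders, not the real processor API):

def choose_topology(user_path, no_ml_tree):
    # Returns (topology_source, need_branch_lengths).
    if user_path:
        return 'user_topology', True    # user tree: branch lengths still needed
    if no_ml_tree:
        return 'quick_topology', True   # fast topology: branch lengths still needed
    return 'raxml_ml_tree', False       # ML tree already has branch lengths

print(choose_topology("tree.phy", False))  # ('user_topology', True)
print(choose_topology(None, True))         # ('quick_topology', True)
print(choose_topology(None, False))        # ('raxml_ml_tree', False)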
Example #13
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches'''

        partnum = len(the_config.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                                range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]

        step = 1
        while len(set(start_scheme.subsets)) > 1:
            with logtools.indented(log,
                                   "***Greedy algorithm step %d***" % step):
                name_prefix = "step_%d" % (step)

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)

                # this is a fake distance matrix, so that the greedy algorithm
                # can use all the tricks of the relaxed clustering algorithm
                dim = len(subsets)
                d_matrix = np.zeros(((dim * dim) - dim) // 2)
                d_matrix[:] = np.inf

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = max_schemes  # this defines the greedy algorithm: we look at all schemes

                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix,
                                                      subsets)
                if len(pairs_todo) > 0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" % (step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                            start_scheme, scheme_name, pair, pair_merged,
                            the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(
                        c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)

                log.debug("Biggest improvement in info score: %s",
                          str(best_change))

                if best_change >= 0:
                    log.info(
                        "Found no schemes that improve the score, stopping")
                    break

                best_pair = neighbour.get_best_pair(c_matrix, best_change,
                                                    subsets)

                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged,
                    the_config)
                best_result = self.analyse_scheme(best_scheme)

                # the best change can shift by a fraction at this point,
                # because calculating the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_change = self.results.best_score - start_score

                log.info("Best scheme combines subsets: '%s' and '%s'" %
                         (best_pair[0].name, best_pair[1].name))

                log.info(
                    "The best scheme improves the %s score by %.2f to %.1f",
                    the_config.model_selection, np.abs(best_change),
                    self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score

                log.debug("Best pair: %s", str([s.name for s in best_pair]))
                log.debug("Merged into: %s", str([best_merged.name]))

                # 5. reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair),
                                                    [best_merged], subsets)

                # we updated the subset list in a special way, which matches how we update the c matrix:
                subsets = neighbour.reset_subsets(subsets, list(best_pair),
                                                  [best_merged])

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, the_config.model_selection,
                  self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)
Example #14
    def do_analysis(self):
        log.info("Performing relaxed clustering analysis")

        stop_at = self.cfg.cluster_percent * 0.01

        model_selection = self.cfg.model_selection
        partnum = len(self.cfg.partitions)

        scheme_count = submodels.count_relaxed_clustering_schemes(
            partnum, self.cfg.cluster_percent)
        subset_count = submodels.count_relaxed_clustering_subsets(
            partnum, self.cfg.cluster_percent)

        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        start_description = range(len(self.cfg.partitions))
        start_scheme = scheme.create_scheme(self.cfg, "start_scheme",
                                            start_description)
        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        self.analyse_scheme(start_scheme)
        self.cfg.reporter.write_scheme_summary(self.results.best_scheme,
                                               self.results.best_result)

        # Start by remembering that we analysed the starting scheme
        subset_counter = 1
        step = 1
        while True:

            log.info("***Relaxed clustering algorithm step %d of %d***" %
                     (step, partnum - 1))
            name_prefix = "step_%d" % (step)

            # Get a list of all possible lumpings of the best_scheme, ordered
            # according to the clustering weights
            lumped_subsets = neighbour.get_ranked_clustered_subsets(
                start_scheme, self.cfg)

            # reduce the lumped subsets list to the top cluster_percent
            cutoff = int(math.ceil(len(lumped_subsets) *
                                   stop_at))  # round up so the cutoff is never zero
            lumped_subsets = lumped_subsets[:cutoff]

            # Now analyse the lumped schemes
            lumpings_done = 0
            old_best_score = self.results.best_score

            for subset_grouping in lumped_subsets:
                scheme_name = "%s_%d" % (name_prefix, lumpings_done + 1)
                lumped_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, subset_grouping, self.cfg)

                new_result = self.analyse_scheme(lumped_scheme)

                log.debug("Difference in %s: %.1f", self.cfg.model_selection,
                          (new_result.score - old_best_score))

                lumpings_done += 1

            if self.results.best_score != old_best_score:
                log.info(
                    "Analysed %.1f percent of the schemes for this step. The best "
                    "scheme changed the %s score by %.1f units.",
                    self.cfg.cluster_percent, self.cfg.model_selection,
                    (self.results.best_score - old_best_score))

                #write out the best scheme
                self.results.best_scheme.name = "step_%d" % step
                self.cfg.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

                # Now we find out which is the best lumping we know of for this step
                start_scheme = self.results.best_scheme
            else:
                log.info(
                    "Analysed %.1f percent of the schemes for this step and found no schemes "
                    "that improve the score, stopping",
                    self.cfg.cluster_percent)
                break

            # We're done if it's the scheme with everything together
            if len(set(lumped_scheme.subsets)) == 1:
                break

            step += 1

        log.info("Relaxed clustering algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, model_selection,
                  self.results.best_score))

        self.cfg.reporter.write_best_scheme(self.results)
Example #15
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches'''

        log.info("Performing greedy analysis")

        partnum = len(self.cfg.partitions)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme
        start_description = range(len(self.cfg.partitions))
        start_scheme = scheme.create_scheme(self.cfg, "start_scheme",
                                            start_description)

        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        self.analyse_scheme(start_scheme)

        step = 1
        cur_s = 2

        # Now we try out all lumpings of the current scheme, to see if we can
        # find a better one and if we do, we just keep going
        while True:
            log.info("***Greedy algorithm step %d***" % step)

            # Get a list of all possible lumpings of the best_scheme
            lumpings = algorithm.lumpings(start_description)
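            # (each lumping is the current scheme with one pair of its
            # subsets merged)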

            # Save the current best score we have in results
            old_best_score = self.results.best_score
            for lumped_description in lumpings:
                lumped_scheme = scheme.create_scheme(self.cfg, cur_s,
                                                     lumped_description)
                cur_s += 1
                # This is just checking to see if a scheme is any good, if it
                # is, we remember and write it later
                self.analyse_scheme(lumped_scheme)

            # Did our best score change? (It ONLY gets better -- see
            # results.py)
            if self.results.best_score == old_best_score:
                # It didn't, so we're done
                break

            # Let's look further. We use the description from our best scheme
            # (which will be the one that just changed in the last lumpings
            # iteration)
            start_description = self.results.best_result.scheme.description

            # Rename and record the best scheme for this step
            self.results.best_scheme.name = "step_%d" % step
            self.cfg.reporter.write_scheme_summary(self.results.best_scheme,
                                                   self.results.best_result)

            # If it's the scheme with everything equal, quit
            if len(set(start_description)) == 1:
                break

            # Go do the next round...
            step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Highest scoring scheme is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, self.cfg.model_selection,
                  self.results.best_score))

        self.cfg.reporter.write_best_scheme(self.results)
Example No. 16
    def do_analysis(self):
        # Copied and pasted from greedy analysis
        partnum = len(self.cfg.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", start_description)


        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        old_score = self.analyse_scheme(start_scheme)

        # Get first scheme
        best_scheme = start_scheme
        subset_index = 0
        all_subsets = list(best_scheme.subsets)
        processor = self.cfg.processor
        alignment_path = self.filtered_alignment_path
        tree_path = processor.make_tree_path(alignment_path)


        while subset_index < len(all_subsets):
            current_subset = all_subsets[subset_index]
            split_subsets = kmeans.kmeans_split_subset(
                self.cfg, self.alignment, current_subset, tree_path)

            if split_subsets == 1:
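                # kmeans_split_subset appears to signal a failed split by
                # returning 1 rather than a list of new subsets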
                subset_index += 1

            else:
                # Take a copy
                updated_subsets = all_subsets[:]

                # Replace the current subset with its split pieces using a
                # slice assignment. Expanding the list in place like this is
                # the key to avoiding recursion: every new subset is itself
                # revisited and may be split again.
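                # e.g. (hypothetical) subs = [a, b, c]; subs[1:2] = [x, y]
                # leaves subs == [a, x, y, c]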
                updated_subsets[subset_index:subset_index+1] = split_subsets

                test_scheme = scheme.Scheme(self.cfg, "Current Scheme", updated_subsets)

                try:
                    best_result = self.analyse_scheme(best_scheme)
                    new_result = self.analyse_scheme(test_scheme)

                    log.info("Current best score is: " + str(best_result))
                    log.info("Current new score is: " + str(new_result))
                    if new_result.score < best_result.score:
                        log.info("New score " + str(subset_index) + " is better and will be set to best score")
                        best_scheme = test_scheme

                        # Switch to the list with the split subsets in it.
                        # Note that subset_index now points at a NEW subset,
                        # one produced by the split
                        all_subsets = updated_subsets
                    else:
                        # Move to the next subset in the all_subsets list
                        subset_index += 1

                # PhyML or RAxML can fail here, most likely because a subset
                # has no alignment patterns; catch that and move to the next
                # subset without splitting.
                except PhylogenyProgramError:
                    log.info("Phylogeny program generated an error so this subset was not split, see error above")
                    subset_index += 1
        # Now start the greedy analysis. TODO: work out how to make it step
        # through more than one scheme...

        start_scheme = best_scheme
        partnum = len(start_scheme.subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)
        self.cfg.progress.begin(scheme_count, subset_count)
        start_description = range(partnum)

        step = 1
        cur_s = 2

        # Now we try out all lumpings of the current scheme, to see if we can
        # find a better one and if we do, we just keep going
        while True:
            log.info("***Greedy algorithm step %d***" % step)

            old_best_score = self.results.best_score

            # Get an iterable of all possible pairs of subsets in best_scheme
            lumped_subsets = itertools.combinations(start_scheme.subsets, 2)

            for subset_grouping in lumped_subsets:
                scheme_name = cur_s
                lumped_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, subset_grouping, self.cfg)

                new_result = self.analyse_scheme(lumped_scheme)

                log.debug("Difference in %s: %.1f",
                          self.cfg.model_selection,
                          (new_result.score-old_best_score))

                cur_s += 1

            if self.results.best_score != old_best_score:
                log.info("Analysed all schemes for this step. The best "
                         "scheme changed the %s score by %.1f units.",
                         self.cfg.model_selection,
                         (self.results.best_score - old_best_score))

                self.results.best_scheme.name = "step_%d" % step
                self.cfg.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

                # Continue the search from the best scheme found at this step
                start_scheme = self.results.best_scheme
            else:
                log.info("Analysed all schemes for this step and found no schemes "
                         "that improve the score, stopping")
                break

            # We're done if it's the scheme with everything together
            if len(set(lumped_scheme.subsets)) == 1:
                break

            step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, self.cfg.model_selection, self.results.best_score))

        self.cfg.reporter.write_best_scheme(self.results)
Example No. 17
    def do_analysis(self):
        bic_score_list = []
        # Copied and pasted from greedy analysis
        partnum = len(self.cfg.user_subsets)
        self.cfg.progress.begin(1, 1)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", start_description)


        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        old_score = self.analyse_scheme(start_scheme)


        # Get first scheme
        best_scheme = start_scheme
        subset_index = 0
        all_subsets = list(best_scheme.subsets)
        original_subset = all_subsets[0]

        processor = self.cfg.processor
        alignment_path = self.filtered_alignment_path
        tree_path = processor.make_tree_path(alignment_path)
        best_result = self.analyse_scheme(best_scheme)
        bic_score_list.append(best_result.score)
        log.info("Starting score is %s" % best_result.score)
        fabricated_subsets = []
        step = 1
        num_ks = 2
        last_ks = 0

        # Grab the per-site statistics (CIs) from the pre-created file
        per_site_stat_list = processor.get_CIs(self.cfg)
        original_subset.site_lnls_GTRG = per_site_stat_list

        cap = False
        high_ks = 0

        while num_ks > high_ks:
            log.info("*** K-means bisection search step %i ***" % step)
            step += 1

            # Use k-means to split the alignment into num_ks subsets and
            # create a new scheme; if that improves the overall score, we
            # move on to the next step
            new_subs = kmeans.kmeans_var_ks(self.cfg, original_subset, num_ks,
                                            per_site_stat_list)
            new_scheme = scheme.Scheme(self.cfg, str(step - 1), new_subs)
            new_result = self.analyse_scheme(new_scheme)

            # Bisection search: while the score improves, double num_ks.
            # When the BIC score stops improving, set num_ks to the midpoint
            # between the last value that improved the BIC and the current
            # value
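            # e.g. (hypothetical) ks: 2 -> 4 -> 8; if 8 fails, cap = 8 and
            # the next try is (4 + 8) / 2 = 6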
            if new_result.score < best_result.score:
                log.info("Score improved with %s k's" % num_ks)
                log.info("New score is: %d" % new_result.score)
                best_result = new_result
                best_scheme = new_scheme
                last_ks = num_ks
                high_ks = num_ks
                if cap:
                    num_ks = (new_cap + num_ks)/2
                else:
                    num_ks = (num_ks*2)

            else:
                log.info("Didn't get better with %s k's, bisecting..." % num_ks)
                cap = True
                new_cap = num_ks
                num_ks = (last_ks + num_ks)/2

        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, self.cfg.model_selection, self.results.best_score))

        self.cfg.reporter.write_best_scheme(self.results)
Example No. 18
    def do_analysis(self):

        # initialisation steps
        model_selection = the_config.model_selection
        partnum = len(the_config.user_subsets)

        scheme_count = submodels.count_relaxed_clustering_schemes(
            partnum, the_config.cluster_percent, the_config.cluster_max)
        subset_count = submodels.count_relaxed_clustering_subsets(
            partnum, the_config.cluster_percent, the_config.cluster_max)

        log.info("PartitionFinder will have to analyse %d subsets to"
                 " complete this analyses" % subset_count)
        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                                range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]
        partnum = len(subsets)
        step = 1
        while True:
            with logtools.indented(
                    log,
                    "*** Relaxed clustering algorithm step %d of up to %d ***"
                    % (step, partnum - 1)):

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)
                log.info("Measuring the similarity of %d subset pairs" %
                         max_schemes)
                d_matrix = neighbour.get_distance_matrix(
                    subsets, the_config.cluster_weights)

                if step == 1:
                    # Initialise the change-in-info-score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)
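                    # d_matrix is in condensed (vector) form here, so
                    # squareform() expands the inf-filled c_matrix into a
                    # full square matrix; the diagonal it fills in is zero
                    # (which matters in step 4 below)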

                # 1. pick top N subset pairs from distance matrix
                cutoff = int(
                    math.ceil(max_schemes *
                              (the_config.cluster_percent * 0.01)))
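                # e.g. (hypothetical) 45 pairs with cluster_percent=10
                # give cutoff = ceil(45 * 0.1) = 5 pairs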
                if cutoff <= 0:
                    cutoff = 1
                if (the_config.cluster_max is not None
                        and cutoff > the_config.cluster_max):
                    cutoff = the_config.cluster_max
                log.info("Choosing the %d most similar subset pairs" % cutoff)
                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse K subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix,
                                                      subsets)
                if len(pairs_todo) > 0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" % (step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                            start_scheme, scheme_name, pair, pair_merged,
                            the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(
                        c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets and build a scheme from
                # it. Note that this matrix includes the diagonal, which is
                # all zeros since that is equivalent to comparing a scheme
                # to itself, so we must only proceed on a negative change,
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)
                best_scheme = start_scheme

                if best_change >= 0:
                    log.info(
                        "Found no schemes that improve the score, stopping")
                    break

                median_improvement = np.median(c_matrix[c_matrix < 0])

                while best_change <= median_improvement:
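                    # keep merging while the current best change is at least
                    # as good as the median of the improving changes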

                    best_pair = neighbour.get_best_pair(
                        c_matrix, best_change, subsets)
                    best_merged = subset_ops.merge_subsets(best_pair)
                    best_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, best_pair, best_merged,
                        the_config)
                    start_scheme = best_scheme

                    log.info("Combining subsets: '%s' and '%s'" %
                             (best_pair[0].name, best_pair[1].name))
                    log.info("This improves the %s score by: %s",
                             the_config.model_selection, str(abs(best_change)))

                    # reset the c-matrix and the subset list
                    c_matrix = neighbour.reset_c_matrix(
                        c_matrix, list(best_pair), [best_merged], subsets)

                    # update the subset list so its structure stays in sync with the c-matrix
                    subsets = neighbour.reset_subsets(subsets, list(best_pair),
                                                      [best_merged])

                    best_change = np.amin(c_matrix)

                    if the_config.search == 'rcluster':
                        # rcluster takes just the single best change per
                        # step; rclusterf continues around this loop instead
                        break

                # The best change can shift by a fraction at this point,
                # because calculating the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_result = self.analyse_scheme(best_scheme)
                best_change = self.results.best_score - start_score

                log.info(
                    "The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
                    len(best_scheme.subsets), the_config.model_selection,
                    np.abs(best_change), self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                if len(set(start_scheme.subsets)) == 1:
                    break

                step += 1

        log.info("Relaxed clustering algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, model_selection,
                  self.results.best_score))

        if the_config.min_subset_size or the_config.all_states:
            best_scheme = self.clean_scheme(self.results.best_scheme)
            best_result = self.analyse_scheme(best_scheme)

            # scores after cleaning can be worse, so we reset these trackers...
            self.results.best_result = best_result
            self.results.best_score = best_result.score
            self.results.best_scheme = best_scheme
            log.info(
                "Best scoring scheme after cleaning is scheme %s, with %s score of %.3f"
                % (self.results.best_scheme.name, model_selection,
                   self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)
    def setup(self):

        # set the default subset size to 100 for kmeans analyses
        if not the_config.min_subset_size:
            the_config.min_subset_size = 100

        partnum = len(the_config.user_subsets)
        the_config.progress.begin(1, 1)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                            start_description)

        if len(start_scheme.subsets) > 1:
            log.error("The k-means algorithm is designed to analyse \
                the entire alignment at once. To use it, please define a \
                single data block that includes all of your sites, and try \
                again.")
            raise AnalysisError

        site_max = sum([len(s.columns) for s in start_scheme.subsets])

        if the_config.min_subset_size > site_max:
            log.error("The minimum subset size must be smaller than the \
                total number of sites you want to analyse. Your minimum \
                subset size is %d, and your alignment is %d sites. Please \
                check and try again." % (the_config.min_subset_size, site_max))
            raise AnalysisError

        with logtools.indented(
                log, "**Analysing starting scheme (scheme %s)**" %
                start_scheme.name):
            start_result = self.analyse_scheme(start_scheme)

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    start_scheme, start_result)

            tree_path = the_config.processor.make_tree_path(
                self.filtered_alignment_path)

        if the_config.kmeans == 'tiger':
            try:
                from _tiger import TigerDNA
                the_config.TigerDNA = TigerDNA
            except ImportError:
                log.error("Couldn't find compiled tiger code.")
                log.error("You have selected kmeans and tiger \
                    rates. This is an unsupported option, if you still wish to use \
                    this option, you must compile the tiger code.")
                log.error(
                    "Once you compile the tiger code, this option will work. \
                    But please note that this is an \
                    unsupported option. For empirical work we recommend using \
                    entropy calculations for site rates, which is the default \
                    behaviour for the kmeans algorithm in PF2.")
                raise AnalysisError
        else:
            the_config.TigerDNA = None

        return start_result, start_scheme, tree_path
Example No. 20
    def setup(self):

        log.warning(
            "Warning as of April 2016: We have noticed that the kmeans \
            algorithm does not perform well on some simulated datasets. \
            We are working on investigating and addressing this, \
            but in the meantime we suggest being very cautious about using \
            this algorithm. At the very least, you should try other approaches \
            (e.g. partitioning by locus), and investigate your answers carefully \
            (both the trees and the partitioning schemes). If you have any \
            questions, please get in touch on the Google group.")

        # set the default subset size to 100 for kmeans analyses
        if not the_config.min_subset_size:
            the_config.min_subset_size = 100

        partnum = len(the_config.user_subsets)
        the_config.progress.begin(1, 1)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                            start_description)

        if len(start_scheme.subsets) > 1:
            log.error("The k-means algorithm is designed to analyse \
                the entire alignment at once. To use it, please define a \
                single data block that includes all of your sites, and try \
                again.")
            raise AnalysisError

        site_max = sum([len(s.columns) for s in start_scheme.subsets])

        if the_config.min_subset_size > site_max:
            log.error("The minimum subset size must be smaller than the \
                total number of sites you want to analyse. Your minimum \
                subset size is %d, and your alignment is %d sites. Please \
                check and try again." % (the_config.min_subset_size, site_max))
            raise AnalysisError

        with logtools.indented(
                log, "**Analysing starting scheme (scheme %s)**" %
                start_scheme.name):
            start_result = self.analyse_scheme(start_scheme)

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    start_scheme, start_result)

            tree_path = the_config.processor.make_tree_path(
                self.filtered_alignment_path)

        if the_config.kmeans == 'tiger' and the_config.datatype != 'morphology':
            try:
                from _tiger import TigerDNA
                the_config.TigerDNA = TigerDNA
            except ImportError:
                log.error("Couldn't find compiled tiger code.")
                log.error("You have selected kmeans and tiger \
                    rates. This is an unsupported option, if you still wish to use \
                    this option, you must compile the tiger code.")
                log.error(
                    "Once you compile the tiger code, this option will work. \
                    But please note that this is an \
                    unsupported option. For empirical work we recommend using \
                    entropy calculations for site rates, which is the default \
                    behaviour for the kmeans algorithm in PF2.")
                raise AnalysisError
        else:
            the_config.TigerDNA = None

        return start_result, start_scheme, tree_path
Example No. 21
    def do_analysis(self):
        """A greedy algorithm for heuristic partitioning searches"""
        log.info("Performing greedy analysis")
        models = self.cfg.models
        model_selection = self.cfg.model_selection
        partnum = len(self.cfg.partitions)

        self.total_scheme_num = submodels.count_greedy_schemes(partnum)
        log.info("This will result in a maximum of %s schemes being created", self.total_scheme_num)

        self.total_subset_num = submodels.count_greedy_subsets(partnum)
        log.info(
            "PartitionFinder will have to analyse a maximum of %d subsets of sites to complete this analysis"
            % (self.total_subset_num)
        )

        if self.total_subset_num > 10000:
            log.warning("%d is a lot of subsets, this might take a long time to analyse", self.total_subset_num)
            log.warning("Perhaps consider using a different search scheme instead (see Manual)")

        # clear any schemes that are currently loaded
        # TODO Not sure we need this...
        self.cfg.schemes.clear_schemes()

        # start with the most partitioned scheme
        start_description = range(len(self.cfg.partitions))
        start_scheme = scheme.create_scheme(self.cfg, 1, start_description)
        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        result = self.analyse_scheme(start_scheme, models)

        def get_score(my_result):
            # TODO: this is bad. Should use self.cfg.model_selection, or write
            # a new model_selection for scheme.py
            if model_selection == "aic":
                score = my_result.aic
            elif model_selection == "aicc":
                score = my_result.aicc
            elif model_selection == "bic":
                score = my_result.bic
            else:
                log.error("Unrecognised model_selection variable '%s', please check" % (score))
                raise AnalysisError
            return score

        best_result = result
        best_score = get_score(result)

        step = 1
        cur_s = 2

        # now we try out all lumpings of the current scheme, to see if we can find a better one
        # and if we do, we just keep going
        while True:
            log.info("***Greedy algorithm step %d***" % step)

            # get a list of all possible lumpings of the best_scheme
            lumpings = algorithm.lumpings(start_description)

            # we reset the counters as we go, for better user information
            self.total_scheme_num = len(lumpings)
            self.schemes_analysed = 0

            best_lumping_score = None
            for lumped_description in lumpings:
                lumped_scheme = scheme.create_scheme(self.cfg, cur_s, lumped_description)
                cur_s += 1
                result = self.analyse_scheme(lumped_scheme, models)
                new_score = get_score(result)

                if best_lumping_score is None or new_score < best_lumping_score:
                    best_lumping_score = new_score
                    best_lumping_result = result
                    best_lumping_scheme = lumped_scheme
                    best_lumping_desc = lumped_description

            if best_lumping_score < best_score:
                best_scheme = best_lumping_scheme
                best_score = best_lumping_score
                best_result = best_lumping_result
                start_description = best_lumping_desc
                if len(set(best_lumping_desc)) == 1:  # then it's the scheme with everything equal, so quit
                    break
                step += 1

            else:
                break

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info(
            "Highest scoring scheme is scheme %s, with %s score of %.3f"
            % (best_result.scheme.name, model_selection, best_score)
        )

        self.best_result = best_result
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches'''

        log.info("Performing greedy analysis")

        partnum = len(self.cfg.partitions)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme
        start_description = range(len(self.cfg.partitions))
        start_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", start_description)

        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        self.analyse_scheme(start_scheme)

        step = 1
        cur_s = 2

        # Now we try out all lumpings of the current scheme, to see if we can
        # find a better one and if we do, we just keep going
        while True:
            log.info("***Greedy algorithm step %d***" % step)

            # Get a list of all possible lumpings of the best_scheme
            lumpings = algorithm.lumpings(start_description)

            # Save the current best score we have in results
            old_best_score = self.results.best_score
            for lumped_description in lumpings:
                lumped_scheme = scheme.create_scheme(self.cfg, cur_s, lumped_description)
                cur_s += 1
                # This is just checking to see if a scheme is any good, if it
                # is, we remember and write it later
                self.analyse_scheme(lumped_scheme)

            # Did our best score change? (It ONLY gets better -- see
            # results.py)
            if self.results.best_score == old_best_score:
                # It didn't, so we're done
                break

            # Let's look further. We use the description from our best scheme
            # (which will be the one that just changed in the last lumpings
            # iteration)
            start_description = self.results.best_result.scheme.description

            # Rename and record the best scheme for this step
            self.results.best_scheme.name = "step_%d" % step
            self.cfg.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

            # If it's the scheme with everything equal, quit
            if len(set(start_description)) == 1:
                break

            # Go do the next round...
            step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Highest scoring scheme is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, self.cfg.model_selection,
                  self.results.best_score))

        txt = "Best scheme according to Greedy algorithm, analysed with %s" % self.cfg.model_selection
        self.cfg.reporter.write_best_scheme(txt, self.results)
    def setup(self):

        # set the default subset size to 100 for kmeans analyses
        if not the_config.min_subset_size:
            the_config.min_subset_size = 100

        partnum = len(the_config.user_subsets)
        the_config.progress.begin(1, 1)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", start_description)

        if len(start_scheme.subsets) > 1:
            log.error("The k-means algorithm is designed to analyse \
                the entire alignment at once. To use it, please define a \
                single data block that includes all of your sites, and try \
                again.")
            raise AnalysisError

        site_max = sum([len(s.columns) for s in start_scheme.subsets])

        if the_config.min_subset_size > site_max:
            log.error("The minimum subset size must be smaller than the \
                total number of sites you want to analyse. Your minimum \
                subset size is %d, and your alignment is %d sites. Please \
                check and try again." % (the_config.min_subset_size, site_max))
            raise AnalysisError

        with logtools.indented(log, "**Analysing starting scheme (scheme %s)**" % start_scheme.name):
            start_result = self.analyse_scheme(start_scheme)

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(start_scheme, start_result)

            tree_path = the_config.processor.make_tree_path(
                self.filtered_alignment_path)

        if the_config.kmeans == 'tiger':
            try:
                from _tiger import TigerDNA
                the_config.TigerDNA = TigerDNA
            except ImportError:
                log.error("Couldn't find compiled tiger code.")
                log.error("You have selected kmeans and tiger \
                    rates. This is an unsupported option; if you still wish \
                    to use it, you must compile the tiger code.")
                log.error("Once you compile the tiger code, this option will work. \
                    But please note that this is an \
                    unsupported option. For empirical work we recommend using \
                    entropy calculations for site rates, which is the default \
                    behaviour for the kmeans algorithm in PF2.")
                raise AnalysisError
        else:
            the_config.TigerDNA = None


        return start_result, start_scheme, tree_path
Example No. 24
    def make_tree(self, user_path):
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset_ops.merge_subsets(the_config.user_subsets)
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            the_config.start_tree_path, 'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Check the full subset against the alignment
        subset_ops.check_against_alignment(subset_with_everything, self.alignment, the_config)

        # We start by copying the alignment
        self.alignment_path = os.path.join(
            the_config.start_tree_path, 'source.phy')

        # Now check for the tree
        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, use that; otherwise create a topology
            util.clean_out_folder(the_config.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s" % user_path)
                topology_path = os.path.join(the_config.start_tree_path, 'user_topology.phy')
                util.dupfile(user_path, topology_path)
                need_bl = True
            elif the_config.no_ml_tree:
                log.debug(
                    "didn't find tree at %s, making a new one" % tree_path)
                topology_path = the_config.processor.make_topology(
                    self.filtered_alignment_path, the_config.datatype, the_config.cmdline_extras)
                need_bl = True
            else:
                log.debug(
                    "didn't find tree at %s, making an ML tree with RAxML" % tree_path)

                tree_scheme = scheme.create_scheme(
                    the_config, "tree_scheme", range(len(the_config.user_subsets)))

                topology_path = raxml.make_ml_topology(
                    self.filtered_alignment_path, the_config.datatype, the_config.cmdline_extras, tree_scheme, self.threads)
                
                # here we copy the ML tree topology so it can be used with PhyML too
                # TODO: this is a hack, and it would be better to decide on a universal
                # name for the different types of tree we might have.
                phyml_tree = os.path.join(os.path.dirname(topology_path), "filtered_source.phy_phyml_tree.txt")
                copyfile(topology_path, phyml_tree)

                need_bl = False

            if need_bl:
                # Now estimate branch lengths
                tree_path = the_config.processor.make_branch_lengths(
                    self.filtered_alignment_path,
                    topology_path,
                    the_config.datatype,
                    the_config.cmdline_extras)

        self.tree_path = tree_path
        log.debug("Starting tree with branch lengths is here: %s" %
                 self.tree_path)
Example No. 25
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches'''

        log.info("Performing greedy analysis")

        partnum = len(self.cfg.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", start_description)

        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        self.analyse_scheme(start_scheme)

        step = 1
        cur_s = 2

        # Now we try out all lumpings of the current scheme, to see if we can
        # find a better one and if we do, we just keep going
        while True:
            log.info("***Greedy algorithm step %d***" % step)

            old_best_score = self.results.best_score

            # Get an iterable of all possible pairs of subsets in best_scheme
            lumped_subsets = itertools.combinations(start_scheme.subsets, 2)

            for subset_grouping in lumped_subsets:
                scheme_name = cur_s
                lumped_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, subset_grouping, self.cfg)

                new_result = self.analyse_scheme(lumped_scheme)

                log.debug("Difference in %s: %.1f",
                          self.cfg.model_selection,
                          (new_result.score-old_best_score))

                cur_s += 1

            if self.results.best_score != old_best_score:
                log.info("Analysed all schemes for this step. The best "
                         "scheme changed the %s score by %.1f units.",
                         self.cfg.model_selection,
                         (self.results.best_score - old_best_score))

                self.results.best_scheme.name = "step_%d" % step
                self.cfg.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

                # Continue the search from the best scheme found at this step
                start_scheme = self.results.best_scheme
            else:
                log.info("Analysed all schemes for this step and found no schemes "
                         "that improve the score, stopping")
                break

            # We're done if it's the scheme with everything together
            if len(set(lumped_scheme.subsets)) == 1:
                break

            step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, self.cfg.model_selection, self.results.best_score))

        self.cfg.reporter.write_best_scheme(self.results)