Example #1
def create_scheme(cfg, scheme_name, scheme_description):
    """
    Generate a single scheme from a list of group assignments, one per
    user subset, e.g. [0,1,2,3,4,5,6,7] puts every subset in its own group
    """

    subset_count = len(cfg.user_subsets)

    # Check that the correct number of items are in the list
    if len(scheme_description) != subset_count:
        log.error("There's a problem with the description of scheme %s" %
                  scheme_name)
        raise SchemeError

    # Now generate the pattern
    subs = {}
    # We use the numbers returned to group the different subsets
    for sub_index, grouping in enumerate(scheme_description):
        insub = subs.setdefault(grouping, [])
        insub.append(sub_index)

    # We now have what we need to create the subsets. Each entry holds the
    # indices of the user subsets that should be merged together
    created_subsets = []
    for sub_indexes in subs.values():
        subs_to_merge = [cfg.user_subsets[i] for i in sub_indexes]
        sub = subset_ops.merge_subsets(subs_to_merge)
        created_subsets.append(sub)

    return Scheme(cfg, str(scheme_name), created_subsets,
                  description=scheme_description)
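
The grouping idiom above is easy to test in isolation. A minimal sketch of
just that step, in plain Python with no PartitionFinder imports (the dict of
groups is what the subs variable holds before merging):

def group_by_assignment(description):
    """[0, 0, 1, 2] -> {0: [0, 1], 1: [2], 2: [3]}"""
    groups = {}
    for index, assignment in enumerate(description):
        groups.setdefault(assignment, []).append(index)
    return groups

print(group_by_assignment([0, 0, 1, 2]))
# {0: [0, 1], 1: [2], 2: [3]}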
Example #2
def generate_all_schemes(cfg):
    """
    Convert each abstract scheme description produced by the algorithm
    into a scheme of merged subsets
    """

    log.info("Generating all possible schemes for the partitions...")

    subset_count = len(cfg.user_subsets)

    # Now generate the pattern for this many partitions
    all_schemes = submodels.get_submodels(subset_count)
    scheme_name = 1
    scheme_list = []
    for scheme in all_schemes:
        subs = {}
        # We use the numbers returned to group the different subsets
        for sub_index, grouping in enumerate(scheme):
            insub = subs.setdefault(grouping, [])
            insub.append(sub_index)
        # We now have what we need to create the subsets. Each entry holds the
        # indices of the user subsets that should be merged together
        created_subsets = []
        for sub_indexes in subs.values():
            sub = subset_ops.merge_subsets(
                [cfg.user_subsets[i] for i in sub_indexes])
            created_subsets.append(sub)

        scheme_list.append(Scheme(cfg, str(scheme_name), created_subsets))

        log.debug("Created scheme %d of %d" % (scheme_name, len(all_schemes)))

        scheme_name += 1

    return scheme_list
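
Each scheme description here is a set partition of the user subsets, so the
number of schemes get_submodels returns should be the Bell number of the
subset count. A self-contained sketch of what such an enumerator might look
like (an assumption: the real get_submodels may use a different encoding):

def all_clusterings(n):
    """Return every clustering of n items as a list in which position i
    holds the group number of item i (a restricted growth string)."""
    results = [[]]
    for _ in range(n):
        extended = []
        for prefix in results:
            next_new_group = (max(prefix) + 1) if prefix else 0
            for g in range(next_new_group + 1):
                extended.append(prefix + [g])
        results = extended
    return results

print(len(all_clusterings(4)))  # 15, the Bell number B(4)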
Example #3
    def clean_scheme(self, start_scheme):
        # Here we look for and fix up subsets that are too small or don't have all states
        keep_going = 1
        merges = 0
        if keep_going == 1:
            with logtools.indented(
                    log,
                    "*** Checking subsets from scheme '%s' meet --min-subset-size and --all_states settings ***"
                    % start_scheme.name):
                while keep_going > 0:

                    subsets = [s for s in start_scheme.subsets]

                    # sort the subsets largest-first (1/len is smallest for the
                    # biggest subset), to keep results consistent over re-runs
                    subsets.sort(key=lambda x: 1.0 / float(len(x.columns)))

                    # run through all subsets
                    found = 0
                    for i, sub in enumerate(subsets):
                        state_problems = self.alignment.check_state_probs(
                            sub, the_config)

                        if (len(sub.columns) < the_config.min_subset_size
                                or state_problems):

                            # merge that subset with nearest neighbour
                            new_pair = neighbour.get_closest_subset(
                                sub, subsets, the_config)

                            log.info(
                                "Subset '%s' will be merged with subset '%s'" %
                                (new_pair[0].name, new_pair[1].name))
                            new_pair_merged = subset_ops.merge_subsets(
                                new_pair)
                            start_scheme = neighbour.make_clustered_scheme(
                                start_scheme, "cleaned_scheme", new_pair,
                                new_pair_merged, the_config)
                            the_config.progress.begin(1, 1)
                            self.analyse_scheme(start_scheme)
                            subsets = [s for s in start_scheme.subsets]
                            merges = merges + 1
                            found = 1
                            break

                    # if we got to here, there were no subsets to merge
                    if found == 0:
                        keep_going = 0

                    if len(subsets) == 1:
                        log.error(
                            "The settings you have used for --all-states and/or --min-subset-size mean that all of your subsets have been merged into one prior to any analysis. Thus, no analysis is necessary. Please check and try again"
                        )
                        raise AnalysisError

                log.info(
                    "%d subsets merged because of --min-subset-size and/or --all-states settings"
                    % merges)
        return start_scheme
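
The control flow above is a fix-point loop: scan for the first offending
subset, merge it with a neighbour, and rescan until nothing is left to fix.
A stripped-down sketch of the same pattern on bare site counts (MIN_SIZE and
the merge-into-largest rule are illustrative stand-ins for the real
--min-subset-size and nearest-neighbour logic):

MIN_SIZE = 100

def merge_undersized(sizes):
    """Repeatedly fold the first undersized group into the largest one."""
    sizes = sorted(sizes, reverse=True)
    while len(sizes) > 1:
        offender = next((s for s in sizes if s < MIN_SIZE), None)
        if offender is None:
            break
        sizes.remove(offender)
        sizes[0] += offender
        sizes.sort(reverse=True)
    return sizes

print(merge_undersized([500, 80, 40, 300]))  # [620, 300]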
Example #4
    def define_subset_grouping(self, text, loc, subset_def):
        """These define initial groupings that users think are useful
        """
        try:
            # Get the partitions from the names
            subsets = [self.cfg.user_subsets_by_name[nm] for nm in subset_def[0]]

            # Keep a running list of these till we define the schema below
            self.current_subsets.append(subset_ops.merge_subsets(subsets))
        except subset.SubsetError:
            raise ParserError(text, loc, "Error creating subset...")
Example #5
def make_clustered_scheme(start_scheme, scheme_name, subsets_to_cluster, cfg):

    # 1. Create a new subset that merges the subsets_to_cluster
    merged_sub = subset_ops.merge_subsets(subsets_to_cluster)

    # 2. Then we define a new scheme with those merged subsets
    new_subsets = start_scheme.subsets - set(subsets_to_cluster)
    new_subsets.add(merged_sub)

    # 3. Create the clustered scheme
    final_scheme = scheme.Scheme(cfg, str(scheme_name), new_subsets)

    return final_scheme
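
The scheme update in steps 1 and 2 is plain set surgery: drop the clustered
subsets, add their merger. With frozensets of column indices standing in for
Subset objects and set union standing in for merge_subsets (both illustrative
assumptions):

scheme = {frozenset([0, 1]), frozenset([2]), frozenset([3, 4])}
to_cluster = {frozenset([0, 1]), frozenset([2])}

merged = frozenset().union(*to_cluster)        # the merge_subsets step
new_scheme = (scheme - to_cluster) | {merged}  # drop the pair, add the merger

print(sorted(map(sorted, new_scheme)))         # [[0, 1, 2], [3, 4]]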
Example #6
    def make_tree(self, user_path):
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset_ops.merge_subsets(self.cfg.user_subsets)
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            self.cfg.start_tree_path,  'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Now we've written this alignment, we need to lock everything in
        # place, no more adding partitions, or changing them from now on.
        # TODO: This checking should still be done...
        # self.cfg.partitions.check_against_alignment(self.alignment)
        # self.cfg.partitions.finalise()

        # We start by copying the alignment
        self.alignment_path = os.path.join(
            self.cfg.start_tree_path, 'source.phy')

        # Now check for the tree
        tree_path = self.cfg.processor.make_tree_path(
            self.filtered_alignment_path)

        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, then use that, otherwise, create a topology
            util.clean_out_folder(self.cfg.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s", user_path)
                topology_path = os.path.join(self.cfg.start_tree_path, 'user_topology.phy')
                self.cfg.processor.dupfile(user_path, topology_path)
            else:
                log.debug(
                    "didn't find tree at %s, making a new one" % tree_path)
                topology_path = self.cfg.processor.make_topology(
                    self.filtered_alignment_path, self.cfg.datatype, self.cfg.cmdline_extras)

            # Now estimate branch lengths
            tree_path = self.cfg.processor.make_branch_lengths(
                self.filtered_alignment_path,
                topology_path,
                self.cfg.datatype,
                self.cfg.cmdline_extras)

        self.tree_path = tree_path
        log.info("Starting tree with branch lengths is here: %s", self.tree_path)
Example #7
def model_to_scheme(model, scheme_name, cfg):
    """Turn a model definition e.g. [0, 1, 2, 3, 4] into a scheme"""
    subs = {}
    # We use the numbers returned to group the different subsets
    for sub_index, grouping in enumerate(model):
        insub = subs.setdefault(grouping, [])
        insub.append(sub_index)

    # We now have what we need to create the subsets. Each entry holds the
    # indices of the user subsets that should be merged together
    created_subsets = []
    for sub_indexes in subs.values():
        subs_to_merge = [cfg.user_subsets[i] for i in sub_indexes]
        sub = subset_ops.merge_subsets(subs_to_merge)
        created_subsets.append(sub)

    return Scheme(cfg, str(scheme_name), created_subsets)
Example #8
def get_nearest_neighbour_scheme(start_scheme, scheme_name, cfg):
    """
    Take a scheme and find a neighbouring scheme in which the two closest
    subsets have been merged, so the neighbour has one less subset than the
    current scheme. Really this is just progressive clustering, tailored
    to work well with PartitionFinder
    """

    # we use [0] because the function returns a ranked list of lists of length 1
    subsets = [s for s in start_scheme.subsets]
    closest_subsets = get_N_closest_subsets(subsets, cfg, 1)[0]

    merged_sub = subset_ops.merge_subsets(closest_subsets)

    scheme = make_clustered_scheme(start_scheme, scheme_name, closest_subsets,
                                   merged_sub, cfg)

    return scheme
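
A toy version of the closest-pair search that get_N_closest_subsets performs,
using scipy's condensed distance matrix (the per-subset feature vectors are
made up for illustration):

import numpy as np
from scipy import spatial

features = np.array([[0.1], [0.9], [0.15], [0.5]])

dists = spatial.distance.pdist(features)    # condensed pairwise distances
i, j = np.triu_indices(len(features), k=1)  # (row, col) for each condensed entry
best = np.argmin(dists)
print("closest pair: %d, %d" % (i[best], j[best]))  # closest pair: 0, 2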
Example #9
    def reassign_invariant_sites(self, subsets):

        #TODO add a skip:
        #if(len(subsets)==1):
        #   return(subsets)

        # get entropies for whole alignment for this subset
        onesub = subset_ops.merge_subsets(subsets)
        entropies = entropy.sitewise_entropies(
            SubsetAlignment(self.alignment, onesub))

        # find nearest site for each invariant site
        # replacements is a dict of: key: invariant col; value: replacement col,
        # e.g.
        # {512: 513, 514: 513, 515: 513, 516: 517}
        replacements = entropy.get_replacement_sites(entropies, onesub.columns)

        # now make a dict of the CURRENT subsets: key: site; value: subset
        sch_dict = {}
        for i, sub in enumerate(subsets):
            for site in sub.columns:
                sch_dict[site] = i

        # then reassign the sites as necessary based on replacements
        for r in replacements:
            sch_dict[r] = sch_dict[replacements[r]]

        # now build subsets according to the new sites
        sub_dict = {}  # this gives us the subsets to build
        for k, v in sch_dict.iteritems():
            sub_dict.setdefault(v, []).append(k)

        new_subsets = []
        for s in sub_dict:
            n = Subset(the_config, set(sub_dict[s]))
            new_subsets.append(n)

        return new_subsets
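
The reassignment itself is a dict rewrite: each invariant column adopts the
subset of its replacement column, and sites are then regrouped. A
self-contained sketch with toy column numbers:

site_to_subset = {512: 0, 513: 1, 514: 0, 515: 2, 516: 0, 517: 1}
replacements = {512: 513, 516: 517}  # invariant column -> donor column

for invariant, donor in replacements.items():
    site_to_subset[invariant] = site_to_subset[donor]

new_groups = {}
for site, sub in site_to_subset.items():
    new_groups.setdefault(sub, []).append(site)

print(new_groups)  # {1: [512, 513, 516, 517], 0: [514], 2: [515]}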
Example #10
    def make_tree(self, user_path):
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset_ops.merge_subsets(
            the_config.user_subsets)
        self.filtered_alignment = SubsetAlignment(self.alignment,
                                                  subset_with_everything)
        self.filtered_alignment_path = os.path.join(the_config.start_tree_path,
                                                    'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Check the full subset against the alignment
        subset_ops.check_against_alignment(subset_with_everything,
                                           self.alignment, the_config)

        # We start by copying the alignment
        self.alignment_path = os.path.join(the_config.start_tree_path,
                                           'source.phy')

        # Now check for the tree
        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, then use that, otherwise, create a topology
            util.clean_out_folder(the_config.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s" % user_path)
                topology_path = os.path.join(the_config.start_tree_path,
                                             'user_topology.phy')
                util.dupfile(user_path, topology_path)
                need_bl = True
            elif the_config.no_ml_tree:
                log.debug("didn't find tree at %s, making a new one" %
                          tree_path)
                topology_path = the_config.processor.make_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras)
                need_bl = True
            else:
                log.debug(
                    "didn't find tree at %s, making an ML tree with RAxML" %
                    tree_path)

                tree_scheme = scheme.create_scheme(
                    the_config, "tree_scheme",
                    range(len(the_config.user_subsets)))

                topology_path = raxml.make_ml_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras, tree_scheme, self.threads)

                # here we copy the ML tree topology so it can be used with PhyML too
                # TODO: this is a hack, and it would be better to decide on a universal
                # name for the different types of tree we might have.
                phyml_tree = os.path.join(
                    os.path.dirname(topology_path),
                    "filtered_source.phy_phyml_tree.txt")
                copyfile(topology_path, phyml_tree)

                need_bl = False

            if need_bl:
                # Now estimate branch lengths
                tree_path = the_config.processor.make_branch_lengths(
                    self.filtered_alignment_path, topology_path,
                    the_config.datatype, the_config.cmdline_extras)

        self.tree_path = tree_path
        log.debug("Starting tree with branch lengths is here: %s" %
                  self.tree_path)
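
The branching above reduces to one decision: reuse an existing tree if there
is one; otherwise choose a topology source, and only estimate branch lengths
when the topology step did not already produce them. A condensed sketch of
just that decision (all names are hypothetical stand-ins for the processor
calls above):

def choose_topology(user_path, no_ml_tree, make_topology, make_ml_topology):
    """Return (topology_path, need_branch_lengths)."""
    if user_path:
        return user_path, True            # user topology: still needs BLs
    if no_ml_tree:
        return make_topology(), True      # quick topology: still needs BLs
    return make_ml_topology(), False      # ML tree comes with BLs

topo, need_bl = choose_topology("", False,
                                lambda: "quick.tree",
                                lambda: "ml.tree")
print("%s, need_bl=%s" % (topo, need_bl))  # ml.tree, need_bl=False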
Example #11
    def finalise_fabrication(self, start_subsets, step):

        fabricated_subsets = []
        for s in start_subsets:

            # here we put a sensible lower limit on the size of subsets
            if len(s.columns) < the_config.min_subset_size:
                s.fabricated = True
                log.debug("Subset %s with only %d sites found" %
                          (s.subset_id, len(s.columns)))

            # here we can test if the alignment has all states:
            state_probs = self.alignment.check_state_probs(s, the_config)
            if state_probs:
                s.fabricated = True
                log.debug(
                    "Subset %s does not have all states in the alignment",
                    s.subset_id)

            if s.fabricated:
                fabricated_subsets.append(s)
                log.debug("added %s to fabricated subset", s.name)

        if fabricated_subsets:
            with logtools.indented(log, "Finalising partitioning scheme"):
                log.debug("There are %d/%d fabricated subsets" %
                          (len(fabricated_subsets), len(start_subsets)))

                i = 1
                while fabricated_subsets:

                    all_subs = start_subsets

                    # occasionally subsets with all value == 0.0 are given a
                    # centroid of None by scikit-learn. The true entropy here
                    # is 0.0 for all sites, so the true centroid is 0.0
                    for s in all_subs:
                        if s.centroid is None:
                            s.centroid = [0.0]
                            log.debug("Fixed a subset with a centroid of None")
                            log.debug("The subset has %d columns" %
                                      len(s.columns))

                    s = fabricated_subsets.pop(0)

                    log.debug("Working on fabricated subset %s with %d sites" %
                              (s.subset_id, len(s.columns)))
                    log.info("Finalising subset %d", i)
                    i = i + 1

                    all_subs.remove(s)

                    centroid = s.centroid

                    best_match = None

                    # get closest subset to s
                    for sub in all_subs:

                        centroid_array = [sub.centroid, centroid]

                        euclid_dist = spatial.distance.pdist(centroid_array)

                        if best_match is None or euclid_dist < best_match:
                            best_match = euclid_dist
                            closest_sub = sub

                    # join s with closest_sub to make joined_sub
                    merged_sub = subset_ops.merge_subsets([s, closest_sub])

                    # remove closest sub
                    all_subs.remove(closest_sub)

                    # and if closest_sub was fabricated too, we remove it here
                    if closest_sub in fabricated_subsets:
                        fabricated_subsets.remove(closest_sub)

                    # analyse joined sub
                    self.analyse_list_of_subsets([merged_sub])

                    # here we put a sensible lower limit on the size of subsets
                    if len(merged_sub.columns) < the_config.min_subset_size:
                        merged_sub.fabricated = True

                    # if joined has to be fabricated, add to fabricated list
                    if merged_sub.fabricated:
                        fabricated_subsets.append(merged_sub)

                    all_subs.append(merged_sub)
        else:
            all_subs = start_subsets

        # now build a scheme from start_subs, and it should work
        final_scheme = scheme.Scheme(the_config, "final_scheme", all_subs)

        # return final scheme
        return final_scheme
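
The best-match scan over centroids is equivalent to a min() over centroid
distances. A sketch with made-up centroids:

import numpy as np
from scipy import spatial

target = np.array([0.2, 0.7])
centroids = {"sub1": np.array([0.9, 0.1]),
             "sub2": np.array([0.25, 0.65]),
             "sub3": np.array([0.5, 0.5])}

closest = min(centroids,
              key=lambda name: spatial.distance.euclidean(target,
                                                          centroids[name]))
print(closest)  # sub2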
Example #12
    def do_analysis(self):

        # initialisation steps
        model_selection = the_config.model_selection
        partnum = len(the_config.user_subsets)

        scheme_count = submodels.count_relaxed_clustering_schemes(
            partnum, the_config.cluster_percent, the_config.cluster_max)
        subset_count = submodels.count_relaxed_clustering_subsets(
            partnum, the_config.cluster_percent, the_config.cluster_max)

        log.info("PartitionFinder will have to analyse %d subsets to"
                 " complete this analyses" % subset_count)
        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                                range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]
        partnum = len(subsets)
        step = 1
        while True:
            with logtools.indented(
                    log,
                    "*** Relaxed clustering algorithm step %d of up to %d ***"
                    % (step, partnum - 1)):

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)
                log.info("Measuring the similarity of %d subset pairs" %
                         max_schemes)
                d_matrix = neighbour.get_distance_matrix(
                    subsets, the_config.cluster_weights)

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = int(
                    math.ceil(max_schemes *
                              (the_config.cluster_percent * 0.01)))
                if cutoff <= 0:
                    cutoff = 1
                if the_config.cluster_max is not None and cutoff > the_config.cluster_max:
                    cutoff = the_config.cluster_max
                log.info("Choosing the %d most similar subset pairs" % cutoff)
                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse K subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix,
                                                      subsets)
                if len(pairs_todo) > 0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" % (step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                            start_scheme, scheme_name, pair, pair_merged,
                            the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(
                        c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)
                best_scheme = start_scheme

                if best_change >= 0:
                    log.info(
                        "Found no schemes that improve the score, stopping")
                    break

                median_improvement = np.median(c_matrix[c_matrix < 0])

                while best_change <= median_improvement:

                    best_pair = neighbour.get_best_pair(
                        c_matrix, best_change, subsets)
                    best_merged = subset_ops.merge_subsets(best_pair)
                    best_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, best_pair, best_merged,
                        the_config)
                    start_scheme = best_scheme

                    log.info("Combining subsets: '%s' and '%s'" %
                             (best_pair[0].name, best_pair[1].name))
                    log.info("This improves the %s score by: %s",
                             the_config.model_selection, str(abs(best_change)))

                    # reset_c_matrix and the subset list
                    c_matrix = neighbour.reset_c_matrix(
                        c_matrix, list(best_pair), [best_merged], subsets)

                    # we update the subset list in a way that means its structure tracks the c-matrix
                    subsets = neighbour.reset_subsets(subsets, list(best_pair),
                                                      [best_merged])

                    best_change = np.amin(c_matrix)

                    if the_config.search == 'rcluster':
                        # with rcluster we take only the single best change;
                        # rclusterf continues around this loop
                        break

                # the best change can get updated a fraction at this point
                # because calculating the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_result = self.analyse_scheme(best_scheme)
                best_change = self.results.best_score - start_score

                log.info(
                    "The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
                    len(best_scheme.subsets), the_config.model_selection,
                    np.abs(best_change), self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                if len(set(start_scheme.subsets)) == 1:
                    break

                step += 1

        log.info("Relaxed clustering algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, model_selection,
                  self.results.best_score))

        if the_config.min_subset_size or the_config.all_states:
            best_scheme = self.clean_scheme(self.results.best_scheme)
            best_result = self.analyse_scheme(best_scheme)

            # scores after cleaning can be worse, so we reset these trackers...
            self.results.best_result = best_result
            self.results.best_score = best_result.score
            self.results.best_scheme = best_scheme
            log.info(
                "Best scoring scheme after cleaning is scheme %s, with %s score of %.3f"
                % (self.results.best_scheme.name, model_selection,
                   self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)
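
The c_matrix bookkeeping leans on scipy's condensed/square conversions: a
vector of n*(n-1)/2 pair scores becomes a symmetric matrix whose diagonal is
zero, which is exactly why the code above only acts on negative changes. A
minimal demonstration:

import numpy as np
from scipy import spatial

n = 4
condensed = np.empty(n * (n - 1) // 2)
condensed[:] = np.inf
square = spatial.distance.squareform(condensed)

print(square.shape)     # (4, 4)
print(np.amin(square))  # 0.0 -- the zero diagonal, never a real improvement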
Example #13
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches'''

        partnum = len(the_config.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                                range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]

        step = 1
        while len(set(start_scheme.subsets)) > 1:
            with logtools.indented(log,
                                   "***Greedy algorithm step %d***" % step):
                name_prefix = "step_%d" % (step)

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)

                # this is a fake distance matrix, so that the greedy algorithm
                # can use all the tricks of the relaxed clustering algorithm
                dim = len(subsets)
                d_matrix = np.zeros((dim * dim - dim) // 2)  # condensed length
                d_matrix[:] = np.inf

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = max_schemes  # this defines the greedy algorithm: we look at all schemes

                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix,
                                                      subsets)
                if len(pairs_todo) > 0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" % (step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                            start_scheme, scheme_name, pair, pair_merged,
                            the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(
                        c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)

                log.debug("Biggest improvement in info score: %s",
                          str(best_change))

                if best_change >= 0:
                    log.info(
                        "Found no schemes that improve the score, stopping")
                    break

                best_pair = neighbour.get_best_pair(c_matrix, best_change,
                                                    subsets)

                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged,
                    the_config)
                best_result = self.analyse_scheme(best_scheme)

                # the best change can get updated a fraction at this point
                # because calculating the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_change = self.results.best_score - start_score

                log.info("Best scheme combines subsets: '%s' and '%s'" %
                         (best_pair[0].name, best_pair[1].name))

                log.info(
                    "The best scheme improves the %s score by %.2f to %.1f",
                    the_config.model_selection, np.abs(best_change),
                    self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score

                log.debug("Best pair: %s", str([s.name for s in best_pair]))
                log.debug("Merged into: %s", str([best_merged.name]))

                # 5. reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair),
                                                    [best_merged], subsets)

                # we update the subset list in a special way that matches how we update the c matrix
                subsets = neighbour.reset_subsets(subsets, list(best_pair),
                                                  [best_merged])

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, the_config.model_selection,
                  self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)
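
The comb(n, 2) bound used above and the condensed matrix length are the same
quantity; a quick sanity check (assuming comb comes from scipy, where it
returns a float unless exact=True):

from scipy.special import comb

n = 10
print(comb(n, 2))        # 45.0
print((n * n - n) // 2)  # 45, the condensed d_matrix length used above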
Example #14
    def do_analysis(self):

        # initialisation steps
        model_selection = the_config.model_selection
        partnum = len(the_config.user_subsets)

        if the_config.cluster_max == -987654321:
            the_config.cluster_max = max([1000, (10 * len(the_config.user_subsets))])
            log.info("Set rcluster-max to %d" %the_config.cluster_max) 

        scheme_count = submodels.count_relaxed_clustering_schemes(
            partnum, the_config.cluster_percent, the_config.cluster_max)
        subset_count = submodels.count_relaxed_clustering_subsets(
            partnum, the_config.cluster_percent, the_config.cluster_max)

        log.info("PartitionFinder will have to analyse %d subsets to"
                 " complete this analyses" % subset_count)
        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(
                the_config, "start_scheme", range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]
        partnum = len(subsets)
        step = 1
        while True:
            with logtools.indented(log, "*** Relaxed clustering algorithm step %d of up to %d ***"
                % (step, partnum - 1)):

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)
                log.info("Measuring the similarity of %d subset pairs" % max_schemes)
                d_matrix = neighbour.get_distance_matrix(subsets,
                    the_config.cluster_weights)

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = int(math.ceil(max_schemes * (the_config.cluster_percent * 0.01)))
                if cutoff <= 0:
                    cutoff = 1
                if the_config.cluster_max is not None and cutoff > the_config.cluster_max:
                    cutoff = the_config.cluster_max
                log.info("Choosing the %d most similar subset pairs" % cutoff)
                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse K subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix, subsets)
                if len(pairs_todo) > 0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" %(step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                                start_scheme, scheme_name, pair, pair_merged, the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)
                best_scheme = start_scheme

                if best_change >= 0:
                    log.info("Found no schemes that improve the score, stopping")
                    break

                median_improvement = np.median(c_matrix[c_matrix < 0])

                while best_change <= median_improvement:

                    best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)
                    best_merged = subset_ops.merge_subsets(best_pair)
                    best_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, best_pair, best_merged, the_config)
                    start_scheme = best_scheme

                    log.info("Combining subsets: '%s' and '%s'" %(best_pair[0].name, best_pair[1].name))
                    log.debug("This improves the %s score by: %s", the_config.model_selection, str(abs(best_change)))

                    # reset_c_matrix and the subset list
                    c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

                    # we update the subset list in a way that means its structure tracks the c-matrix
                    subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

                    best_change = np.amin(c_matrix)

                    if the_config.search == 'rcluster':
                        # with rcluster we take only the single best change;
                        # rclusterf continues around this loop
                        break


                # the best change can get updated a fraction at this point
                # because calculating the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_result = self.analyse_scheme(best_scheme)
                best_change = self.results.best_score - start_score


                log.info("The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
                    len(best_scheme.subsets),
                    the_config.model_selection,
                    np.abs(best_change),
                    self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score


                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)


                if len(set(start_scheme.subsets)) == 1:
                    break

                step += 1

        log.info("Relaxed clustering algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, model_selection,
                    self.results.best_score))

        if the_config.min_subset_size or the_config.all_states:
            best_scheme = self.clean_scheme(self.results.best_scheme)
            best_result = self.analyse_scheme(best_scheme)

            # scores after cleaning can be worse, so we reset these trackers...
            self.results.best_result = best_result
            self.results.best_score = best_result.score
            self.results.best_scheme = best_scheme
            log.info("Best scoring scheme after cleaning is scheme %s, with %s score of %.3f"
                     % (self.results.best_scheme.name, model_selection,
                        self.results.best_score))


        the_config.reporter.write_best_scheme(self.results)
Exemplo n.º 22
0
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches'''

        partnum = len(the_config.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(
                the_config, "start_scheme", range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]

        step = 1
        while len(set(start_scheme.subsets)) > 1:
            with logtools.indented(log, "***Greedy algorithm step %d***" % step):
                name_prefix = "step_%d" % (step)

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)

                # this is a fake distance matrix, so that the greedy algorithm
                # can use all the tricks of the relaxed clustering algorithm
                dim = len(subsets)
                d_matrix = np.zeros((((dim*dim)-dim))/2)
                d_matrix[:] = np.inf

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = max_schemes # this defines the greedy algorithm: we look at all schemes

                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix, subsets)
                if len(pairs_todo)>0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" %(step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                                start_scheme, scheme_name, pair, pair_merged, the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(c_matrix, sub_tuples, subsets, diffs)


                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)

                log.debug("Biggest improvement in info score: %s", str(best_change))

                if best_change>=0:
                    log.info("Found no schemes that improve the score, stopping")
                    break

                best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)

                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged, the_config)
                best_result = self.analyse_scheme(best_scheme)

                # the best change can get updated a fraction at this point
                # because calaculting the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_change = self.results.best_score - start_score


                log.info("Best scheme combines subsets: '%s' and '%s'" %(best_pair[0].name, best_pair[1].name))


                log.info("The best scheme improves the %s score by %.2f to %.1f",
                    the_config.model_selection,
                    np.abs(best_change),
                    self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score

                log.debug("Best pair: %s", str([s.name for s in best_pair]))
                log.debug("Merged into: %s", str([best_merged.name]))

                # 5. reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

                # we update the subset list in a special way, matching how we
                # updated the c_matrix above:
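                # (assumed semantics, inferred from the function names:) if
                # subsets == [A, B, C, D] and best_pair == (B, D), the list
                # becomes [A, C, merged(B, D)], and the c_matrix rows/columns
                # for B and D are replaced by fresh, not-yet-scored entries
                # for the merged subset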
                subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, the_config.model_selection,
                    self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)
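
To make the greedy loop above concrete, here is a minimal, self-contained sketch of a single step. It is not PartitionFinder's implementation: score is a hypothetical stand-in for the information-score calculation that analyse_scheme performs, and the subsets are plain string labels.

import itertools

def greedy_step(subsets, score):
    """One greedy step: score every pairwise merge, keep the best.

    `subsets` is a list of string labels; `score` maps a tuple of labels
    (a scheme) to a number where lower is better. Both are hypothetical
    stand-ins for the real subset objects and scoring machinery.
    """
    start_score = score(tuple(subsets))
    best_change, best_pair = 0.0, None
    for a, b in itertools.combinations(subsets, 2):
        merged = a + "+" + b  # toy merge: just join the labels
        candidate = [s for s in subsets if s not in (a, b)] + [merged]
        change = score(tuple(candidate)) - start_score
        # only a negative change is an improvement, mirroring the
        # best_change >= 0 stopping rule above
        if change < best_change:
            best_change, best_pair = change, (a, b)
    return best_pair, best_change

Applying greedy_step repeatedly until it returns None for the pair reproduces the stopping rule in the loop above.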
Example No. 23
    def finalise_fabrication(self, start_subsets, step):

        fabricated_subsets = []
        for s in start_subsets:

            # here we put a sensible lower limit on the size of subsets
            if len(s.columns) < the_config.min_subset_size:
                s.fabricated = True
                log.debug("Subset %s with only %d sites found" %(s.subset_id, len(s.columns)))

            # here we can test if the alignment has all states:
            state_probs = self.alignment.check_state_probs(s, the_config)
            if state_probs:
                s.fabricated = True
                log.debug("Subset %s does not have all states in the alignment", s.subset_id)

            if s.fabricated:
                fabricated_subsets.append(s)
                log.debug("added %s to fabricated subset", s.name)

        if fabricated_subsets:
            with logtools.indented(log, "Finalising partitioning scheme"):
                log.debug("There are %d/%d fabricated subsets"
                          % (len(fabricated_subsets), len(start_subsets)))

                i = 1
                while fabricated_subsets:

                    all_subs = start_subsets

                    # occasionally subsets where every value == 0.0 are given
                    # a centroid of None by scikit-learn. The true entropy here
                    # is 0.0 for all sites, so the true centroid is 0.0
                    for s in all_subs:
                        if s.centroid is None:
                            s.centroid = [0.0]
                            log.debug("Fixed a subset with a centroid of None")
                            log.debug("The subset has %d columns" % len(s.columns))

                    s = fabricated_subsets.pop(0)

                    log.debug("Working on fabricated subset %s with %d sites" %(s.subset_id, len(s.columns)))
                    log.info("Finalising subset %d", i)
                    i = i+1

                    all_subs.remove(s)

                    centroid = s.centroid

                    best_match = None

                    # get closest subset to s
                    for sub in all_subs:

                        centroid_array = [sub.centroid, centroid]

                        euclid_dist = spatial.distance.pdist(centroid_array)
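                        # pdist on a two-row array returns the single pairwise
                        # distance, e.g. spatial.distance.pdist(
                        #     [[0.1, 0.2], [0.4, 0.6]]) == array([ 0.5])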

                        if best_match is None or euclid_dist < best_match:
                            best_match = euclid_dist
                            closest_sub = sub

                    # merge s with closest_sub to make merged_sub
                    merged_sub = subset_ops.merge_subsets([s, closest_sub])

                    # remove closest sub
                    all_subs.remove(closest_sub)

                    # and if closest_sub was fabricated too, we remove it here
                    if closest_sub in fabricated_subsets:
                        fabricated_subsets.remove(closest_sub)

                    # analyse the merged sub
                    self.analyse_list_of_subsets([merged_sub])

                    # here we put a sensible lower limit on the size of subsets
                    if len(merged_sub.columns) < the_config.min_subset_size:
                        merged_sub.fabricated = True

                    # if the merged sub also has to be fabricated, add it to the list
                    if merged_sub.fabricated:
                        fabricated_subsets.append(merged_sub)

                    all_subs.append(merged_sub)
        else:
            all_subs = start_subsets

        # now build a scheme from all_subs, and it should work
        final_scheme = scheme.Scheme(the_config, "final_scheme", all_subs)

        # return final scheme
        return final_scheme
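
A side note on the nearest-centroid search in the loop above: calling pdist once per candidate subset is fine for short lists, but the same lookup can be done in one vectorised call. A minimal sketch, assuming each subset carries a numeric centroid sequence of equal length (the helper name is hypothetical):

import numpy as np
from scipy import spatial

def closest_by_centroid(target, others):
    """Return the member of `others` whose centroid is nearest to
    target's centroid, by Euclidean distance."""
    centroids = np.array([s.centroid for s in others])
    dists = spatial.distance.cdist([target.centroid], centroids)[0]
    return others[int(dists.argmin())]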
Example No. 24
    def make_tree(self, user_path):
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset_ops.merge_subsets(the_config.user_subsets)
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            the_config.start_tree_path, 'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Check the full subset against the alignment
        subset_ops.check_against_alignment(subset_with_everything, self.alignment, the_config)

        # Set the path for the copy of the full source alignment
        self.alignment_path = os.path.join(
            the_config.start_tree_path, 'source.phy')

        # Now check for the tree
        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree then use that; otherwise create a topology
            util.clean_out_folder(the_config.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s" % user_path)
                topology_path = os.path.join(the_config.start_tree_path, 'user_topology.phy')
                util.dupfile(user_path, topology_path)
                need_bl = True
            elif the_config.no_ml_tree:
                log.debug(
                    "didn't find tree at %s, making a new one" % tree_path)
                topology_path = the_config.processor.make_topology(
                    self.filtered_alignment_path, the_config.datatype, the_config.cmdline_extras)
                need_bl = True
            else:
                log.debug(
                    "didn't find tree at %s, making an ML tree with RAxML" % tree_path)

                tree_scheme = scheme.create_scheme(
                    the_config, "tree_scheme", range(len(the_config.user_subsets)))

                topology_path = raxml.make_ml_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras, tree_scheme, self.threads)
                
                # here we copy the ML tree topology so it can be used with PhyML too
                # TODO: this is a hack, and it would be better to decide on a universal
                # name for the different types of tree we might have.
                phyml_tree = os.path.join(os.path.dirname(topology_path), "filtered_source.phy_phyml_tree.txt")
                copyfile(topology_path, phyml_tree)

                need_bl = False

            if need_bl:
                # Now estimate branch lengths
                tree_path = the_config.processor.make_branch_lengths(
                    self.filtered_alignment_path,
                    topology_path,
                    the_config.datatype,
                    the_config.cmdline_extras)

        self.tree_path = tree_path
        log.debug("Starting tree with branch lengths is here: %s" %
                 self.tree_path)
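
The branch-length bookkeeping in make_tree reduces to a small decision table. A hedged distillation follows (the function name is hypothetical; user_path and no_ml_tree match the variables above):

def topology_strategy(user_path, no_ml_tree):
    """Mirror the branching above: return a (topology_source,
    needs_branch_lengths) pair. A user topology or a quick non-ML
    topology still needs branch lengths estimated afterwards; the
    RAxML ML tree already includes them."""
    if user_path:      # user-supplied topology is copied in
        return "user", True
    if no_ml_tree:     # quick topology from the processor
        return "processor", True
    return "raxml_ml", False  # full ML tree, branch lengths included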