예제 #1
0
def run_analytical_fit(boot_collection,
                       ref_collection,
                       ref_coords,
                       task=_fast_geo,
                       rooted=False,
                       **kwargs):
    """
    Fit every bootstrap tree against the reference coordinates.

    Each tree of ``boot_collection`` is compared with every tree of
    ``ref_collection`` via ``task``; the resulting vector is handed to
    ``AnalyticalFit(ref_coords.values, **kwargs).fit`` and the outcome is
    stored as one row of the returned array.

    :param boot_collection: object supporting ``len()`` and exposing
        ``trees``; these are the query trees.
    :param ref_collection: object exposing ``trees``; these are the
        reference trees.
    :param ref_coords: reference coordinates; must provide ``.shape`` and
        ``.values`` (presumably a pandas DataFrame - confirm with callers).
    :param task: callable invoked as ``task(query, reference, False)``.
    :param rooted: forwarded as the second argument of ``PhyloTree``.
    :param kwargs: forwarded unchanged to ``AnalyticalFit``.
    :return: array with ``len(boot_collection)`` rows and
        ``ref_coords.shape[1]`` columns.
    """

    def as_phylotrees(collection):
        # On Python 3 each tree is passed through .encode() before being
        # given to PhyloTree; on Python 2 it is passed as-is.
        if ISPY3:
            return [
                PhyloTree(newick.encode(), rooted)
                for newick in collection.trees
            ]
        return [PhyloTree(newick, rooted) for newick in collection.trees]

    fitted = np.empty((len(boot_collection), ref_coords.shape[1]))
    queries = as_phylotrees(boot_collection)
    references = as_phylotrees(ref_collection)

    for row, query in enumerate(queries):
        distances = np.array(
            [task(query, reference, False) for reference in references])
        # A fresh AnalyticalFit is built per query tree, as before.
        fitted[row] = AnalyticalFit(ref_coords.values,
                                    **kwargs).fit(distances)
    return fitted
예제 #2
0
def run_out_of_sample_mds(boot_collection,
                          ref_collection,
                          ref_distance_matrix,
                          index,
                          dimensions,
                          task=_fast_geo,
                          rooted=False,
                          **kwargs):
    """
    Fit every bootstrap tree with ``OutOfSampleMDS``.

    Each tree of ``boot_collection`` is compared with every tree of
    ``ref_collection`` via ``task``; the resulting distance vector is passed
    to ``OutOfSampleMDS(ref_distance_matrix).fit`` and the outcome is stored
    as one row of the returned array.

    :param boot_collection: object supporting ``len()`` and exposing
        ``trees``; these are the query trees.
    :param ref_collection: object exposing ``trees``; these are the
        reference trees.
    :param ref_distance_matrix: passed to the ``OutOfSampleMDS`` constructor.
    :param index: index of the locus the bootstrap sample corresponds to -
        only important if using recalc=True in kwargs.
    :param dimensions: number of columns of the result; also forwarded to
        ``OutOfSampleMDS.fit`` as ``dimensions``.
    :param task: callable invoked as ``task(query, reference, False)``.
    :param rooted: forwarded as the second argument of ``PhyloTree``.
    :param kwargs: forwarded unchanged to ``OutOfSampleMDS.fit``.
    :return: array with ``len(boot_collection)`` rows and ``dimensions``
        columns.
    """

    def as_phylotrees(collection):
        # On Python 3 each tree is passed through .encode() before being
        # given to PhyloTree; on Python 2 it is passed as-is.
        if ISPY3:
            return [
                PhyloTree(newick.encode(), rooted)
                for newick in collection.trees
            ]
        return [PhyloTree(newick, rooted) for newick in collection.trees]

    embedded = np.empty((len(boot_collection), dimensions))
    queries = as_phylotrees(boot_collection)
    references = as_phylotrees(ref_collection)

    for row, query in enumerate(queries):
        to_refs = np.array(
            [task(query, reference, False) for reference in references])
        # A fresh OutOfSampleMDS is built per query tree, as before.
        embedded[row] = OutOfSampleMDS(ref_distance_matrix).fit(
            index, to_refs, dimensions=dimensions, **kwargs)
    return embedded
예제 #3
0
def run_optimise_bootstrap_coords(boot_collection,
                                  ref_collection,
                                  ref_coords,
                                  task=_fast_geo,
                                  rooted=False,
                                  **kwargs):
    """
    Fit every bootstrap tree with ``OptimiseDistanceFit.newton``.

    Each tree of ``boot_collection`` is compared with every tree of
    ``ref_collection`` via ``task``; the resulting distance vector and
    ``ref_coords.values`` are used to build an ``OptimiseDistanceFit`` whose
    ``newton(**kwargs)`` result is stored as one row of the returned array.

    :param boot_collection: object supporting ``len()`` and exposing
        ``trees``; these are the query trees.
    :param ref_collection: object exposing ``trees``; these are the
        reference trees.
    :param ref_coords: reference coordinates; must provide ``.shape`` and
        ``.values`` (presumably a pandas DataFrame - confirm with callers).
    :param task: callable invoked as ``task(query, reference, False)``.
    :param rooted: forwarded as the second argument of ``PhyloTree``.
    :param kwargs: forwarded unchanged to ``OptimiseDistanceFit.newton``.
    :return: array with ``len(boot_collection)`` rows and
        ``ref_coords.shape[1]`` columns.
    """

    def as_phylotrees(collection):
        # On Python 3 each tree is passed through .encode() before being
        # given to PhyloTree; on Python 2 it is passed as-is.
        if ISPY3:
            return [
                PhyloTree(newick.encode(), rooted)
                for newick in collection.trees
            ]
        return [PhyloTree(newick, rooted) for newick in collection.trees]

    optimised = np.empty((len(boot_collection), ref_coords.shape[1]))
    queries = as_phylotrees(boot_collection)
    references = as_phylotrees(ref_collection)

    for row, query in enumerate(queries):
        distances = np.array(
            [task(query, reference, False) for reference in references])
        optimised[row] = OptimiseDistanceFit(ref_coords.values,
                                             distances).newton(**kwargs)
    return optimised
예제 #4
0
 def get_inter_tree_distances(self,
                              metric,
                              jobhandler=default_jobhandler,
                              normalise=False,
                              batchsize=1):
     """
     Generate a distance matrix from a fully-populated Collection.

     :param metric: str. Key selecting the tree distance task; one of 'euc',
         'geo', 'rf', 'wrf', 'fasteuc', 'fastgeo', 'fastrf', 'fastwrf'.
         Checked with ``optioncheck``. For the 'fast*' keys every entry of
         ``self.trees`` is wrapped as ``PhyloTree(entry, False)`` first.
     :param jobhandler: callable invoked as
         ``jobhandler(task, args, message, batchsize)``.
     :param normalise: forwarded to the task's ``scrape_args``.
     :param batchsize: forwarded to ``jobhandler``.
     :return: result of ``DistanceMatrix.from_array`` applied to the
         ``squareform`` of the jobhandler output and ``self.names``.
     """
     task_classes = dict(
         euc=tasks.EuclideanTreeDistance,
         geo=tasks.GeodesicTreeDistance,
         rf=tasks.RobinsonFouldsTreeDistance,
         wrf=tasks.WeightedRobinsonFouldsTreeDistance,
         fasteuc=tasks.EqualLeafSetEuclideanTreeDistance,
         fastgeo=tasks.EqualLeafSetGeodesicTreeDistance,
         fastrf=tasks.EqualLeafSetRobinsonFouldsTreeDistance,
         fastwrf=tasks.EqualLeafSetWeightedRobinsonFouldsTreeDistance,
     )
     optioncheck(metric, list(task_classes))
     distance_task = task_classes[metric]()

     # The generator stays lazy: PhyloTree objects are only created once
     # scrape_args starts consuming it.
     tree_inputs = ((PhyloTree(newick, False) for newick in self.trees)
                    if metric.startswith('fast') else self.trees)

     job_args = distance_task.scrape_args(tree_inputs, normalise)
     logger.debug('{}'.format(job_args))
     label = distance_task.name
     condensed = jobhandler(distance_task.get_task(), job_args, label,
                            batchsize)
     square = squareform(condensed)
     return DistanceMatrix.from_array(square, self.names)
예제 #5
0
 def phylotree(self):
     """
     Return the c++ PhyloTree object corresponding to this tree.

     The cached PhyloTree is reused as long as one exists and the _dirty
     flag is not set. Otherwise a new PhyloTree is built from self.newick
     and self.rooted, cached, and the _dirty flag is cleared.
     :return: PhyloTree instance
     """
     # Fast path: a cached object exists and is still current.
     if self._phylotree and not self._dirty:
         return self._phylotree

     self._phylotree = PhyloTree(self.newick, self.rooted)
     self._dirty = False
     return self._phylotree
예제 #6
0
 def get_inter_tree_distances(self, metric, jobhandler=default_jobhandler,
                              normalise=False, min_overlap=4, overlap_fail_value=0,
                              batchsize=1, show_progress=True):
     """ Generate a distance matrix from a fully-populated Collection.
         Can silence progressbars with show_progress=False option
     :param metric: str. Tree distance metric to use. Choice of 'euc', 'geo', 'rf', 'wrf', or their
         'fasteuc', 'fastgeo', 'fastrf', 'fastwrf' counterparts (equal-leaf-set task variants, for which
         every tree is first wrapped in a PhyloTree).
     :param jobhandler: treeCl.Jobhandler. Choice of SequentialJobHandler, ThreadpoolJobHandler, or
         ProcesspoolJobHandler.
     :param normalise:  Bool. Whether to normalise the tree distance to the size of the leaf set.
     :param min_overlap: int. Trees with fewer leaves in common than this threshold will not have their distance
         calculated, but instead the distance returned will be the value in `overlap_fail_value`.
     :param overlap_fail_value: Any. The distance between trees with fewer leaves in common than `min_overlap`
         is set to this value.
     :param batchsize: int. Number of jobs to process in a batch when using a ProcesspoolJobHandler or a
         ThreadpoolJobHandler.
     :param show_progress: Bool. When False an empty message is passed to the jobhandler instead of the
         task name.
     :return: treeCl.DistanceMatrix.
     """
     metrics = {'euc': tasks.EuclideanTreeDistance,
                'geo': tasks.GeodesicTreeDistance,
                'rf': tasks.RobinsonFouldsTreeDistance,
                'wrf': tasks.WeightedRobinsonFouldsTreeDistance,
                'fasteuc': tasks.EqualLeafSetEuclideanTreeDistance,
                'fastgeo': tasks.EqualLeafSetGeodesicTreeDistance,
                'fastrf': tasks.EqualLeafSetRobinsonFouldsTreeDistance,
                'fastwrf': tasks.EqualLeafSetWeightedRobinsonFouldsTreeDistance}
     optioncheck(metric, list(metrics.keys()))
     task_interface = metrics[metric]()

     # Take the tree count from the underlying sequence *before* it may be
     # wrapped in a generator: generators have no len(), so the previous
     # len(trees) raised TypeError for every 'fast*' metric.
     source_trees = self.trees
     n_trees = len(source_trees)

     if metric.startswith('fast'):
         # Kept lazy so PhyloTree objects are only built as scrape_args consumes them.
         trees = (PhyloTree(newick, False) for newick in source_trees)
     else:
         trees = source_trees
     args = task_interface.scrape_args(trees, normalise, min_overlap, overlap_fail_value)
     logger.debug('{}'.format(args))
     msg = task_interface.name if show_progress else ''
     # nargs is binom_coeff of the tree count (presumably the number of pairwise jobs - confirm
     # against binom_coeff's definition).
     array = jobhandler(task_interface.get_task(), args, msg, batchsize, nargs=binom_coeff(n_trees))
     return DistanceMatrix.from_array(squareform(array), self.names)