def run_analytical_fit(boot_collection, ref_collection, ref_coords, task=_fast_geo, rooted=False, **kwargs):
    """
    Fit every tree of a bootstrap collection against a set of reference
    coordinates using AnalyticalFit.

    For each bootstrap tree, `task` is evaluated against every reference
    tree; the resulting vector is handed to a freshly constructed
    AnalyticalFit, and the fitted row is stored in the output array.

    :param boot_collection: collection whose `.trees` are the query trees;
        must support `len()`.
    :param ref_collection: collection whose `.trees` are the reference trees.
    :param ref_coords: reference coordinates; must expose `.shape` and
        `.values` (presumably a DataFrame-like object - confirm with caller).
    :param task: callable invoked as `task(query_tree, ref_tree, False)`.
    :param rooted: forwarded as the second argument to PhyloTree.
    :param kwargs: forwarded to the AnalyticalFit constructor.
    :return: numpy array of shape (len(boot_collection), ref_coords.shape[1]).
    """
    fit = np.empty((len(boot_collection), ref_coords.shape[1]))
    needs_bytes = ISPY3

    def _build(newick):
        # Under Python 3 the newick text is encoded before reaching PhyloTree.
        return PhyloTree(newick.encode() if needs_bytes else newick, rooted)

    query_trees = [_build(newick) for newick in boot_collection.trees]
    ref_trees = [_build(newick) for newick in ref_collection.trees]

    for row, query in enumerate(query_trees):
        distances = np.array(
            [task(query, reference, False) for reference in ref_trees])
        # A new fitter is built for every query tree, as in the original.
        fitter = AnalyticalFit(ref_coords.values, **kwargs)
        fit[row] = fitter.fit(distances)
    return fit
def run_out_of_sample_mds(boot_collection, ref_collection, ref_distance_matrix, index, dimensions, task=_fast_geo, rooted=False, **kwargs):
    """
    Fit every tree of a bootstrap collection using OutOfSampleMDS.

    For each bootstrap tree, `task` is evaluated against every reference
    tree; the resulting distance vector is passed to a freshly constructed
    OutOfSampleMDS built from `ref_distance_matrix`.

    :param boot_collection: collection whose `.trees` are the query trees;
        must support `len()`.
    :param ref_collection: collection whose `.trees` are the reference trees.
    :param ref_distance_matrix: passed to the OutOfSampleMDS constructor.
    :param index: index of the locus the bootstrap sample corresponds to -
        only important if using recalc=True in kwargs.
    :param dimensions: number of columns in the output; also forwarded to
        `OutOfSampleMDS.fit` as `dimensions`.
    :param task: callable invoked as `task(query_tree, ref_tree, False)`.
    :param rooted: forwarded as the second argument to PhyloTree.
    :param kwargs: forwarded to `OutOfSampleMDS.fit`.
    :return: numpy array of shape (len(boot_collection), dimensions).
    """
    fit = np.empty((len(boot_collection), dimensions))
    needs_bytes = ISPY3

    def _build(newick):
        # Under Python 3 the newick text is encoded before reaching PhyloTree.
        return PhyloTree(newick.encode() if needs_bytes else newick, rooted)

    query_trees = [_build(newick) for newick in boot_collection.trees]
    ref_trees = [_build(newick) for newick in ref_collection.trees]

    for row, query in enumerate(query_trees):
        distances = np.array(
            [task(query, reference, False) for reference in ref_trees])
        # A new OutOfSampleMDS is built for every query tree, as in the original.
        embedder = OutOfSampleMDS(ref_distance_matrix)
        fit[row] = embedder.fit(index, distances, dimensions=dimensions, **kwargs)
    return fit
def run_optimise_bootstrap_coords(boot_collection, ref_collection, ref_coords, task=_fast_geo, rooted=False, **kwargs):
    """
    Fit every tree of a bootstrap collection against a set of reference
    coordinates using OptimiseDistanceFit.

    For each bootstrap tree, `task` is evaluated against every reference
    tree; an OptimiseDistanceFit is built from the reference coordinates and
    that distance vector, and the result of its `newton` method becomes the
    output row.

    :param boot_collection: collection whose `.trees` are the query trees;
        must support `len()`.
    :param ref_collection: collection whose `.trees` are the reference trees.
    :param ref_coords: reference coordinates; must expose `.shape` and
        `.values` (presumably a DataFrame-like object - confirm with caller).
    :param task: callable invoked as `task(query_tree, ref_tree, False)`.
    :param rooted: forwarded as the second argument to PhyloTree.
    :param kwargs: forwarded to `OptimiseDistanceFit.newton`.
    :return: numpy array of shape (len(boot_collection), ref_coords.shape[1]).
    """
    fit = np.empty((len(boot_collection), ref_coords.shape[1]))
    needs_bytes = ISPY3

    def _build(newick):
        # Under Python 3 the newick text is encoded before reaching PhyloTree.
        return PhyloTree(newick.encode() if needs_bytes else newick, rooted)

    query_trees = [_build(newick) for newick in boot_collection.trees]
    ref_trees = [_build(newick) for newick in ref_collection.trees]

    for row, query in enumerate(query_trees):
        distances = np.array(
            [task(query, reference, False) for reference in ref_trees])
        optimiser = OptimiseDistanceFit(ref_coords.values, distances)
        fit[row] = optimiser.newton(**kwargs)
    return fit
def get_inter_tree_distances(self, metric, jobhandler=default_jobhandler, normalise=False, batchsize=1):
    """
    Build a distance matrix between all trees of a fully-populated
    Collection.

    :param metric: str. Key selecting the distance task; one of 'euc',
        'geo', 'rf', 'wrf', 'fasteuc', 'fastgeo', 'fastrf', 'fastwrf'.
        Validated with `optioncheck`.
    :param jobhandler: callable invoked as
        `jobhandler(task, args, message, batchsize)`.
    :param normalise: forwarded to the task interface's `scrape_args`.
    :param batchsize: forwarded to `jobhandler`.
    :return: DistanceMatrix built from the squareform of the job results,
        labelled with `self.names`.
    """
    task_classes = {
        'euc': tasks.EuclideanTreeDistance,
        'geo': tasks.GeodesicTreeDistance,
        'rf': tasks.RobinsonFouldsTreeDistance,
        'wrf': tasks.WeightedRobinsonFouldsTreeDistance,
        'fasteuc': tasks.EqualLeafSetEuclideanTreeDistance,
        'fastgeo': tasks.EqualLeafSetGeodesicTreeDistance,
        'fastrf': tasks.EqualLeafSetRobinsonFouldsTreeDistance,
        'fastwrf': tasks.EqualLeafSetWeightedRobinsonFouldsTreeDistance,
    }
    optioncheck(metric, list(task_classes))
    interface = task_classes[metric]()

    # The 'fast' metrics are fed PhyloTree objects built from self.trees;
    # every other metric receives self.trees unchanged.
    wants_phylotrees = metric.startswith('fast')
    tree_source = (
        (PhyloTree(newick, False) for newick in self.trees)
        if wants_phylotrees else self.trees
    )

    job_args = interface.scrape_args(tree_source, normalise)
    logger.debug('{}'.format(job_args))
    label = interface.name
    condensed = jobhandler(interface.get_task(), job_args, label, batchsize)
    return DistanceMatrix.from_array(squareform(condensed), self.names)
def phylotree(self):
    """
    Return the c++ PhyloTree object that mirrors this tree.

    The PhyloTree is cached on the instance. It is rebuilt from
    `self.newick` and `self.rooted` when no cached object exists (the cached
    value is falsy) or when the `_dirty` flag signals that the Python tree
    has changed since the PhyloTree was constructed. Rebuilding clears the
    flag.

    :return: PhyloTree instance
    """
    cached = self._phylotree
    # Fast path: a cached object exists and the tree has not been modified.
    if cached and not self._dirty:
        return cached

    self._phylotree = PhyloTree(self.newick, self.rooted)
    self._dirty = False
    return self._phylotree
def get_inter_tree_distances(self, metric, jobhandler=default_jobhandler, normalise=False,
                             min_overlap=4, overlap_fail_value=0, batchsize=1,
                             show_progress=True):
    """
    Generate a distance matrix from a fully-populated Collection.
    Can silence progressbars with show_progress=False option

    :param metric: str. Tree distance metric to use. Choice of 'euc', 'geo',
        'rf', 'wrf', or the equal-leaf-set variants 'fasteuc', 'fastgeo',
        'fastrf', 'fastwrf'.
    :param jobhandler: treeCl.Jobhandler. Choice of SequentialJobHandler,
        ThreadpoolJobHandler, or ProcesspoolJobHandler.
    :param normalise: Bool. Whether to normalise the tree distance to the
        size of the leaf set.
    :param min_overlap: int. Trees with fewer leaves in common than this
        threshold will not have their distance calculated, but instead the
        distance returned will be the value in `overlap_fail_value`.
    :param overlap_fail_value: Any. The distance between trees with fewer
        leaves in common than `min_overlap` is set to this value.
    :param batchsize: int. Number of jobs to process in a batch when using a
        ProcesspoolJobHandler or a ThreadpoolJobHandler.
    :param show_progress: Bool. When False, an empty message is passed to the
        jobhandler instead of the task name.
    :return: treeCl.DistanceMatrix.
    """
    metrics = {
        'euc': tasks.EuclideanTreeDistance,
        'geo': tasks.GeodesicTreeDistance,
        'rf': tasks.RobinsonFouldsTreeDistance,
        'wrf': tasks.WeightedRobinsonFouldsTreeDistance,
        'fasteuc': tasks.EqualLeafSetEuclideanTreeDistance,
        'fastgeo': tasks.EqualLeafSetGeodesicTreeDistance,
        'fastrf': tasks.EqualLeafSetRobinsonFouldsTreeDistance,
        'fastwrf': tasks.EqualLeafSetWeightedRobinsonFouldsTreeDistance,
    }
    optioncheck(metric, list(metrics.keys()))
    task_interface = metrics[metric]()

    # Count the trees on the underlying collection, not on `trees`: for the
    # 'fast' metrics `trees` is a generator, and len() of a generator raises
    # TypeError.
    source_trees = self.trees
    n_trees = len(source_trees)

    if metric.startswith('fast'):
        trees = (PhyloTree(newick, False) for newick in source_trees)
    else:
        trees = source_trees

    args = task_interface.scrape_args(trees, normalise, min_overlap, overlap_fail_value)
    logger.debug('{}'.format(args))
    msg = task_interface.name if show_progress else ''
    array = jobhandler(task_interface.get_task(), args, msg, batchsize,
                       nargs=binom_coeff(n_trees))
    return DistanceMatrix.from_array(squareform(array), self.names)