def make_modules(dist, min_dist, obs_ids):
    """Split observations into modules using a complete-linkage tree.

    The tree built from ``dist`` is walked in level order. A clade becomes a
    module when no pair of its tips is further apart than ``min_dist``; its
    tips are then marked as seen so that nested clades are skipped.

    Parameters
    ----------
    dist
        Distances in the form accepted by both ``complete`` and
        ``squareform``.
    min_dist
        Largest pairwise distance tolerated inside a module.
    obs_ids
        Names attached to the tree tips and used as the row/column labels
        of the square distance table.

    Returns
    -------
    list of frozenset
        Modules (sets of tip names), largest first.

    Raises
    ------
    ValueError
        If the traversal finishes without every tip having been seen.
    """
    # Complete-linkage clustering, then a tree whose tips carry obs_ids.
    linkage_matrix = complete(dist)
    tree = TreeNode.from_linkage_matrix(linkage_matrix, obs_ids)
    # Number of tips; the walk is finished once this many names are seen.
    n_tips = sum(1 for node in tree.postorder() if node.is_tip())
    modules = set()
    seen = set()
    # Square, label-indexed table so pair distances can be looked up by name.
    dist = pd.DataFrame(squareform(dist), index=obs_ids, columns=obs_ids)
    for node in tree.levelorder():
        if node.is_tip():
            seen.add(node.name)
        else:
            members = frozenset(
                tip.name for tip in node.postorder() if tip.is_tip())
            # Every tip already belongs to an accepted module (or is a
            # visited singleton): nothing new under this node.
            if members <= seen:
                continue
            too_far = any(dist.loc[first, second] > min_dist
                          for first, second in combinations(members, 2))
            if too_far:
                continue
            modules.add(members)
            seen |= members
        if len(seen) == n_tips:
            return sorted(modules, key=len, reverse=True)
    raise ValueError("Well, how did I get here?")
def main_calc_tree_distance(lang_set_mat, dist_metric="rfd"):
    """Compare the tree predicted from ``lang_set_mat`` with the reference tree.

    A tree is built from the linkage matrix of ``lang_set_mat``, its branch
    lengths are rewritten to 1 in the serialised text, and the re-parsed tree
    is compared with ``GT_INDO_EUROPEAN_TREE``.

    Parameters
    ----------
    lang_set_mat
        Input handed to ``get_linkage_matrix``.
    dist_metric
        ``"rfd"`` uses ``compare_rfd``; any other value uses the sum of
        squared differences between the two tip-to-tip distance tables.

    Returns
    -------
    tuple
        ``(tree_dist, pred_tree)``, where ``pred_tree`` is the re-parsed tree.
    """
    linkage_matrix = get_linkage_matrix(lang_set_mat)
    weighted_tree = TreeNode.from_linkage_matrix(linkage_matrix,
                                                 INDO_EURO_LANG_NAMES)

    # Serialise the tree so the branch lengths can be edited as text.
    buffer = StringIO()
    weighted_tree.write(buffer)
    newick = buffer.getvalue()

    # Replace distances with 1 (only "digits.digits" tokens are matched).
    newick = re.sub(r"\d+\.\d+", "1", newick)
    pred_tree = TreeNode.read(StringIO(newick))

    if dist_metric == "rfd":
        return pred_tree.compare_rfd(GT_INDO_EUROPEAN_TREE), pred_tree

    reference = GT_INDO_EUROPEAN_TREE.tip_tip_distances()
    reference_data = reference.data
    # Ask for the predicted distances in the reference's tip order so the
    # two tables line up element by element.
    predicted_data = pred_tree.tip_tip_distances(
        endpoints=list(reference.ids)).data
    squared_error = np.sum((reference_data - predicted_data) ** 2)
    return squared_error, pred_tree
def write_tree():
    """Cluster the matrix in ``distance_matrix`` and write ``bsr_matrix.tree``.

    Reads a tab-separated square matrix whose first column holds the ids,
    squares every entry, clusters with SciPy's ``weighted`` linkage and
    writes the tree text (single quotes stripped) to ``bsr_matrix.tree``.

    Raises
    ------
    ValueError
        From ``squareform`` if the squared matrix is not a valid square
        distance matrix (for example, not symmetric).
    """
    import scipy.spatial.distance as ssd

    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    # ``.values`` instead of ``as_matrix()``, which newer pandas releases
    # no longer provide.
    squared = np.square(dmx.values)
    # SciPy's linkage functions interpret a 2-D array as observation
    # vectors, so the square matrix must be condensed before clustering.
    condensed = ssd.squareform(squared)
    hclust = weighted(condensed)
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    # Context manager so the handle is closed even if the write fails.
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
def get_clusters(x_original, axis='row'):
    """Return row (or column) indices in the tip order of a UPGMA tree.

    Euclidean distances between the rows of ``x_original`` (or between its
    columns when ``axis == 'column'``) are clustered with average linkage;
    the integer positions are returned in the order the tree lists its tips.
    """
    data = x_original.copy()
    if axis == 'column':
        data = data.T

    n_rows = data.shape[0]
    # Rows are labelled with their position so the tips map back to indices.
    labels = map(str, range(n_rows))
    dissims = pw_distances(data, ids=labels, metric='euclidean')

    # SciPy's 'average' linkage method is UPGMA.
    upgma = linkage(dissims.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(upgma, dissims.ids)

    order = []
    for tip in tree.tips():
        order.append(int(tip.name))
    return order
def get_clusters(x_original, axis='row'):
    """Order rows (or columns) by their position in a UPGMA tree.

    Works on a copy of ``x_original``, transposed when ``axis == 'column'``.
    Pairwise euclidean distances between rows are clustered with average
    linkage and the row positions are returned in tree-tip order.
    """
    matrix = x_original.copy()
    if axis == 'column':
        matrix = matrix.T

    # Each row is identified by its position, stored as a string id.
    row_ids = map(str, range(matrix.shape[0]))
    row_dissims = pw_distances(matrix, ids=row_ids, metric='euclidean')

    # UPGMA is what SciPy calls the 'average' linkage method.
    condensed = row_dissims.condensed_form()
    tree = TreeNode.from_linkage_matrix(
        linkage(condensed, method='average'), row_dissims.ids)

    tip_order = []
    for tip in tree.tips():
        tip_order.append(int(tip.name))
    return tip_order
def get_clusters(x_original, axis=['row', 'column'][0]):
    """Return row (or column) indices in the tip order of a UPGMA tree.

    The euclidean metric returned by ``get_nonphylogenetic_metric`` is
    applied to a copy of ``x_original`` (transposed when
    ``axis == 'column'``), the result is clustered with average linkage, and
    the integer positions are returned in tree-tip order.
    """
    data = x_original.copy()
    if axis == 'column':
        data = data.T

    n_rows = data.shape[0]
    euclidean = get_nonphylogenetic_metric('euclidean')
    # Ids are the stringified row positions, so tips convert back to ints.
    dissims = DistanceMatrix(euclidean(data), map(str, range(n_rows)))

    # 'average' in SciPy's cluster.hierarchy.linkage is UPGMA.
    upgma = linkage(dissims.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(upgma, dissims.ids)
    return [int(tip.name) for tip in tree.tips()]
def write_tree(cluster_method):
    """Cluster the matrix in ``distance_matrix`` and write ``bsr_matrix.tree``.

    Parameters
    ----------
    cluster_method
        ``"average"`` or ``"weighted"``; selects the SciPy linkage function.

    Notes
    -----
    An unknown ``cluster_method`` prints a message and exits with status 1
    before any file is read. The input is a tab-separated square matrix
    whose first column holds the ids; entries are squared before clustering.
    """
    # Validate up front so a bad argument fails before any I/O happens.
    if cluster_method not in ("average", "weighted"):
        print("invalid cluster method chosen")
        # Non-zero status so callers and shells can detect the failure.
        sys.exit(1)

    import scipy.spatial.distance as ssd

    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    triu = np.square(dmx.values)
    # Condense the square matrix; the linkage functions expect this form.
    distArray = ssd.squareform(triu)
    if cluster_method == "average":
        hclust = average(distArray)
    else:
        hclust = weighted(distArray)
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    # Context manager so the handle is closed even if the write fails.
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
def get_tree(self):
    """Build a tree from ``self.dmx``, midpoint-root it and write it out.

    The square matrix in ``self.dmx`` is clustered with SciPy's ``weighted``
    linkage, converted to tree text (single quotes stripped), loaded into
    ``self.tree`` and written to ``self.nw_path``. If midpoint rooting raises
    ``TreeError`` the error is logged and the unrooted tree is still written.
    """
    from ete3.coretype.tree import TreeError
    import numpy as np
    from skbio.tree import TreeNode
    from scipy.cluster.hierarchy import weighted

    ids = self.dmx.index.tolist()
    # ``.values`` instead of ``as_matrix()``, which newer pandas releases
    # no longer provide.
    matrix = self.dmx.values
    # SciPy's linkage functions interpret a 2-D array as observation
    # vectors, not distances. Pass the strict upper triangle as a condensed
    # 1-D vector so the entries are clustered as pairwise distances.
    condensed = matrix[np.triu_indices_from(matrix, k=1)]
    hclust = weighted(condensed)
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    self.tree = Tree(nw)
    try:
        # midpoint root tree
        self.tree.set_outgroup(self.tree.get_midpoint_outgroup())
    except TreeError:
        self.log.error("Unable to midpoint root tree")
    self.tree.write(outfile=self.nw_path)
def make_modules_naive(correls, min_r=None, max_p=None, prefix="module"):
    """Build named modules from a correlation table with complete linkage.

    Correlations are converted to distances and a complete-linkage tree is
    walked in level order. A clade becomes a module when no pair of its tips
    is further apart than the distance equivalent of ``min_r``.

    Parameters
    ----------
    correls
        Correlation data handed to ``correls_to_cor``.
    min_r
        Correlation threshold; converted with ``cor_to_dist``.
    max_p
        Not supported; supplying it without ``min_r`` raises.
    prefix
        Prefix for the generated module names (``<prefix>_<index>``).

    Returns
    -------
    dict
        Module name -> frozenset of tip names, numbered largest first.

    Raises
    ------
    NotImplementedError
        If only ``max_p`` is given.
    ValueError
        If neither threshold is given, or if the traversal ends without
        every tip having been seen.
    """
    if min_r is None:
        if max_p is not None:
            # TODO: p-value based modules are not implemented.
            raise NotImplementedError(
                'Making modules based on a p-value is not currently supported')
        raise ValueError("this is prevented above")

    # Read in the correlations and turn them into distances.
    min_dist = cor_to_dist(min_r)
    cor, labels = correls_to_cor(correls)
    dist = cor_to_dist(cor)

    # Complete-linkage clustering, then a tree whose tips carry the labels.
    linkage_matrix = complete(dist)
    tree = TreeNode.from_linkage_matrix(linkage_matrix, labels)
    # Number of tips; the walk is finished once this many names are seen.
    n_tips = tree.count(tips=True)
    modules = set()
    seen = set()
    # Square, label-indexed table so pair distances can be looked up by name.
    dist = pd.DataFrame(squareform(dist), index=labels, columns=labels)
    for node in tree.levelorder():
        if node.is_tip():
            seen.add(node.name)
        else:
            members = frozenset(tip.name for tip in node.tips())
            # All tips already accounted for: nothing new under this node.
            if members <= seen:
                continue
            too_far = any(dist.loc[first, second] > min_dist
                          for first, second in combinations(members, 2))
            if too_far:
                continue
            modules.add(members)
            seen |= members
        if len(seen) == n_tips:
            ranked = sorted(modules, key=len, reverse=True)
            named = {}
            for i, otus in enumerate(ranked):
                named['%s_%s' % (prefix, i)] = otus
            return named
    raise ValueError("Well, how did I get here?")
def write_tree(cluster_method):
    """Cluster the matrix in ``distance_matrix`` and write ``bsr_matrix.tree``.

    Parameters
    ----------
    cluster_method
        ``"average"`` or ``"weighted"``; selects the SciPy linkage function.

    Notes
    -----
    An unknown ``cluster_method`` prints a message and exits with status 1
    before any file is read. The input is a tab-separated square matrix
    whose first column holds the ids; entries are squared before clustering.
    """
    # Reject a bad argument before doing any I/O.
    if cluster_method not in ("average", "weighted"):
        print("invalid cluster method chosen")
        # Exit with a failure status rather than the default success status.
        sys.exit(1)

    import scipy.spatial.distance as ssd

    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    triu = np.square(dmx.values)
    # The linkage functions expect the condensed (1-D) form of the matrix.
    distArray = ssd.squareform(triu)
    if cluster_method == "average":
        hclust = average(distArray)
    else:
        hclust = weighted(distArray)
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    # ``with`` guarantees the file is closed even if the write raises.
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
def single_file_upgma(input_file, output_file):
    """Build a UPGMA tree from a distance matrix file and write it as newick.

    Parameters
    ----------
    input_file
        Source passed to ``DistanceMatrix.read``.
    output_file
        Path the newick text (with distances) is written to.

    Raises
    ------
    RuntimeError
        If serialising fails with ``AttributeError`` because no tree object
        was produced.
    AttributeError
        Re-raised unchanged in every other case.
    """
    # read in dist matrix
    dist_mat = DistanceMatrix.read(input_file)

    # SciPy uses average as UPGMA:
    # http://docs.scipy.org/doc/scipy/reference/generated/
    #    scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    linkage_matrix = linkage(dist_mat.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(linkage_matrix, dist_mat.ids)

    # Serialise before opening the output so a failure does not leave an
    # empty file behind.
    # NOTE(review): confirm ``to_newick`` exists in the installed scikit-bio.
    try:
        newick = tree.to_newick(with_distances=True)
    except AttributeError:
        # The handler previously tested an undefined name, which turned any
        # AttributeError into a NameError; ``tree`` is the object in question.
        if tree is None:
            raise RuntimeError(
                "input file %s did not make a UPGMA tree. "
                "Ensure it has more than one sample present"
                % (str(input_file),))
        raise

    # write output
    with open(output_file, 'w') as f:
        f.write(newick)
def single_file_upgma(input_file, output_file):
    """Cluster a distance matrix file with UPGMA and save the newick tree.

    Parameters
    ----------
    input_file
        Source passed to ``DistanceMatrix.read``.
    output_file
        Path the newick text (with distances) is written to.

    Raises
    ------
    RuntimeError
        If serialising fails with ``AttributeError`` because no tree object
        was produced.
    AttributeError
        Re-raised unchanged in every other case.
    """
    # read in dist matrix
    dist_mat = DistanceMatrix.read(input_file)

    # SciPy uses average as UPGMA:
    # http://docs.scipy.org/doc/scipy/reference/generated/
    #    scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    condensed = dist_mat.condensed_form()
    linkage_matrix = linkage(condensed, method='average')
    tree = TreeNode.from_linkage_matrix(linkage_matrix, dist_mat.ids)

    # Produce the text first; the output file is only created on success.
    # NOTE(review): confirm ``to_newick`` exists in the installed scikit-bio.
    try:
        newick = tree.to_newick(with_distances=True)
    except AttributeError:
        # Check ``tree`` itself: the name tested here before was never
        # defined, so the handler raised NameError instead.
        if tree is None:
            raise RuntimeError(
                "input file %s did not make a UPGMA tree. "
                "Ensure it has more than one sample present"
                % (str(input_file),))
        raise

    # write output
    with open(output_file, 'w') as f:
        f.write(newick)
def get_tree(self):
    """Build, midpoint-root and write the tree unless it already exists.

    Does nothing unless ``self.tree_complete is False``. Otherwise the square
    matrix in ``self.dmx`` is clustered with SciPy's ``weighted`` linkage,
    tips are named ``<index>.fasta``, the tree text (single quotes stripped)
    is loaded into ``self.tree`` and written to ``self.nw_path``. If midpoint
    rooting raises ``TreeError`` it is logged and the unrooted tree is still
    written.
    """
    # TODO: use a decorator instead of this check.
    if self.tree_complete is not False:
        return

    from ete3.coretype.tree import TreeError
    import numpy as np
    from skbio.tree import TreeNode
    from scipy.cluster.hierarchy import weighted

    ids = ['{}.fasta'.format(i) for i in self.dmx.index.tolist()]
    # ``.values`` instead of ``as_matrix()``, which newer pandas releases
    # no longer provide.
    matrix = self.dmx.values
    # SciPy's linkage functions interpret a 2-D array as observation
    # vectors, not distances. Pass the strict upper triangle as a condensed
    # 1-D vector so the entries are clustered as pairwise distances.
    condensed = matrix[np.triu_indices_from(matrix, k=1)]
    hclust = weighted(condensed)
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    self.tree = Tree(nw)
    # midpoint root tree
    try:
        self.tree.set_outgroup(self.tree.get_midpoint_outgroup())
    except TreeError:
        # An explicit message keeps this call valid for loggers whose
        # ``exception`` requires one (e.g. the stdlib ``logging.Logger``).
        self.log.exception("Unable to midpoint root tree")
    self.tree.write(outfile=self.nw_path)