예제 #1
0
def make_modules(dist, min_dist, obs_ids):
    """Group observations into modules using a complete-linkage tree.

    ``dist`` is clustered with ``complete``; the resulting linkage matrix is
    turned into a tree whose tips are named from ``obs_ids``.  The tree is
    then walked in level order, and an internal node becomes a module when
    no pair of its tips is further apart than ``min_dist``.

    Parameters
    ----------
    dist
        Distances in the form accepted by both ``complete`` and
        ``squareform``.
    min_dist
        Largest pairwise distance tolerated inside a single module.
    obs_ids
        Tip names for the tree; also used as row/column labels of the
        square distance table.

    Returns
    -------
    list of frozenset
        The modules (sets of tip names), largest first.

    Raises
    ------
    ValueError
        If the traversal ends without every tip having been accounted for.
    """
    linkage_matrix = complete(dist)
    tree = TreeNode.from_linkage_matrix(linkage_matrix, obs_ids)
    # Number of tips in the tree; reaching it means every tip is handled.
    n_tips = sum(1 for node in tree.postorder() if node.is_tip())
    found = set()
    assigned = set()
    square = pd.DataFrame(squareform(dist), index=obs_ids, columns=obs_ids)
    for node in tree.levelorder():
        if node.is_tip():
            # A tip reached on its own is marked as handled but is not
            # recorded as a module.
            assigned.add(node.name)
        else:
            members = frozenset(
                tip.name for tip in node.postorder() if tip.is_tip())
            # Everything below this node has already been dealt with.
            if members <= assigned:
                continue
            # Any pair further apart than the threshold disqualifies the
            # node; its descendants are examined later in the traversal.
            if any(square.loc[first, second] > min_dist
                   for first, second in combinations(members, 2)):
                continue
            found.add(members)
            assigned.update(members)
        if len(assigned) == n_tips:
            return sorted(found, key=len, reverse=True)
    raise ValueError("Well, how did I get here?")
예제 #2
0
def main_calc_tree_distance(lang_set_mat, dist_metric="rfd"):
    """Compare a predicted tree against ``GT_INDO_EUROPEAN_TREE``.

    A linkage matrix is obtained from ``lang_set_mat`` via
    ``get_linkage_matrix`` and converted to a tree with tips named from
    ``INDO_EURO_LANG_NAMES``.  The tree is serialised, every branch length
    in the serialised text is replaced by 1, and the result is parsed back
    so that only the topology is compared.

    Parameters
    ----------
    lang_set_mat
        Input passed unchanged to ``get_linkage_matrix``.
    dist_metric : str
        ``"rfd"`` uses ``compare_rfd``; any other value uses the sum of
        squared differences between the tip-to-tip distance matrices of
        the ground-truth tree and the predicted tree.

    Returns
    -------
    tuple
        ``(tree_dist, pred_tree)`` where ``pred_tree`` is the
        unit-branch-length tree.
    """
    pred_linkage = get_linkage_matrix(lang_set_mat)
    pred_tree = TreeNode.from_linkage_matrix(pred_linkage,
                                             INDO_EURO_LANG_NAMES)

    newick_buffer = StringIO()
    pred_tree.write(newick_buffer)
    pred_tree_string = newick_buffer.getvalue()

    # Replace distances with 1.  The pattern is anchored on the ':' that
    # introduces a branch length so digits inside tip names are left alone,
    # and it also covers integer, negative and scientific-notation lengths
    # (e.g. "1.5e-05"), which a bare "\d+\.\d+" would only partly rewrite.
    unweighted_tree_string = re.sub(
        r":-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?", ":1", pred_tree_string)
    pred_tree = TreeNode.read(StringIO(unweighted_tree_string))

    if dist_metric == "rfd":
        tree_dist = pred_tree.compare_rfd(GT_INDO_EUROPEAN_TREE)
    else:
        gt_distances_struct = GT_INDO_EUROPEAN_TREE.tip_tip_distances()
        gt_distances = gt_distances_struct.data
        gt_ids = gt_distances_struct.ids

        # Request the predicted distances for the ground-truth ids so both
        # matrices cover the same tips before subtracting.
        pred_distances = pred_tree.tip_tip_distances(
            endpoints=list(gt_ids)).data
        tree_dist = np.sum((gt_distances - pred_distances)**2)

    return tree_dist, pred_tree
예제 #3
0
def write_tree():
    """Cluster the matrix in ``distance_matrix`` and write a Newick tree.

    Reads the tab-separated file ``distance_matrix`` (first column used as
    the index), squares every value, runs ``weighted`` linkage on the
    condensed form of the squared matrix, and writes the resulting tree,
    with single quotes stripped, to ``bsr_matrix.tree``.

    Raises
    ------
    ValueError
        From ``squareform`` if the squared matrix is not a valid square
        distance matrix (symmetric with a zero diagonal).
    """
    # Local import, as no module-level squareform is visible from this block.
    from scipy.spatial.distance import squareform

    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    # ``DataFrame.as_matrix`` no longer exists in current pandas; ``.values``
    # is available in every version.
    squared = np.square(dmx.values)
    # SciPy's linkage functions interpret a 2-D input as observation
    # vectors, not as distances, so the square matrix must be condensed
    # to 1-D before clustering.
    condensed = squareform(squared)
    hclust = weighted(condensed)
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    # The context manager closes the file even if the write fails.
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
예제 #4
0
def get_clusters(x_original, axis='row'):
    """Return the row (or column) order given by UPGMA clustering.

    Euclidean distances between the rows of the data are computed with
    ``pw_distances`` and clustered with average linkage.

    Parameters
    ----------
    x_original
        2-D data; it is copied, so the caller's object is not modified.
    axis : str
        ``'column'`` clusters the columns (the copy is transposed first);
        any other value clusters the rows.

    Returns
    -------
    list of int
        Positional indices in the order the tips appear in the tree.
    """
    x = x_original.copy()
    if axis == 'column':
        x = x.T
    nr = x.shape[0]
    # Build the ids as a real list: on Python 3 ``map`` returns a lazy
    # iterator with no ``len()``, which callee-side length validation
    # (presumably done by pw_distances - TODO confirm) cannot handle.
    ids = [str(i) for i in range(nr)]
    row_dissims = pw_distances(x, ids=ids, metric='euclidean')
    # do upgma - rows
    # Average in SciPy's cluster.hierarchy.linkage is UPGMA
    linkage_matrix = linkage(row_dissims.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(linkage_matrix, row_dissims.ids)
    return [int(tip.name) for tip in tree.tips()]
예제 #5
0
def get_clusters(x_original, axis='row'):
    """Order rows (or columns) by UPGMA clustering on euclidean distances.

    Parameters
    ----------
    x_original
        2-D data.  A copy is taken before any transposition, so the
        argument itself is left untouched.
    axis : str
        Pass ``'column'`` to cluster columns; every other value clusters
        rows.

    Returns
    -------
    list of int
        The positional indices of the clustered items, in tree tip order.
    """
    x = x_original.copy()
    if axis == 'column':
        x = x.T
    nr = x.shape[0]
    # A list is used instead of ``map(str, range(nr))`` because a Python 3
    # map object is a one-shot iterator without a length; the distance
    # helper presumably checks len(ids) - TODO confirm against its docs.
    sample_ids = [str(position) for position in range(nr)]
    row_dissims = pw_distances(x, ids=sample_ids, metric='euclidean')
    # do upgma - rows
    # Average in SciPy's cluster.hierarchy.linkage is UPGMA
    linkage_matrix = linkage(row_dissims.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(linkage_matrix, row_dissims.ids)
    return [int(tip.name) for tip in tree.tips()]
예제 #6
0
def get_clusters(x_original, axis='row'):
    """Return the row (or column) order given by UPGMA clustering.

    The euclidean metric function is looked up through
    ``get_nonphylogenetic_metric``, applied to the data, wrapped in a
    ``DistanceMatrix`` and clustered with average linkage.

    Parameters
    ----------
    x_original
        2-D data; it is copied, so the caller's object is not modified.
    axis : str
        ``'column'`` clusters the columns (the copy is transposed first);
        any other value clusters the rows.  Defaults to ``'row'``.

    Returns
    -------
    list of int
        Positional indices in the order the tips appear in the tree.
    """
    x = x_original.copy()
    if axis == 'column':
        x = x.T
    nr = x.shape[0]
    metric_f = get_nonphylogenetic_metric('euclidean')
    # Concrete list rather than a lazy ``map`` object, so the ids can be
    # measured and iterated more than once on Python 3.
    ids = [str(i) for i in range(nr)]
    row_dissims = DistanceMatrix(metric_f(x), ids)
    # do upgma - rows
    # Average in SciPy's cluster.hierarchy.linkage is UPGMA
    linkage_matrix = linkage(row_dissims.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(linkage_matrix, row_dissims.ids)
    row_order = [int(tip.name) for tip in tree.tips()]
    return row_order
def write_tree(cluster_method):
    """Cluster the matrix in ``distance_matrix`` and write a Newick tree.

    Reads the tab-separated file ``distance_matrix`` (first column used as
    the index), squares every value, condenses the square matrix and runs
    the requested linkage.  The tree, with single quotes stripped, is
    written to ``bsr_matrix.tree``.

    Parameters
    ----------
    cluster_method : str
        ``"average"`` or ``"weighted"``.  Any other value prints a message
        and terminates the program through ``sys.exit()``.
    """
    import scipy.spatial.distance as ssd
    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    squared = np.square(dmx.values)
    # The linkage functions are given the condensed 1-D form of the
    # square matrix.
    condensed = ssd.squareform(squared)
    if cluster_method == "average":
        hclust = average(condensed)
    elif cluster_method == "weighted":
        hclust = weighted(condensed)
    else:
        print("invalid cluster method chosen")
        sys.exit()
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    # The context manager closes the file even if the write fails.
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
예제 #8
0
    def get_tree(self):
        """Build a tree from ``self.dmx``, midpoint-root it and write it.

        The distance table in ``self.dmx`` is clustered with ``weighted``
        linkage, converted to Newick text (single quotes stripped) and
        loaded into ``self.tree``.  Midpoint rooting is attempted; a
        ``TreeError`` during rooting is logged and the unrooted tree is
        kept.  The tree is finally written to ``self.nw_path``.
        """
        from ete3.coretype.tree import TreeError
        import numpy as np
        from skbio.tree import TreeNode
        from scipy.cluster.hierarchy import weighted

        ids = self.dmx.index.tolist()
        # ``DataFrame.as_matrix`` no longer exists in current pandas;
        # ``.values`` is available in every version.
        square = self.dmx.values
        # SciPy linkage functions treat a 2-D input as observation vectors,
        # so hand over the strict upper triangle as a 1-D condensed
        # distance vector instead of the (triangular) square matrix.
        condensed = square[np.triu_indices_from(square, k=1)]
        hclust = weighted(condensed)
        t = TreeNode.from_linkage_matrix(hclust, ids)
        nw = str(t).replace("'", "")
        self.tree = Tree(nw)
        try:
            # midpoint root tree
            self.tree.set_outgroup(self.tree.get_midpoint_outgroup())
        except TreeError:
            self.log.error("Unable to midpoint root tree")
        self.tree.write(outfile=self.nw_path)
예제 #9
0
def make_modules_naive(correls, min_r=None, max_p=None, prefix="module"):
    """Group items into named modules from a table of correlations.

    The correlations are converted to distances with ``cor_to_dist``,
    clustered with complete linkage, and the resulting tree is walked in
    level order.  An internal node becomes a module when no pair of its
    tips is further apart than the distance equivalent of ``min_r``.

    Parameters
    ----------
    correls
        Correlations, passed unchanged to ``correls_to_cor``.
    min_r
        Minimum correlation; converted to the distance threshold.
    max_p
        Not supported; supplying it without ``min_r`` raises.
    prefix : str
        Prefix for the generated module names.

    Returns
    -------
    dict
        Maps ``"<prefix>_<rank>"`` to a frozenset of tip names, with rank 0
        being the largest module.

    Raises
    ------
    NotImplementedError
        If only ``max_p`` is given.
    ValueError
        If neither threshold is given, or if the traversal ends without
        every tip having been accounted for.
    """
    if min_r is None:
        if max_p is not None:
            # TODO: This
            raise NotImplementedError(
                'Making modules based on a p-value is not currently supported')
        raise ValueError("this is prevented above")

    # Turn the correlations into a distance threshold and distances.
    min_dist = cor_to_dist(min_r)
    cor, labels = correls_to_cor(correls)
    dist = cor_to_dist(cor)

    linkage_matrix = complete(dist)
    tree = TreeNode.from_linkage_matrix(linkage_matrix, labels)
    # Number of tips in the tree; reaching it means every tip is handled.
    n_tips = tree.count(tips=True)
    found = set()
    assigned = set()
    square = pd.DataFrame(squareform(dist), index=labels, columns=labels)
    for node in tree.levelorder():
        if node.is_tip():
            # A tip reached on its own is marked as handled but is not
            # recorded as a module.
            assigned.add(node.name)
        else:
            members = frozenset(tip.name for tip in node.tips())
            # Everything below this node has already been dealt with.
            if members <= assigned:
                continue
            # Any pair further apart than the threshold disqualifies the
            # node; its descendants are examined later in the traversal.
            if any(square.loc[first, second] > min_dist
                   for first, second in combinations(members, 2)):
                continue
            found.add(members)
            assigned.update(members)
        if len(assigned) == n_tips:
            ranked = sorted(found, key=len, reverse=True)
            return {
                '%s_%s' % (prefix, rank): otus
                for rank, otus in enumerate(ranked)
            }
    raise ValueError("Well, how did I get here?")
예제 #10
0
def write_tree(cluster_method):
    """Write a Newick tree clustered from the file ``distance_matrix``.

    The tab-separated file ``distance_matrix`` is loaded with its first
    column as the index.  Its values are squared, converted to condensed
    form and clustered with the chosen linkage; the tree text, with single
    quotes removed, goes to ``bsr_matrix.tree``.

    Parameters
    ----------
    cluster_method : str
        Either ``"average"`` or ``"weighted"``.  For anything else a
        message is printed and ``sys.exit()`` ends the program.
    """
    import scipy.spatial.distance as ssd
    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    squared = np.square(dmx.values)
    # Linkage is run on the condensed 1-D form of the square matrix.
    dist_array = ssd.squareform(squared)
    if cluster_method == "average":
        hclust = average(dist_array)
    elif cluster_method == "weighted":
        hclust = weighted(dist_array)
    else:
        print("invalid cluster method chosen")
        sys.exit()
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    # ``with`` guarantees the handle is released if writing raises.
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
예제 #11
0
def single_file_upgma(input_file, output_file):
    """Build a UPGMA tree from a distance matrix file and write it out.

    Parameters
    ----------
    input_file
        Source handed to ``DistanceMatrix.read``.
    output_file
        Path of the file that receives the Newick text (with distances).
        It is opened for writing before the tree is serialised.

    Raises
    ------
    RuntimeError
        If serialising fails with ``AttributeError`` because no tree was
        produced (``tree`` is ``None``).
    AttributeError
        Re-raised unchanged when serialising fails for any other reason.
    """
    # read in dist matrix
    dist_mat = DistanceMatrix.read(input_file)

    # SciPy uses average as UPGMA:
    # http://docs.scipy.org/doc/scipy/reference/generated/
    #    scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    linkage_matrix = linkage(dist_mat.condensed_form(), method='average')

    tree = TreeNode.from_linkage_matrix(linkage_matrix, dist_mat.ids)

    # write output; the context manager closes the file on every path,
    # including when an exception propagates.
    with open(output_file, 'w') as f:
        try:
            f.write(tree.to_newick(with_distances=True))
        except AttributeError:
            # Check the tree itself: the handler used to test an undefined
            # name, which turned every AttributeError into a NameError.
            if tree is None:
                raise RuntimeError("""input file %s did not make a UPGMA tree.
 Ensure it has more than one sample present""" % (str(input_file),))
            raise
예제 #12
0
def single_file_upgma(input_file, output_file):
    """Cluster one distance matrix file with UPGMA and save the tree.

    Parameters
    ----------
    input_file
        Source handed to ``DistanceMatrix.read``.
    output_file
        Destination path for the Newick text (with distances).  The file
        is opened for writing before the tree is serialised.

    Raises
    ------
    RuntimeError
        When serialisation raises ``AttributeError`` and the tree is
        ``None``, i.e. no tree was produced from the input.
    AttributeError
        Propagated as-is for any other serialisation failure.
    """
    # read in dist matrix
    dist_mat = DistanceMatrix.read(input_file)

    # SciPy uses average as UPGMA:
    # http://docs.scipy.org/doc/scipy/reference/generated/
    #    scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    linkage_matrix = linkage(dist_mat.condensed_form(), method='average')

    tree = TreeNode.from_linkage_matrix(linkage_matrix, dist_mat.ids)

    # write output; ``with`` releases the handle even when writing fails.
    with open(output_file, 'w') as out_handle:
        try:
            out_handle.write(tree.to_newick(with_distances=True))
        except AttributeError:
            # The name tested here must be the tree: the previous undefined
            # name raised NameError and hid the real failure.
            if tree is None:
                raise RuntimeError("""input file %s did not make a UPGMA tree.
 Ensure it has more than one sample present""" % (str(input_file), ))
            raise
예제 #13
0
 def get_tree(self):
     """Build, midpoint-root and write the tree unless already complete.

     Nothing happens unless ``self.tree_complete`` is exactly ``False``.
     Otherwise ``self.dmx`` is clustered with ``weighted`` linkage, tips
     are named ``"<index>.fasta"``, the Newick text (single quotes
     stripped) is loaded into ``self.tree``, midpoint rooting is attempted
     and the tree is written to ``self.nw_path``.  A ``TreeError`` raised
     while rooting is logged with its traceback and the unrooted tree is
     kept.
     """
     # TODO: Use decorator instead of if statement
     if self.tree_complete is not False:
         return

     from ete3.coretype.tree import TreeError
     import numpy as np
     from skbio.tree import TreeNode
     from scipy.cluster.hierarchy import weighted

     ids = ['{}.fasta'.format(i) for i in self.dmx.index.tolist()]
     # ``DataFrame.as_matrix`` no longer exists in current pandas;
     # ``.values`` is available in every version.
     square = self.dmx.values
     # SciPy linkage functions treat a 2-D input as observation vectors,
     # so hand over the strict upper triangle as a 1-D condensed distance
     # vector instead of the (triangular) square matrix.
     condensed = square[np.triu_indices_from(square, k=1)]
     hclust = weighted(condensed)
     t = TreeNode.from_linkage_matrix(hclust, ids)
     nw = str(t).replace("'", "")
     self.tree = Tree(nw)
     # midpoint root tree
     try:
         self.tree.set_outgroup(self.tree.get_midpoint_outgroup())
     except TreeError:
         # A logging-style ``exception`` call needs a message argument;
         # calling it with none would itself raise inside this handler.
         self.log.exception("Unable to midpoint root tree")
     self.tree.write(outfile=self.nw_path)