Example #1
def calculate_linkage(data, metric, method, between='cols'):
    import warnings
    from scipy.spatial.distance import pdist

    # BETWEENS is a module-level tuple of the allowed values, e.g. ('rows', 'cols')
    assert between in BETWEENS

    # to cluster the columns, transpose so they become the observations
    if between == 'cols':
        data = data.T

    # condensed pairwise distance matrix
    D = pdist(data, metric)
    try:
        from fastcluster import linkage
    except ImportError:
        from scipy.cluster.hierarchy import linkage
        warnings.warn("'fastcluster' not installed! This may take a while ... "
                      "Install with 'pip install fastcluster'")

    Z = linkage(D, method)

    try:
        from polo import optimal_leaf_ordering
        optimal_Z = optimal_leaf_ordering(Z, D)
    except ImportError:
        warnings.warn("'polo' not installed! Dendrogram will not be optimal "
                      "leaf ordered. Install with 'pip install polo'")
        optimal_Z = Z

    return optimal_Z
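A minimal usage sketch for the function above, on random data (it assumes a module-level BETWEENS containing at least 'cols' and 'rows'; the DataFrame here is purely illustrative):

import numpy as np
import pandas as pd

# random samples x genes matrix, for illustration only
expr = pd.DataFrame(np.random.rand(20, 50))

# linkage over the 50 columns (genes); optimal leaf ordering is used if 'polo' is installed
Z_cols = calculate_linkage(expr, metric='euclidean', method='ward', between='cols')

# linkage over the 20 rows (samples)
Z_rows = calculate_linkage(expr, metric='euclidean', method='ward', between='rows')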
Example #2
def run_polo(Z, D):
    import time
    from polo import optimal_leaf_ordering

    # time how long the optimal leaf ordering step takes
    start_time = time.time()
    best_Z = optimal_leaf_ordering(Z, D)
    end_time = time.time()
    return end_time - start_time, best_Z
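A small hedged sketch of calling run_polo on random data (the distance and linkage setup is only for illustration):

import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage

# 200 observations with 20 features each
X = np.random.rand(200, 20)
D = pdist(X, 'euclidean')
Z = linkage(D, 'average')

elapsed, best_Z = run_polo(Z, D)
print('optimal leaf ordering took {:.3f} s'.format(elapsed))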
Example #3
def optimal_linkage(data, rows=True, method='ward', metric='euclidean'):
    if not rows:
        data = data.T

    distance = fastcluster.pdist(data, metric=metric)
    linkage = fastcluster.linkage(distance, method=method)
    optimal_linkage = polo.optimal_leaf_ordering(linkage, distance)
    return optimal_linkage
Example #4
def AP_order_incluster(dist, clusters, method='single', metric='correlation'):

    ordered = []

    # iterate over each cluster and optimally order its members
    for cl in return_unique(clusters):
        c_sel = clusters[clusters == cl].index
        D = pdist(dist.loc[c_sel, c_sel])
        Z = linkage(D, method=method)
        optimal_Z = optimal_leaf_ordering(Z, D)
        leaves = sch.dendrogram(optimal_Z, no_plot=True)['leaves']
        ordered += list(c_sel[leaves])

    return clusters[ordered]
Example #5
def hc_order(g, metric='cityblock', method='ward', use_olo=True):
    """
    Basic hierarchical clustering to determine an order of contigs, using optimal leaf ordering (poor time complexity)
    to adjust tips.
    :param g: the graph to order
    :param metric: any
    :param method: ward or complete
    :param use_olo: use optimal leaf ordering
    :return: an ordering
    """

    d = pdist(nx.adjacency_matrix(g).todense(), metric=metric)
    if method == 'ward':
        z = ward(d)
    elif method == 'complete':
        z = complete(d)
    else:
        raise RuntimeError('unsupported method: {}'.format(method))

    if use_olo:
        z = polo.optimal_leaf_ordering(z, d)

    return np.array(dendrogram(z, no_plot=True)['leaves'])
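A hedged usage sketch on a toy graph (it assumes the module-level imports the function relies on: networkx, numpy, polo, and ward/complete/dendrogram from scipy.cluster.hierarchy):

import networkx as nx

# toy contig graph: a ring of six nodes, for illustration only
g = nx.cycle_graph(6)
order = hc_order(g, metric='cityblock', method='ward', use_olo=True)
print(order)  # a permutation of the node indices 0..5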
Example #6
def plot_heatmap(prds=None, hlas=None, ndigit=2):
    import xarray as xr

    if prds is None:
        prds = load_predictions(fmt='xarray')

    if ndigit not in (2, 4):
        raise ValueError('HLA ndigit must be 2 or 4')

    prds = filter_ndigit(prds, ndigit)

    # Calculate distance and linkage by hand for optimal ordering
    from scipy.spatial.distance import pdist
    from scipy.cluster.hierarchy import linkage
    from polo import optimal_leaf_ordering
    method = 'average'
    metric = 'euclidean'

    gs = {}
    for i, label in enumerate(prds.coords['probability'].values):
        if isinstance(prds, xr.DataArray):
            df = pd.DataFrame(
                    data=prds[:, :, i].values,
                    index=prds.indexes['hla'],
                    columns=prds.indexes['pid']
                    )

        Y_sample = pdist(df.values.T, metric=metric)
        Z_sample = linkage(Y_sample, method=method)
        Z_optimal_sample = optimal_leaf_ordering(Z_sample, Y_sample)
        Y_hla = pdist(df.values, metric=metric)
        Z_hla = linkage(Y_hla, method=method)
        Z_optimal_hla = optimal_leaf_ordering(Z_hla, Y_hla)

        pnames = sorted(
                set([tmp.split('_')[0] for tmp in df.columns]),
                key=lambda x: int(x[1:])
                )
        cols = sns.color_palette("husl", len(pnames))

        pnames_all = [tmp.split('_')[0] for tmp in df.columns]
        cols_all = [cols[pnames.index(pn)] for pn in pnames_all]

        g = sns.clustermap(
                df,
                row_linkage=Z_optimal_hla,
                col_linkage=Z_optimal_sample,
                col_colors=cols_all,
                vmin=0,
                vmax=1,
                )

        ax = g.ax_heatmap

        # There is a bug in the heatmap algorithm??
        ax.set_yticks(0.5 + np.arange(df.shape[0]))
        ax.set_xticks(0.5 + np.arange(df.shape[1]))
        ax.set_yticklabels(df.index[g.dendrogram_row.reordered_ind])
        ax.set_xticklabels(df.columns[g.dendrogram_col.reordered_ind])

        plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)
        plt.setp(ax.yaxis.get_majorticklabels(), rotation=0)
        ax.set_xlabel('patient-time')
        g.ax_col_colors.yaxis.set_label_position('right')
        g.ax_col_colors.set_ylabel('Patient', rotation=0, ha='left')

        g.fig.suptitle(label.capitalize())
        plt.subplots_adjust(left=0.02, top=0.95)

        # Add rectangles on the sample HLAs
        if label == 'posterior':
            from matplotlib.patches import Rectangle

            if hlas is None:
                hlas = load_HLA('data/HLA_types.csv')

            xlabels = [tk.get_text() for tk in ax.get_xticklabels()]
            ylabels = [tk.get_text() for tk in ax.get_yticklabels()]
            cols = {'A': 'red', 'B': 'green', 'C': 'blue'}

            for pname, tmp in hlas.items():
                for abc, tmp1 in tmp.items():
                    for s in tmp1:
                        sdig = abc+s[:2]
                        if ndigit == 4:
                            sdig += s[3:5]

                        # skip HLA alleles that do not appear among the heatmap rows
                        if sdig not in ylabels:
                            continue
                        j = ylabels.index(sdig)

                        for i, samplename in enumerate(xlabels):
                            sample_pname = samplename.split('_')[0]
                            if pname != sample_pname:
                                continue

                            ax.add_patch(Rectangle(
                                (i, j), 1, 1,
                                facecolor='none', edgecolor=cols[abc], lw=1,
                                ))

        gs[label] = g

    return {'gs': gs, 'predictions': prds}
Example #7
    def hierarchical(self,
                     axis,
                     phenotypes=(),
                     metric='correlation',
                     method='average',
                     log_features=True,
                     optimal_ordering=False):
        '''Hierarchical clustering.

        Args:
            axis (string): It must be 'samples' or 'features'. \
                    The Dataset.counts matrix is used and \
                    either samples or features are clustered.
            phenotypes (iterable of strings): Phenotypes to add to the \
                    features for joint clustering.
            metric (string): Metric to calculate the distance matrix. Should \
                    be a string accepted by scipy.spatial.distance.pdist.
            method (string): Clustering method. Must be a string accepted by \
                    scipy.cluster.hierarchy.linkage.
            log_features (bool): Whether to add pseudocounts and take a log \
                    of the feature counts before calculating distances.
            optimal_ordering (bool): Whether to resort the linkage so that \
                    nearest neighbours have shortest distance. This may take \
                    longer than the clustering itself.
        Returns:
            dict with the linkage, distance matrix, and ordering.
        '''
        from scipy.spatial.distance import pdist
        from scipy.cluster.hierarchy import linkage, leaves_list

        if optimal_ordering:
            try:
                from polo import optimal_leaf_ordering
            except ImportError:
                raise ImportError(
                    'The package "polo" is needed for optimal leaf ordering')

        data = self.dataset.counts

        if log_features:
            data = np.log10(self.dataset.counts.pseudocount + data)

        if phenotypes is not None:
            data = data.copy()
            for pheno in phenotypes:
                data.loc[pheno] = self.dataset.samplesheet.loc[:, pheno]

        if axis == 'samples':
            data = data.T
        elif axis == 'features':
            pass
        else:
            raise ValueError('axis must be "samples" or "features"')

        Y = pdist(data.values, metric=metric)

        # Some metrics (e.g. correlation) give nan whenever the matrix has no
        # variation; default this to zero distance (e.g. two features that
        # are both total dropouts).
        Y = np.nan_to_num(Y)

        Z = linkage(Y, method=method)

        if optimal_ordering:
            Z = optimal_leaf_ordering(Z, Y)

        ids = data.index[leaves_list(Z)]

        return {
            'distance': Y,
            'linkage': Z,
            'leaves': ids,
        }
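The method needs its surrounding class, so here is the same pipeline as a standalone, hedged sketch on random data (shapes and names are illustrative only):

import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, leaves_list
from polo import optimal_leaf_ordering

# features x samples counts, with a pseudocount and log as in the method above
counts = np.log10(0.1 + np.random.rand(100, 30))
data = counts.T  # axis='samples': cluster the columns
Y = np.nan_to_num(pdist(data, metric='correlation'))
Z = optimal_leaf_ordering(linkage(Y, method='average'), Y)
print(leaves_list(Z))  # sample indices in dendrogram order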
Example #8
    def aggregate(self,
                  ds: loompy.LoomConnection,
                  out_file: str,
                  agg_spec: Dict[str, str] = None) -> None:
        if agg_spec is None:
            agg_spec = {
                "Age": "tally",
                "Clusters": "first",
                "Class": "mode",
                "_Total": "mean",
                "Sex": "tally",
                "Tissue": "tally",
                "SampleID": "tally",
                "TissuePool": "first",
                "Outliers": "mean"
            }
        cells = ds.col_attrs["Clusters"] >= 0
        labels = ds.col_attrs["Clusters"][cells]
        n_labels = len(set(labels))

        logging.info("Aggregating clusters by mean")
        cg.aggregate_loom(ds, out_file, None, "Clusters", "mean", agg_spec)
        with loompy.connect(out_file) as dsout:
            logging.info("Trinarizing")
            if isinstance(self.f, (list, tuple)):
                for ix, f in enumerate(self.f):
                    trinaries = cg.Trinarizer(f=f).fit(ds)
                    if ix == 0:
                        dsout.layers["trinaries"] = trinaries
                    else:
                        dsout.layers[f"trinaries_{f}"] = trinaries
            else:
                trinaries = cg.Trinarizer(f=self.f).fit(ds)
                dsout.layers["trinaries"] = trinaries

            logging.info("Computing cluster gene enrichment scores")
            (markers, enrichment,
             qvals) = cg.MarkerSelection(self.n_markers).fit(ds)
            dsout.layers["enrichment"] = enrichment
            dsout.layers["enrichment_q"] = qvals

            dsout.ca.NCells = np.bincount(labels, minlength=n_labels)

            # Renumber the clusters
            logging.info(
                "Renumbering clusters by similarity, and permuting columns")
            if "_Selected" in ds.ra:
                genes = (ds.ra._Selected == 1)
            else:
                logging.info("Normalization")
                normalizer = cg.Normalizer(False)
                normalizer.fit(ds)
                logging.info("Selecting up to 1000 genes")
                genes = cg.FeatureSelection(1000).fit(ds,
                                                      mu=normalizer.mu,
                                                      sd=normalizer.sd)

            data = np.log(dsout[:, :] + 1)[genes, :].T
            D = pdist(data, 'euclidean')
            Z = hc.linkage(D, 'ward')
            optimal_Z = optimal_leaf_ordering(Z, D)
            ordering = hc.leaves_list(optimal_Z)

            # Permute the aggregated file, and renumber
            dsout.permute(ordering, axis=1)
            dsout.ca.Clusters = np.arange(n_labels)

            # Renumber the original file, and permute
            d = dict(zip(ordering, np.arange(n_labels)))
            new_clusters = np.array(
                [d[x] if x in d else -1 for x in ds.ca.Clusters])
            ds.ca.Clusters = new_clusters
            ds.permute(np.argsort(ds.col_attrs["Clusters"]), axis=1)

            # Reorder the genes, markers first, ordered by enrichment in clusters
            logging.info("Permuting rows")
            mask = np.zeros(ds.shape[0], dtype=bool)
            mask[markers] = True
            # fetch enrichment from the aggregated file, so we get it already permuted on the column axis
            gene_order = np.zeros(ds.shape[0], dtype='int')
            gene_order[mask] = np.argmax(dsout.layer["enrichment"][mask, :],
                                         axis=1)
            gene_order[~mask] = np.argmax(dsout.layer["enrichment"][~mask, :],
                                          axis=1) + dsout.shape[1]
            gene_order = np.argsort(gene_order)
            ds.permute(gene_order, axis=0)
            dsout.permute(gene_order, axis=0)

            data = trinaries[:, ordering][gene_order, :][:self.n_markers *
                                                         n_labels, :].T
            cluster_scores = []
            for ix in range(n_labels):
                cluster_scores.append(data[ix, ix * 10:(ix + 1) * 10].sum())
            dsout.ca.ClusterScore = np.array(cluster_scores)
Example #9
def run_hclust(outname, meds, bins, step_size, tick_spc, use_polo=True, save_plot=False):
    """
    Cluster and plot the binned data matrix. This calls optimal leaf ordering
    algorithm (polo) by default, which has significant time-complexity.

    :param outname: pdf output file
    :param meds: median values
    :param bins: bins for medians
    :param step_size: size of a step
    :param tick_spc: tick spacing in "# bins"
    :param olo: reorder tips of tree with polo
    :param savePlot: save plot to file, otherwise plot to screen
    """

    # just for indexing
    names = meds.columns

    # normalise rows by their median value
    D = meds.to_numpy().T
    D = (D.T/np.median(D, 1)).T
    # center rows on their medians
    D = (D.T - np.median(D, 1)).T

    # clustering options. We will use correlation as measure of distance
    # and complete linkage clustering
    metric = 'correlation'
    method = 'complete'

    # calculate
    Y = linkage(D, method=method, metric=metric)

    # additionally, find optimal leaf ordering
    if use_polo:
        import polo
        print('\tcalculating optimal leaf ordering...')
        Y = polo.optimal_leaf_ordering(Y, pdist(D, metric=metric))

    # now we do some plotting
    fig = plt.figure(figsize=(12, 0.25*len(meds.columns)), dpi=150)
    gs = gridspec.GridSpec(2, 2, width_ratios=[5, 2], height_ratios=[0.6, 10])
    gs.update(wspace=0.08)

    axmatrix = plt.subplot(gs[2])
    axmatrix.set_xlabel('genomic coord (kbp)')
    axcolor = plt.subplot(gs[0])
    axcolor.set_title('median centered relative abundance ')
    axdend = plt.subplot(gs[3])
    axdend.set_xlabel('dist')

    # calculate and plot the dendrogram
    Z = dendrogram(Y, ax=axdend, orientation='right', no_labels=True, color_threshold=0 )

    # the tips (leaves) of the tree become the order for rows
    idx = Z['leaves']

    # reorder rows
    D = D[idx, :]

    # the most negative value in the matrix sets symmetric upper and lower
    # bounds for the heatmap colour range, which keeps 0 at the centre
    vmin = D.min() * 1.05
    vmax = -vmin

    # plot the matrix, 5% extra bounds above and below on colour range
    im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap='RdBu_r',
                          norm=colors.Normalize(vmin=vmin, vmax=vmax))

    axmatrix.set_xticks([])
    axmatrix.set_yticks([])

    # try and get some useful axis labels
    #xticks = np.linspace(0, max(bins) - max(bins) % step_size, 5)
    #print xticks

    print('\tticks will be every {}x{} = {} bp'.format(step_size, tick_spc, tick_spc * step_size))
    xticks = np.arange(0, len(bins), tick_spc) # every 100 bins
    axmatrix.set_xticks(xticks)
    axmatrix.set_xticklabels(xticks * step_size/1000)
    axmatrix.set_yticks(range(D.shape[0]))
    axmatrix.set_yticklabels(np.array(names)[idx], minor=False, )
    axmatrix.xaxis.set_label_position('bottom')
    axmatrix.xaxis.tick_bottom()

    # Plot colorbar.
    fig.colorbar(im, cax=axcolor, orientation='horizontal')

    if save_plot:
        plt.savefig('{}_hclust.pdf'.format(outname), bbox_inches='tight')
        pd.DataFrame(D.T, columns=names).to_csv('{}_dat.csv'.format(outname))
    else:
        plt.show()

    plt.close()
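A hedged usage sketch with synthetic input (it assumes the module-level imports the function relies on: numpy, pandas, matplotlib, scipy, polo; the data here is random and only for illustration):

import numpy as np
import pandas as pd

# 200 genomic bins of 5 kbp, 8 samples with random coverage medians
step_size = 5000
bins = np.arange(200) * step_size
meds = pd.DataFrame(np.random.rand(200, 8) + 0.5,
                    columns=['s{}'.format(i) for i in range(8)])

# writes demo_hclust.pdf and demo_dat.csv
run_hclust('demo', meds, bins, step_size, tick_spc=50, use_polo=True, save_plot=True)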