def add_stars(ax, P_mat, tri=True):
    '''
    Use the p matrix to add stars to the significant cells.

    If tri is True then only put stars in the lower triangle, otherwise
    put them in all the cells.
    '''
    # Import what you need
    import numpy as np
    # Get the indices you need
    if tri:
        i_inds, j_inds = np.triu_indices_from(P_mat, k=0)
    else:
        i_inds, j_inds = np.triu_indices_from(P_mat, k=P_mat.shape[0]*-1)
    
    # Loop through all the measures and fill the arrays
    for i, j in zip(i_inds, j_inds):

        # Figure out the text you're going to put on the plot
        star = ''
        if 0.01 <= P_mat[i,j] < 0.05:
            star = '*'
        elif 0.001 <= P_mat[i,j] < 0.01:
            star = '**'
        elif P_mat[i,j] < 0.001:
            star = '***'

        text = ax.text(i, j, star,
            horizontalalignment='center',
            verticalalignment='center',
            color = 'k')

    return ax
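A minimal usage sketch (not part of the original snippet): it assumes numpy and matplotlib are available, draws a blank grid, and calls add_stars on a small symmetric matrix of p values.

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
P = rng.uniform(0, 0.06, size=(5, 5))
P = np.minimum(P, P.T)                                      # symmetric p-value matrix

fig, ax = plt.subplots()
ax.imshow(np.zeros_like(P), cmap='Greys', vmin=0, vmax=1)   # blank background grid
add_stars(ax, P, tri=True)
plt.show()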
def _expected_kid_and_std(real_imgs, gen_imgs, max_block_size=1024):
  n_r, dim = real_imgs.shape
  n_g = gen_imgs.shape[0]

  n_blocks = int(np.ceil(max(n_r, n_g) / max_block_size))

  sizes_r = np.full(n_blocks, n_r // n_blocks)
  to_patch = n_r - n_blocks * (n_r // n_blocks)
  if to_patch > 0:
    sizes_r[-to_patch:] += 1
  inds_r = np.r_[0, np.cumsum(sizes_r)]
  assert inds_r[-1] == n_r

  sizes_g = np.full(n_blocks, n_g // n_blocks)
  to_patch = n_g - n_blocks * (n_g // n_blocks)
  if to_patch > 0:
    sizes_g[-to_patch:] += 1
  inds_g = np.r_[0, np.cumsum(sizes_g)]
  assert inds_g[-1] == n_g

  ests = []
  for i in range(n_blocks):
    r = real_imgs[inds_r[i]:inds_r[i + 1]]
    g = gen_imgs[inds_g[i]:inds_g[i + 1]]

    k_rr = (np.dot(r, r.T) / dim + 1)**3
    k_rg = (np.dot(r, g.T) / dim + 1)**3
    k_gg = (np.dot(g, g.T) / dim + 1)**3
    ests.append(-2 * k_rg.mean() +
                k_rr[np.triu_indices_from(k_rr, k=1)].mean() +
                k_gg[np.triu_indices_from(k_gg, k=1)].mean())

  var = np.var(ests, ddof=1) if len(ests) > 1 else np.nan
  return np.mean(ests), np.sqrt(var / len(ests))
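A quick sanity check for the estimator above (a sketch, not from the original source): it assumes real_imgs and gen_imgs are 2-D arrays of per-image features and that numpy is imported as np in the module that defines the function.

import numpy as np

rng = np.random.default_rng(0)
real_feats = rng.normal(size=(2048, 128))          # stand-in for real-image features
gen_feats = rng.normal(size=(2048, 128)) + 0.1     # slightly shifted "generated" features

kid, kid_std = _expected_kid_and_std(real_feats, gen_feats)
print("KID estimate: {:.4f} +/- {:.4f}".format(kid, kid_std))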
Example #3
        def get_measurement_polynomials(self, noise = 0, seed = 0):
            np.random.seed(seed)

            k, d = self.k, self.d
            params = self.get_parameters()
            R = ring([x for x, _ in params], RR)[0]
            names = {str(x) : R(x) for x in R.symbols}
            xs = array([[names[self.x(i,j)] for j in range(k)] for i in range(d)])
            params = [(names[x], v) for x, v in params]

            # Second order moments (TODO: 3rd order moments)
            P = zeros((d,d), dtype=object)
            p = zeros((d,), dtype=object)
            for i in range(d):
                p[i] = sum(xs[i,k_] for k_ in range(k))# / k
                for j in range(i, d):
                    P[i,j] = sum(xs[i,k_] * xs[j,k_] for k_ in range(k))# / k

            # Project and profit
            m = zeros((d,))
            M = zeros((d,d))
            for i in range(d):
                m[i] = p[i].evaluate(params)
                for j in range(i, d):
                    M[i,j] = P[i,j].evaluate(params)
            M = M + noise * np.random.randn(d,d)
            m = m + noise * np.random.randn(d)
            # TODO: Something is wrong here 
            #m = M.sum(1)

            # Finally return values.
            return R, [f - f_ 
                    for f, f_ in zip(p.flatten(), m.flatten())] + [f - f_ 
                            for f, f_ in zip(P[triu_indices_from(P)], M[triu_indices_from(M)])]
def plot_clustering_similarity(results, plot_dir=None, verbose=False, ext='png'):  
    HCA = results.HCA
    # get all clustering solutions
    clusterings = HCA.results.items()
    # plot cluster agreement across embedding spaces
    names = [k for k,v in clusterings]
    cluster_similarity = np.zeros((len(clusterings), len(clusterings)))
    cluster_similarity = pd.DataFrame(cluster_similarity, 
                                     index=names,
                                     columns=names)
    
    distance_similarity = np.zeros((len(clusterings), len(clusterings)))
    distance_similarity = pd.DataFrame(distance_similarity, 
                                     index=names,
                                     columns=names)
    for clustering1, clustering2 in combinations(clusterings, 2):
        name1 = clustering1[0].split('-')[-1]
        name2 = clustering2[0].split('-')[-1]
        # record similarity of distance_df
        dist_corr = np.corrcoef(squareform(clustering1[1]['distance_df']),
                                squareform(clustering2[1]['distance_df']))[1,0]
        distance_similarity.loc[name1, name2] = dist_corr
        distance_similarity.loc[name2, name1] = dist_corr
        # record similarity of clustering of dendrogram
        clusters1 = clustering1[1]['labels']
        clusters2 = clustering2[1]['labels']
        rand_score = adjusted_rand_score(clusters1, clusters2)
        MI_score = adjusted_mutual_info_score(clusters1, clusters2)
        cluster_similarity.loc[name1, name2] = rand_score
        cluster_similarity.loc[name2, name1] = MI_score
    
    with sns.plotting_context(context='notebook', font_scale=1.4):
        clust_fig = plt.figure(figsize = (12,12))
        sns.heatmap(cluster_similarity, square=True)
        plt.title('Cluster Similarity: TRIL: Adjusted MI, TRIU: Adjusted Rand',
                  y=1.02)
        
        dist_fig = plt.figure(figsize = (12,12))
        sns.heatmap(distance_similarity, square=True)
        plt.title('Distance Similarity, metric: %s' % HCA.dist_metric,
                  y=1.02)
        
    if plot_dir is not None:
        save_figure(clust_fig, path.join(plot_dir, 
                                   'cluster_similarity_across_measures.%s' % ext),
                    {'bbox_inches': 'tight'})
        save_figure(dist_fig, path.join(plot_dir, 
                                   'distance_similarity_across_measures.%s' % ext),
                    {'bbox_inches': 'tight'})
        plt.close(clust_fig)
        plt.close(dist_fig)
    
    if verbose:
        # assess relationship between two measurements
        rand_scores = cluster_similarity.values[np.triu_indices_from(cluster_similarity, k=1)]
        MI_scores = cluster_similarity.T.values[np.triu_indices_from(cluster_similarity, k=1)]
        score_consistency = np.corrcoef(rand_scores, MI_scores)[0,1]
        print('Correlation between measures of cluster consistency: %.2f' \
              % score_consistency)
Example #5
def mat2vec(m,include_diag=False):
    # Hack to be compatible with matlab column-wise instead of row-wise
    if include_diag:
        inddown = np.triu_indices_from(m,0)
    else:
        inddown = np.triu_indices_from(m,1)

    inddown = (inddown[1], inddown[0])
    return m[inddown]
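A small illustration (not part of the original snippet) of the column-wise, MATLAB-style ordering that the index swap produces:

import numpy as np

m = np.arange(9).reshape(3, 3)
print(mat2vec(m))                     # strict lower triangle, column-wise: [3 6 7]
print(mat2vec(m, include_diag=True))  # with the diagonal: [0 3 6 4 7 8]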
Example #6
def test_simple_hessenberg_trafo():
    # Made up discrete time TF
    G = Transfer([1., -8., 28., -58., 67., -30.],
                 poly([1, 2, 3., 2, 3., 4, 1 + 1j, 1 - 1j]), dt=0.1)
    H, _ = hessenberg_realization(G, compute_T=1, form='c', invert=1)
    assert_(not np.any(H.a[triu_indices_from(H.a, k=2)]))
    assert_(not np.any(H.b[:-1, 0]))
    H = hessenberg_realization(G, form='o', invert=1)
    assert_(not np.any(H.c[0, :-1]))
    assert_(not np.any(H.a.T[triu_indices_from(H.a, k=2)]))
Example #7
def corr(chroma):
    """Chroma correlation coefficient fingerprints.

    After [1].

    Args:
        chroma (2d-array): 2d-array containing the chroma features.

    Returns:
        list: 12 fingerprints (1d-array), one for each key.

    [1] Van Balen, J., Bountouridis, D., Wiering, F., & Veltkamp, R.C.
        (2014). Cognition-inspired Descriptors for Scalable Cover Song
        Retrieval. In Proc. International Society for Music Information
        Retrieval Conference.
    """
    fp = np.corrcoef(chroma, rowvar=0)

    fp_12 = [np.roll(np.roll(fp, i, 0), i, 1) for i in range(12)]

    # flatten
    upper = np.triu_indices_from(fp, k=1)
    fp_12 = [fp[upper] for fp in fp_12]

    return fp_12
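A hypothetical call with random chroma frames (not real audio features), just to show the output shape:

import numpy as np

rng = np.random.default_rng(0)
chroma = rng.random((500, 12))                    # 500 frames x 12 pitch classes
fingerprints = corr(chroma)
print(len(fingerprints), fingerprints[0].shape)   # -> 12 (66,)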
Example #8
def lapointe_random_uniform_ultrametric(n, prng=None):
    """Generate a uniform random ultrametric over n points using the method of 
    Lapointe."""
    if prng is None:
        prng = _np.random.RandomState()

    fusion_levels = prng.uniform(0,1,n-1)
    
    ultrametric = _np.zeros((n,n))
    
    current_diag_inds = _off_diagonal_indices(n,1)
    ultrametric[current_diag_inds] = fusion_levels

    for j in range(2,n):
        prev_diag_inds = current_diag_inds
        current_diag_inds = _off_diagonal_indices(n, j)

        prev_diag = ultrametric[prev_diag_inds]
        current_diag = _np.maximum(prev_diag[:-1], prev_diag[1:])

        ultrametric[current_diag_inds] = current_diag

    ultrametric = ultrametric + ultrametric.T

    i,j = _np.triu_indices_from(ultrametric)

    shuffle = prng.permutation(_np.arange(n))

    shuffled_ultrametric = _np.zeros_like(ultrametric)
    shuffled_ultrametric[i,j] = ultrametric[shuffle[i], shuffle[j]]

    return _distance.squareform(shuffled_ultrametric, checks=False)
Example #9
    def test_pairplot(self):

        vars = ["x", "y", "z"]
        g = pairplot(self.df)

        for ax in g.diag_axes:
            nt.assert_equal(len(ax.patches), 10)

        for i, j in zip(*np.triu_indices_from(g.axes, 1)):
            ax = g.axes[i, j]
            x_in = self.df[vars[j]]
            y_in = self.df[vars[i]]
            x_out, y_out = ax.collections[0].get_offsets().T
            npt.assert_array_equal(x_in, x_out)
            npt.assert_array_equal(y_in, y_out)

        for i, j in zip(*np.tril_indices_from(g.axes, -1)):
            ax = g.axes[i, j]
            x_in = self.df[vars[j]]
            y_in = self.df[vars[i]]
            x_out, y_out = ax.collections[0].get_offsets().T
            npt.assert_array_equal(x_in, x_out)
            npt.assert_array_equal(y_in, y_out)

        for i, j in zip(*np.diag_indices_from(g.axes)):
            ax = g.axes[i, j]
            nt.assert_equal(len(ax.collections), 0)

        plt.close("all")
Example #10
    def test_map_diag_and_offdiag(self):

        vars = ["x", "y", "z"]
        g = ag.PairGrid(self.df)
        g.map_offdiag(plt.scatter)
        g.map_diag(plt.hist)

        for ax in g.diag_axes:
            nt.assert_equal(len(ax.patches), 10)

        for i, j in zip(*np.triu_indices_from(g.axes, 1)):
            ax = g.axes[i, j]
            x_in = self.df[vars[j]]
            y_in = self.df[vars[i]]
            x_out, y_out = ax.collections[0].get_offsets().T
            npt.assert_array_equal(x_in, x_out)
            npt.assert_array_equal(y_in, y_out)

        for i, j in zip(*np.tril_indices_from(g.axes, -1)):
            ax = g.axes[i, j]
            x_in = self.df[vars[j]]
            y_in = self.df[vars[i]]
            x_out, y_out = ax.collections[0].get_offsets().T
            npt.assert_array_equal(x_in, x_out)
            npt.assert_array_equal(y_in, y_out)

        for i, j in zip(*np.diag_indices_from(g.axes)):
            ax = g.axes[i, j]
            nt.assert_equal(len(ax.collections), 0)
def plot_corr(file, score, stat, ind_var, brain_type):

    # seaborn
    sns.set(style="white")

    # import the dataframe
    dt = pd.read_csv(file)

    # Compute the correlation matrix
    corr = dt.corr()

    ### Create the matrix figure with seaborn
    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(len(ind_var),len(ind_var)))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, annot=False, ax=ax)
    plt.subplots_adjust(left= 0.30,bottom=0.30)
    plt.savefig(os.path.join(stat,score, "heatmap_" + score + "_" + stat + "_"+ brain_type + ".png"))
    plt.close()

    return corr
Example #12
    def map_upper(self, func, **kwargs):
        """Plot with a bivariate function on the upper diagonal subplots.

        Parameters
        ----------
        func : callable plotting function
            Must take x, y arrays as positional arguments and draw onto the
            "currently active" matplotlib Axes.

        """
        kw_color = kwargs.pop("color", None)
        for i, j in zip(*np.triu_indices_from(self.axes, 1)):

            hue_grouped = self.data.groupby(self.hue_vals)
            for k, (label_k, data_k) in enumerate(hue_grouped):

                ax = self.axes[i, j]
                plt.sca(ax)

                x_var = self.x_vars[j]
                y_var = self.y_vars[i]

                color = self.palette[k] if kw_color is None else kw_color
                func(data_k[x_var], data_k[y_var], label=label_k,
                     color=color, **kwargs)

            self._clean_axis(ax)
            self._update_legend_data(ax)

        if kw_color is not None:
            kwargs["color"] = kw_color
Example #13
  def unpad_randomize_and_flatten(self, cm):
    """
    1. Remove zero padding on Coulomb Matrix
    2. Randomly permute the rows and columns for n_samples
    3. Flatten each sample to upper triangular portion
    Returns list of feature vectors
    """
    max_atom_number = len(cm) 
    atom_number = 0
    for i in cm[0]:
        if atom_number == max_atom_number: break
        elif i != 0.: atom_number += 1
        else: break

    upcm = cm[0:atom_number,0:atom_number]

    row_norms = np.asarray(
        [np.linalg.norm(row) for row in upcm], dtype=float)
    rng = np.random.RandomState(self.seed)
    e = rng.normal(size=row_norms.size)
    p = np.argsort(row_norms+e)
    rcm = upcm[p][:,p]
    rcm = pad_array(rcm, len(cm))
    rcm = rcm[np.triu_indices_from(rcm)]

    return rcm
 def LML_se(self,theta,returnGradients=False):
     self.setTheta(theta)
     K,r = self.cov(self.X,retr=True)
     Ky = K.copy()
     Ky +=  np.eye(self.X.shape[0])*self.var_n + np.eye(self.X.shape[0])*1e-8
     L = self.cholSafe(Ky)
     WlogDet = 2.*np.sum(np.log(np.diag(L)))
     alpha, status = dpotrs(L, self.Y, lower=1)
     dataFit = - np.sum(alpha * self.Y)
     modelComplexity = -self.Y.shape[1] * WlogDet
     normalizer = -self.Y.size * log2pi
     logMarginalLikelihood = 0.5*(dataFit + modelComplexity + normalizer)
     if returnGradients == False:
         return logMarginalLikelihood
     else:
         Wi, status = dpotri(-L, lower=1)
         Wi = np.asarray(Wi)
         # copy bottom triangle to top triangle
         triu = np.triu_indices_from(Wi,k=1)
         Wi[triu] = Wi.T[triu]
         # dL = change in LML, dK is change in Kernel(K)
         dL_dK = 0.5 * (np.dot(alpha,alpha.T) - self.Y.shape[1] * Wi)
         dL_dVarn = np.diag(dL_dK).sum()
         varfGradient = np.sum(K* dL_dK)/self.var_f
         dK_dr = -r*K
         dL_dr = dK_dr * dL_dK
         lengthscaleGradient = -np.sum(dL_dr*r)/self.charLen
         grads = np.array([varfGradient, lengthscaleGradient, dL_dVarn])
         return logMarginalLikelihood, grads
Example #15
def compute_PR_vectors(corr_mat, batches, verbosity):

    triu_rows, triu_cols = np.triu_indices_from(corr_mat, k = 1)

    corr_vec = corr_mat[triu_rows, triu_cols]

    batches_1 = batches[triu_rows]
    batches_2 = batches[triu_cols]
    batches_match = np.array(batches_1 == batches_2, dtype = int)
    # Get the number of true positives so we can use that as recall instead of
    # a fraction from 0 to 1.
    num_true_positives = np.sum(batches_match)

    if verbosity >= 1:
        print('\t\tnumber of NaN correlations: {}'.format(np.sum(np.isnan(corr_vec))))
    
    precision, recall, thresholds = precision_recall_curve(y_true = batches_match, probas_pred = corr_vec)

    # This calculates the "normalized" AUC, since the recall values have not been multiplied by the
    # number of true positives yet.
    norm_AUPR = auc(recall, precision)

    # Reverse the orders so that they can easily be printed to file
    # with the highest recalls first
    return precision[::-1], recall[::-1] * num_true_positives, thresholds[::-1], norm_AUPR
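A minimal sketch of how this might be called, assuming precision_recall_curve and auc come from sklearn.metrics as the snippet implies; the correlation matrix and batch labels are synthetic.

import numpy as np
from sklearn.metrics import precision_recall_curve, auc

rng = np.random.default_rng(0)
profiles = rng.normal(size=(20, 50))             # 20 profiles x 50 features
corr_mat = np.corrcoef(profiles)
batches = np.repeat(np.arange(4), 5)             # 4 batches of 5 profiles each

precision, recall, thresholds, norm_AUPR = compute_PR_vectors(corr_mat, batches, verbosity=0)
print("normalized AUPR: {:.3f}".format(norm_AUPR))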
Example #16
def get_candidate_taus_above_threshold(Ds, thresh, **kwargs):
    upper_tri_Ds = Ds[np.triu_indices_from(Ds, k=1)]
    if "nz_frac" in kwargs:
        nz_frac = float(kwargs["nz_frac"])
        common.print_log("Setting tau so that fraction of distances below threshold = {0}".format(nz_frac))

        all_taus = np.array(sorted(upper_tri_Ds))
        n_all_taus = len(all_taus)
        idx = min(max(int(nz_frac*n_all_taus), 0), n_all_taus-1)
        tau = all_taus[idx]
        if tau < thresh:
            common.print_log("Parameter tau was set below the minimum value which makes the graph connected. Changing it to {0}".format(thresh))
            tau = thresh
        candidate_taus = np.array([tau])
    else:
        grid_size = int(kwargs.get("grid_size", 20))

        linspace_tau = bool(kwargs.get("linspace_tau", False))
        if linspace_tau:
            candidate_taus = np.linspace(thresh, np.max(Ds[Ds > 0]), grid_size)
        else:
            all_taus = np.array(sorted(upper_tri_Ds[upper_tri_Ds > thresh]))
            n_all_taus = len(all_taus)
            tau_indices = np.asarray(np.concatenate([np.linspace(0, 1, grid_size)]) * (n_all_taus - 1), dtype=int)
            candidate_taus = sorted(all_taus[tau_indices])

    nz_fracs = [100. * np.sum(upper_tri_Ds <= tau) / len(upper_tri_Ds) for tau in candidate_taus]

    common.print_log("Found {0} candidate thresholds:".format(len(candidate_taus)), candidate_taus)
    common.print_log("Percentage of distances below threshold:", nz_fracs)
    return candidate_taus
Example #17
    def test_pairplot(self):

        vars = ["x", "y", "z"]
        g = ag.pairplot(self.df)

        for ax in g.diag_axes:
            assert len(ax.patches) > 1

        for i, j in zip(*np.triu_indices_from(g.axes, 1)):
            ax = g.axes[i, j]
            x_in = self.df[vars[j]]
            y_in = self.df[vars[i]]
            x_out, y_out = ax.collections[0].get_offsets().T
            npt.assert_array_equal(x_in, x_out)
            npt.assert_array_equal(y_in, y_out)

        for i, j in zip(*np.tril_indices_from(g.axes, -1)):
            ax = g.axes[i, j]
            x_in = self.df[vars[j]]
            y_in = self.df[vars[i]]
            x_out, y_out = ax.collections[0].get_offsets().T
            npt.assert_array_equal(x_in, x_out)
            npt.assert_array_equal(y_in, y_out)

        for i, j in zip(*np.diag_indices_from(g.axes)):
            ax = g.axes[i, j]
            nt.assert_equal(len(ax.collections), 0)

        g = ag.pairplot(self.df, hue="a")
        n = len(self.df.a.unique())

        for ax in g.diag_axes:
            assert len(ax.lines) == n
            assert len(ax.collections) == n
Example #18
    def test_pairplot_reg(self):

        vars = ["x", "y", "z"]
        g = ag.pairplot(self.df, diag_kind="hist", kind="reg")

        for ax in g.diag_axes:
            nt.assert_equal(len(ax.patches), 10)

        for i, j in zip(*np.triu_indices_from(g.axes, 1)):
            ax = g.axes[i, j]
            x_in = self.df[vars[j]]
            y_in = self.df[vars[i]]
            x_out, y_out = ax.collections[0].get_offsets().T
            npt.assert_array_equal(x_in, x_out)
            npt.assert_array_equal(y_in, y_out)

            nt.assert_equal(len(ax.lines), 1)
            nt.assert_equal(len(ax.collections), 2)

        for i, j in zip(*np.tril_indices_from(g.axes, -1)):
            ax = g.axes[i, j]
            x_in = self.df[vars[j]]
            y_in = self.df[vars[i]]
            x_out, y_out = ax.collections[0].get_offsets().T
            npt.assert_array_equal(x_in, x_out)
            npt.assert_array_equal(y_in, y_out)

            nt.assert_equal(len(ax.lines), 1)
            nt.assert_equal(len(ax.collections), 2)

        for i, j in zip(*np.diag_indices_from(g.axes)):
            ax = g.axes[i, j]
            nt.assert_equal(len(ax.collections), 0)
def scatterplot_matrix(data, attNames, **kwargs):
    rows, atts = data.shape
    fig, axes = plt.subplots(nrows = atts, ncols =atts, figsize=(30,30))
    fig.subplots_adjust(hspace = 0.05 , wspace = 0.05)

    for ax in axes.flat:
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        if ax.is_first_col():
            ax.yaxis.set_ticks_position('left')
        if ax.is_last_col():
            ax.yaxis.set_ticks_position('right')
        if ax.is_first_row():
            ax.xaxis.set_ticks_position('top')
        if ax.is_last_row():
            ax.xaxis.set_ticks_position('bottom')

    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for x, y in [(i,j), (j,i)]:
            axes[x,y].plot(data[:, y], data[:, x], **kwargs)

    # Label the diagonal subplots...
    for i, label in enumerate(attNames):
        axes[i,i].annotate(label, (0.5, 0.5), xycoords='axes fraction',
                ha='center', va='center')

    for i, j in zip(range(atts), itertools.cycle((-1, 0))):
        axes[j,i].xaxis.set_visible(True)
        axes[i,j].yaxis.set_visible(True)

    return fig
Example #20
def plot_corr(df, size=10):
    """Function plots a graphical correlation matrix for each pair of columns in the dataframe.

    Input:
        df: pandas DataFrame
        size: vertical and horizontal size of the plot"""
    import matplotlib.pyplot as plt
    from matplotlib import cm
    import numpy as np

    corr = df.corr()
    label = df.corr()
    mask = np.tri(corr.shape[0], k=-1)
    corr = np.ma.array(corr, mask=mask)
    mask[np.triu_indices_from(mask)] = True

    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    cmap = cm.get_cmap("jet", 10)
    cmap.set_bad("w")

    plt.xticks(range(len(label.columns)), label.columns, rotation=90)
    plt.yticks(range(len(label.columns)), label.columns)
    ax.imshow(corr, interpolation="nearest", cmap=cmap)
    plt.show()
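A hedged usage sketch with a synthetic DataFrame; it assumes a matplotlib version where cm.get_cmap is still available, since the function relies on it.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 4)), columns=list('abcd'))
df['e'] = df['a'] + 0.5 * rng.normal(size=200)   # one deliberately correlated column
plot_corr(df, size=6)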
def tangent_space(covmats, Cref):
    """Project a set of covariance matrices in the tangent space according to the given reference point Cref

    :param covmats: Covariance matrices set, Ntrials X Nchannels X Nchannels
    :param Cref: The reference covariance matrix
    :returns: the tangent space, a matrix of Ntrials X (Nchannels*(Nchannels+1)/2)

    """
    Nt, Ne, Ne = covmats.shape
    Cm12 = invsqrtm(Cref)
    idx = numpy.triu_indices_from(Cref)
    T = numpy.empty((Nt, Ne * (Ne + 1) // 2))
    coeffs = (
        numpy.sqrt(2) *
        numpy.triu(
            numpy.ones(
                (Ne,
                 Ne)),
            1) +
        numpy.eye(Ne))[idx]
    for index in range(Nt):
        tmp = numpy.dot(numpy.dot(Cm12, covmats[index, :, :]), Cm12)
        tmp = logm(tmp)
        T[index, :] = numpy.multiply(coeffs, tmp[idx])
    return T
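A small shape check (a sketch, assuming invsqrtm and logm are importable helpers, e.g. the pyriemann-style utilities this snippet appears to rely on):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(10, 8, 100))                       # 10 trials, 8 channels, 100 samples
covmats = np.array([x @ x.T / 100 for x in X])          # 10 SPD covariance matrices
Cref = covmats.mean(axis=0)

T = tangent_space(covmats, Cref)
print(T.shape)                                          # -> (10, 36), since 8*9/2 = 36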
Example #22
def scatterplot_matrix(data, names, **kwargs):
    """Plots a scatterplot matrix of subplots.  Each row of "data" is plotted
    against other rows, resulting in a nrows by nrows grid of subplots with the
    diagonal subplots labeled with "names".  Additional keyword arguments are
    passed on to matplotlib's "plot" command. Returns the matplotlib figure
    object containing the subplot grid."""
    numvars, numdata = data.shape
    fig, axes = plt.subplots(nrows=numvars, ncols=numvars, figsize=(8,8))
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    for ax in axes.flat:
        # Hide all ticks and labels
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        # Set up ticks only on one side for the "edge" subplots...
        if ax.is_first_col():
            ax.yaxis.set_ticks_position('left')
        if ax.is_last_col():
            ax.yaxis.set_ticks_position('right')
        if ax.is_first_row():
            ax.xaxis.set_ticks_position('top')
        if ax.is_last_row():
            ax.xaxis.set_ticks_position('bottom')

    # Plot the data.
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for x, y in [(i,j), (j,i)]:
            axes[x,y].plot(data[x], data[y], **kwargs)
Example #23
def convert_file(in_file, out_file, factors=[.25, 1, 4]):
    with h5py.File(in_file, 'r') as inp:
        func_ks = [
            (df, k)
            for df, g in inp.items() if df != '_meta'
            for k in g.keys()
        ]

    meds = {}
    for df, k in func_ks:
        with h5py.File(in_file, 'r') as inp:
            divs = inp[df][k][()]

        if df in meds:
            med = meds[df]
        else:
            meds[df] = med = np.median(divs[np.triu_indices_from(divs)])

        for factor in factors:
            name = 'median * {}'.format(factor)
            print('/'.join((df, k, name)))

            with h5py.File(out_file) as out:
                g = out.require_group(df).require_group(k)
                if name in g:
                    print('\talready there')
                    continue

            km = sdm.sdm.make_km(divs, med * factor)
            with h5py.File(out_file) as out:
                out[df][k][name] = km
def plot_2_corr_heatmaps(corr1, corr2, labels, title1, title2):
    fig=plt.figure(figsize=(9, 8))
    gs = gridspec.GridSpec(1, 2)
    ax1 = fig.add_subplot(gs[0, 0])
    ax2 = fig.add_subplot(gs[0, 1])
    
    sns.set(style="white")
    
    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr1, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    
    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr1, mask=mask, cmap=cmap, vmax=.3,
                square=True, xticklabels=labels, yticklabels=labels,
                linewidths=.5, ax=ax1, cbar_kws={"shrink": .3}, annot=True)
    ax1.set_title(title1)
    sns.heatmap(corr2, mask=mask, cmap=cmap, vmax=.3,
                square=True, xticklabels=labels, yticklabels=labels,
                linewidths=.5, ax=ax2, cbar_kws={"shrink": .3}, annot=True)
    ax2.set_title(title2)
    fig.tight_layout()
    plt.show()
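A synthetic example call (assumes matplotlib.pyplot, matplotlib.gridspec and seaborn are imported as plt, gridspec and sns, which the function expects):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df1 = pd.DataFrame(rng.normal(size=(100, 4)), columns=list('abcd'))
df2 = pd.DataFrame(rng.normal(size=(100, 4)), columns=list('abcd'))

plot_2_corr_heatmaps(df1.corr(), df2.corr(), list('abcd'), 'Period 1', 'Period 2')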
Example #25
    def _process(self,data):
        for x in data:
            
            if data[x][1] not in self.data:
                #prepares the data to visualise the xcor matrix of a specific batch number.
                self.data[data[x][1]]={}
                self.data[data[x][1]]['matrix']=numpy.identity(self.size)
                self.data[data[x][1]]['ro_count']=0
            
            self.data[data[x][1]]['matrix'][(data[x][2][1],data[x][2][0])]=data[x][0]
            #self.addToProvState('batch_'+str(data[x][1]),self.data[data[x][1]]['matrix'],metadata={'matrix':str(self.data[data[x][1]]['matrix'])},dep=['batch_'+str(data[x][1])],ignore_inputs=False)
            self.data[data[x][1]]['ro_count']+=1
            
            if self.data[data[x][1]]['ro_count']==(self.size*(self.size-1))/2:
                matrix=self.data[data[x][1]]['matrix']
                
                d = pd.DataFrame(data=matrix,
                 columns=range(0,self.size),index=range(0,self.size))
                
                mask = numpy.zeros_like(d, dtype=bool)
                mask[numpy.triu_indices_from(mask)] = True

                # Set up the matplotlib figure
                f, ax = plt.subplots(figsize=(11, 9))

                # Generate a custom diverging colormap
                cmap = sns.diverging_palette(220, 10, as_cmap=True)

                # Draw the heatmap with the mask and correct aspect ratio
                sns.heatmap(d, mask=mask, cmap=cmap, vmax=1,
                    square=True,
                    linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
                
                plt.savefig("./plots/"+str(data[x][1])+"_plot.png")
                self.write('output',(matrix,data[x][1]),metadata={'matrix':str(d),'batch':str(data[x][1])},dep=['batch_'+str(data[x][1])])
Example #26
def main():

    # Load list of pointing IDs
    todo_file = rawdata_dir + 'todo_list.ascii.dat'
    ID_list   = np.genfromtxt(todo_file, skip_header=1, usecols=[0], unpack=True,
                            dtype=str)
    N_los = len(ID_list)

    # Load bins centers
    bins_file   = 'rbins.ascii.dat'
    bin_centers = np.genfromtxt(bins_file, skip_header=1, usecols=[2], unpack=True)
    N_bins      = len(bin_centers)

    # Round bin centers to three decimal places
    bin_centers = np.round(bin_centers, 3)

    # Make array of column names for pandas Dataframe
    col_names = []

    for i in range(N_bins):
        name = str(bin_centers[i])
        col_names.append(name)

    # Recast as array
    col_names = np.asarray(col_names)

    # Create list of png's for use in making gif
    png_list =[]

    # Calculate correlation matrix for each l.o.s.
    for ID in ID_list:

        # Load counts from 1000 mocks with pandas
        # Each row is a mock, each column is a bin
        counts_filename = counts_dir + 'counts_all_' + ID + '.dat'
        DF = pd.read_csv(counts_filename, sep='\s+', names=col_names)

        # Calculate correlation matrix
        corr = DF.corr()

        # plot heatmap of matrix
        plt.clf()
        sns.set(style="white")
        mask = np.zeros_like(corr, dtype=bool)
        mask[np.triu_indices_from(mask)] = True
        f, ax = plt.subplots(figsize=(11, 9))
        cmap = sns.diverging_palette(145, 280, s=85, l=25, n=7, as_cmap=True)
        sns.heatmap(corr, mask=mask, cmap=cmap,square=True, annot=True,
                    xticklabels=col_names, yticklabels=col_names, linewidths=.5,
                    cbar_kws={"shrink": .5}, ax=ax, vmin=-1.0, vmax=1.0)
        plt.title('Correlation Matrix for l.o.s. ' + ID, fontsize=20)
        plt.xlabel('Bin Center (kpc)', fontsize=18)
        plt.ylabel('Bin Center (kpc)', fontsize=18)

        fig_name = plots_dir + 'corr_matrix_' + ID + '.png'
        plt.savefig(fig_name)
        png_list.append(fig_name)

    gif_name = plots_dir + 'corr_matrix.gif'
    GIF_MOVIE(png_list, gif_name)
Example #27
def find_collinearity_columns(correlation):
    """
    本函数找出多重共线性的列。
    基本思路:
    0.首先看correlation是否满秩。如果不满秩,说明存在多重共线性。
    1. 找到correlation里面绝对值最大的row_idx,和col_idx,假设为A和C.
    2.计算A列/C列与其他列的相关系数绝对值的均值,如果A列与其他列相关系数更大,则剔除A列,反之亦然。记录下来A列名
    重复以上步骤
    @params: correlation, 相关系数矩阵.dataframe
    @returns: list of column names.
    """
    bad_columns = []
    while True:
        rank = np.linalg.matrix_rank(correlation.values)
        if rank == correlation.shape[0]:
            break

        correlation_copy = correlation.copy()
        correlation = correlation.abs()
        correlation.values[np.triu_indices_from(correlation.values, 0)] = 0.0  # set the upper triangle (including the diagonal) to 0
        col_idx, row_idx = correlation.unstack().idxmax()  # (col_idx, row_idx)
        if correlation_copy.loc[row_idx, :].mean() > correlation_copy.loc[:, col_idx].mean():
            bad_column = row_idx
        else:
            bad_column = col_idx
        bad_columns.append(bad_column)
        # remove this column's name from the rows/columns of the correlation matrix
        correlation_copy.drop(bad_column, axis=0, inplace=True)
        correlation_copy.drop(bad_column, axis=1, inplace=True)
        correlation = correlation_copy
    return bad_columns
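A quick check with a synthetic DataFrame in which one column is an exact linear combination of two others, so the correlation matrix is rank-deficient (a sketch, not from the original source):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
base = rng.normal(size=(200, 3))
df = pd.DataFrame(np.column_stack([base, base[:, 0] + base[:, 1]]),
                  columns=['a', 'b', 'c', 'd'])   # 'd' = 'a' + 'b'
print(find_collinearity_columns(df.corr()))       # -> ['d']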
def plot_feature_corr(X, f_sz = (11, 9)):
	"""
	Purpose: plot a correlation matrix for the features in X
	Inputs:	X: a pandas dataframe of feature values
			f_sz: a tuple for the figure size
	Output: the correlation matrix of X
	"""
	sns.set(style="white")

	# Compute the correlation matrix
	corr = X.corr()

	# Generate a mask for the upper triangle
	mask = np.zeros_like(corr, dtype=bool)
	mask[np.triu_indices_from(mask)] = True
	
	# Set up the matplotlib figure
	f, ax = plt.subplots(figsize= f_sz)
	
	# Generate a custom diverging colormap
	cmap = sns.diverging_palette(220, 10, as_cmap=True)

	# Draw the heatmap with the mask and correct aspect ratio
	sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
		square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

	return corr
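A minimal call with random features (assumes seaborn, matplotlib.pyplot and numpy are imported as sns, plt and np, as the function requires):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(300, 5)), columns=list('abcde'))
corr = plot_feature_corr(X, f_sz=(8, 6))
plt.show()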
    def __init__(self, master, x_train, y_train, x_test, y_test, evaluator, df, console):
        Tk.Frame.__init__(self, master)
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.evaluator = evaluator
        self.df = df
        self.console = console

        frame_train = Tk.Frame(self)
        frame_train.pack(fill=Tk.BOTH, expand=1, padx=15, pady=15)
        plt.figure(figsize=(12, 20))
        plt.subplot(111)

        # white background
        sns.set(style="white")
        # feature correlation matrix (the matrix includes the class label as well as the features)
        corr = df.corr()
        # mask the upper triangle of the matrix
        mask = np.zeros_like(corr, dtype=bool)
        mask[np.triu_indices_from(mask)] = True
        # plot
        f, ax = plt.subplots(figsize=(11, 11))
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
        plt.xticks(rotation=-90)
        plt.yticks(rotation=0)
        plt.title("Cardiotocography \"Feature-Feature\" & \"Feature-Label\" Correlations")
        self.attach_figure(plt.gcf(), frame_train)
Example #30
def threshold_matrix(M, cost):
    '''
    M is the full association matrix.
    cost is the percentage (0 to 100) at which you'd like to threshold.

    threshold_matrix first creates a copy of the input matrix, then
    sets all diagonal values to -999 so they fall below any threshold.
    It next calculates the minimum spanning tree and ensures that those
    edges are *always* included in the thresholded matrix.

    It then sets all values below the appropriate percentile to 0 and
    binarizes the values that remain.
    '''
    # Make a copy of the matrix
    thr_M = np.copy(M)
    
    # Set all diagonal values to -999    
    thr_M[np.diag_indices_from(thr_M)] = -999
    
    # Calculate the minimum spanning tree
    G = nx.from_numpy_matrix(M)
    mst = nx.minimum_spanning_tree(G, weight='weight')
    
    # Calculate the threshold value
    thr = np.percentile(thr_M[np.triu_indices_from(thr_M, k=1)], cost)
    
    # Set all values that are less than the threshold to 0
    thr_M[thr_M < thr] = 0

    # Make sure the minimum spanning tree edges are always included
    for u, v in mst.edges():
        thr_M[u, v] = thr_M[v, u] = 1

    # Set all values that are not zero to 1
    thr_M[thr_M != 0] = 1

    return thr_M
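A synthetic example (a sketch assuming numpy and networkx are imported as np and nx, and a networkx version that still provides from_numpy_matrix): threshold a random symmetric association matrix at the 75th percentile, i.e. keep roughly the strongest 25% of edges.

import numpy as np
import networkx as nx

rng = np.random.default_rng(0)
A = rng.random((10, 10))
M = (A + A.T) / 2                      # symmetric association matrix
np.fill_diagonal(M, 1.0)

binary = threshold_matrix(M, cost=75)
print(int(binary.sum()) // 2, "edges kept")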
Example #31
 def remove_lower_triangle(matrix):
     """
     remove all values in the lower triangle of a matrix
     """
     return matrix[np.triu_indices_from(matrix)].A1
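The trailing .A1 implies the input is a numpy.matrix; a standalone sketch, treating the method as a plain function:

import numpy as np

m = np.matrix(np.arange(9).reshape(3, 3))
print(remove_lower_triangle(m))        # -> [0 1 2 4 5 8]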
def dynamic_heatmap(df, columns, fontsize=20, annot=False, palette=None, figsize=(15, 10), squaresize=500):
    """Plots a heatmap that changes size values depending on correlation Adapted from:
    https://towardsdatascience.com/better-heatmaps-and-correlation-matrix-plots-in-python-41445d0f2bec"""

    plt.figure(figsize=figsize)
    corr = df[columns].corr()
    sns.set(style="dark")
    grid_bg_color = sns.axes_style()['axes.facecolor']

    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    corr = pd.melt(corr.reset_index(),
                   id_vars='index')  # Unpivot the dataframe, so we can get pair of arrays for x and y
    corr.columns = ['x', 'y', 'value']

    x = corr['x']
    y = corr['y']
    size = corr['value'].abs()

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=figsize)
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right');

    # Mapping from column names to integer coordinates
    x_labels = [v for v in sorted(x.unique())]
    y_labels = [v for v in sorted(y.unique())]
    x_to_num = {p[1]: p[0] for p in enumerate(x_labels)}
    y_to_num = {p[1]: p[0] for p in enumerate(y_labels)}

    size_scale = squaresize

    if palette:
        n_colors = len(palette)
    else:
        n_colors = 256  # Use 256 colors for the diverging color palette
        palette = sns.diverging_palette(20, 220, n=n_colors) # Create the palette
    color_min, color_max = [-1,
                            1]  # Range of values that will be mapped to the palette, i.e. min and max possible correlation
    color = corr["value"]

    def value_to_color(val):
        val_position = float((val - color_min)) / (
                    color_max - color_min)  # position of value in the input range, relative to the length of the input range
        ind = int(val_position * (n_colors - 1))  # target index in the color palette
        return palette[ind]

    plot_grid = plt.GridSpec(1, 15, hspace=0.2, wspace=0.1)  # Setup a 1x15 grid
    ax = plt.subplot(plot_grid[:, :-1])  # Use the leftmost 14 columns of the grid for the main plot

    ax.scatter(
        x=x.map(x_to_num),  # Use mapping for x
        y=y.map(y_to_num),  # Use mapping for y
        s=size * size_scale,  # Vector of square sizes, proportional to size parameter
        c=color.apply(value_to_color),  # Vector of square colors, mapped to color palette
        marker='s'  # Use square as scatterplot marker
    )

    # Show column labels on the axes
    ax.set_xticks([x_to_num[v] for v in x_labels])
    ax.set_xticklabels(x_labels, rotation=45, horizontalalignment='right')
    ax.set_yticks([y_to_num[v] for v in y_labels])
    ax.set_yticklabels(y_labels)
    #     ax.set_fontsize(font_scale)
    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                 ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(fontsize)

    numbers = corr['value'].round(decimals=2)

    if annot:
        for i, txt in enumerate(numbers):
            annot_font_size = int(fontsize * size[i] * annot)
            ax.annotate(txt, (x.map(x_to_num)[i], y.map(y_to_num)[i]),
                        horizontalalignment="center", verticalalignment="center",
                        color=grid_bg_color, fontweight="black", fontsize=annot_font_size)

    ax.grid(False, 'major')
    ax.grid(True, 'minor')
    ax.set_xticks([t + 0.5 for t in ax.get_xticks()], minor=True)
    ax.set_yticks([t + 0.5 for t in ax.get_yticks()], minor=True)
    ax.set_xlim([-0.5, max([v for v in x_to_num.values()]) + 0.5])
    ax.set_ylim([-0.5, max([v for v in y_to_num.values()]) + 0.5])

    # Add color legend on the right side of the plot
    ax = plt.subplot(plot_grid[:, -1])  # Use the rightmost column of the plot

    col_x = [0] * len(palette)  # Fixed x coordinate for the bars
    bar_y = np.linspace(color_min, color_max, n_colors)  # y coordinates for each of the n_colors bars

    bar_height = bar_y[1] - bar_y[0]
    ax.barh(
        y=bar_y,
        width=[5] * len(palette),  # Make bars 5 units wide
        left=col_x,  # Make bars start at 0
        height=bar_height,
        color=palette,
        linewidth=0
    )
    ax.set_xlim(1, 2)  # Bars are going from 0 to 5, so lets crop the plot somewhere in the middle
    ax.grid(False)  # Hide grid
    ax.set_xticks([])  # Remove horizontal ticks
    ax.set_yticks(np.linspace(min(bar_y), max(bar_y), 3))  # Show vertical ticks for min, middle and max
    ax.yaxis.tick_right()  # Show vertical ticks on the right
    plt.show()
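A hedged example with random data (assumes pandas, numpy, matplotlib.pyplot and seaborn are imported as pd, np, plt and sns, as the function expects):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(300, 6)), columns=list('abcdef'))
dynamic_heatmap(df, columns=list('abcdef'), fontsize=12, annot=False)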
Example #33
def correlation_reduction_worker(corrMatrix,
                                 target,
                                 threshold,
                                 reductionMetrics=''):
    '''
    This function drops variables whose pairwise correlation exceeds the threshold and plots heatmaps of the correlation matrix before and after reduction.
    The threshold can also be specified through the scenario file by "custom_correlationReduction_thresh"
    Parameters:
               corrMatrix - correlation matrix
                   target - dataframe containing variables' correlation with your target variable
                threshold - your threshold for variable reduction
         reductionMetrics - one of four correlation reduction metrics:
                            Pearson, Spearmans, EuclideanDistances or LogisticRegression
    Returns:
              droppedInfo - dataFrame of variables that were discarded
                  corrmat - post-reduction correlation matrix
    Writes:
              droppedInfo - dataFrame of variables that were discarded
                  corrmat - post-reduction correlation matrix
    '''

    drop = []
    seaborn.set(context="paper", font="monospace")
    corrmat = abs(corrMatrix.copy(deep=True))

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(10, 8))
    ax.set_yticks([])
    # Draw the heatmap using seaborn
    mask = np.zeros_like(corrmat, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    with seaborn.axes_style("white"):
        seaborn.heatmap(corrmat,
                        vmax=1,
                        mask=mask,
                        square=True,
                        xticklabels=20,
                        yticklabels=20,
                        cmap='Blues')
        ax.set_title(
            "{0} - Original Correlation Matrix".format(reductionMetrics))

    if not os.path.isfile('../outputs/variable_reduction/{}_Original.png'.
                          format(reductionMetrics)):
        f.savefig('../outputs/variable_reduction/{}_Original.png'.format(
            reductionMetrics))

    # Correlation Threshold
    mid = target.drop(target.index[0], axis=1).T
    mid['target_abs'] = abs(mid[target.index[0]])
    mid.sort_values('target_abs', inplace=True, ascending=False)
    order = list(mid.index)
    corrMatrix = corrMatrix.reindex(order, axis=1)
    corrMatrix = corrMatrix.reindex(order)
    column_list = corrMatrix.keys()  ###
    for col in column_list:  ###
        # This is a temporary drop list that'll be emptied in each iteration, we use it to store the dropped vars and apply to corrMatrix drop
        drop_tmp = []
        if col in corrMatrix.keys(
        ):  # corrMatrix does inplace drop of columns and rows in each iteration, need to check whether col is still in corrMatrix
            for i in range(len(corrMatrix)):  # Iterating through all the vars
                if col != corrMatrix.keys(
                )[i]:  # Make sure col is not compared with itself
                    if abs(corrMatrix[col]
                           [i]) > threshold:  # Correlation Threshold Checking
                        # This is a cumulative drop list that'll be used as the output to show the summary of drop vars
                        # Append [dropped_var, col, corr(dropped_var, target), corr(col, target)]
                        drop.append([
                            corrMatrix.keys()[i], col,
                            target[corrMatrix.keys()[i]].values[0],
                            target[col].values[0]
                        ])
                        drop_tmp.append(
                            corrMatrix.keys()[i])  # Append dropped_var
            corrMatrix.drop(drop_tmp, axis=1, inplace=True)  # Drop columns
            corrMatrix.drop(drop_tmp, axis=0, inplace=True)  # Drop rows
    corrmat = abs(corrMatrix.copy(deep=True))

    droppedInfo = pd.DataFrame(drop,
                               columns=[
                                   'discarded_variable', 'correlated_to',
                                   'discarded_variable_correlation_to_target',
                                   'correlated_to_correlation_to_target'
                               ])

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(10, 8))
    ax.set_yticks([])
    # Draw the heatmap using seaborn
    mask = np.zeros_like(corrmat, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    with seaborn.axes_style("white"):
        seaborn.heatmap(corrmat,
                        vmax=1,
                        mask=mask,
                        square=True,
                        xticklabels=20,
                        yticklabels=20,
                        cmap='Blues')
        ax.set_title("{} - Post Reduction Correlation Matrix - {}".format(
            reductionMetrics, threshold))

    f.savefig(
        '../outputs/variable_reduction/{}_Post_Reduction_Correlation_Matrix_{}.png'
        .format(reductionMetrics,
                str(threshold).replace('.', '')))

    return corrmat, droppedInfo
Example #34
def pipeline_for_single_instance(logger, analysis_dir, main: Record,
                                 finetune: List[Record], by: str,
                                 gt: np.ndarray):
    logger.info("Analysing results for {}".format(analysis_dir))
    main_df = main.validation_acc_dataframe(by)
    main_archit = main.grouping_subgraph_training_dataframe(by)
    main_grouping = main.grouping_numpy

    os.makedirs(analysis_dir, exist_ok=True)

    # Save raw data
    main_df.to_csv(os.path.join(analysis_dir, "val_acc_all_epochs.csv"),
                   index=True)
    np.savetxt(os.path.join(analysis_dir, "group_info.txt"), main_grouping,
               "%d")

    # correlation between subgraphs
    corr_matrix = main_df.corr().values
    heatmap(corr_matrix, filepath=os.path.join(analysis_dir, "corr_heatmap"))
    np.savetxt(os.path.join(analysis_dir, "corr_heatmap.txt"), corr_matrix)

    # Consecutive tau (single)
    consecutive_taus = get_consecutive_rank_tau(main_df)
    lineplot([np.array(list(zip(main_df.index[1:], consecutive_taus)))],
             filepath=os.path.join(analysis_dir, "consecutive_tau_single"))

    # GT rank (for color reference)
    gt_rank = rankdata_greater(gt)
    gt_rank_color = 1 - gt_rank / EXPECTED_SUBGRAPH_NUMBER
    # in some cases, it could be a subset of 64 subgraphs; process this later

    # Acc variance (lineplot)
    acc_curves = [
        np.array(list(zip(main_df.index, main_df[i]))) for i in main_df.columns
    ]
    subgraph_markers = [[] for _ in range(EXPECTED_SUBGRAPH_NUMBER)]
    if len(main.groups) != len(main.columns):  # hide it for ground truth
        for i, (_, row) in enumerate(main_archit.iterrows()):
            for k in filter(lambda k: k >= 0, row.values):
                subgraph_markers[k].append(i)
    else:
        logger.info("Markers hidden because groups == columns")

    lineplot(acc_curves,
             filepath=os.path.join(analysis_dir, "acc_curve_along_epochs"),
             color=[gt_rank_color[i] for i in main_df.columns],
             alpha=0.7,
             markers=[subgraph_markers[i] for i in main_df.columns],
             fmt=["-D"] * len(acc_curves))

    # Rank version of df
    df_rank = main_df.apply(rankdata_greater, axis=1, result_type="expand")
    df_rank.columns = main_df.columns

    # Rank variance (lineplot)
    rank_curves = [
        np.array(list(zip(df_rank.index, df_rank[i]))) for i in df_rank.columns
    ]
    lineplot(rank_curves,
             filepath=os.path.join(analysis_dir, "rank_curve_along_epochs"),
             color=[gt_rank_color[i] for i in df_rank.columns],
             alpha=0.7,
             inverse_y=True,
             markers=subgraph_markers)

    # Rank variance for top-5 subgraphs found at half and end
    # recalculate for original order
    for loc in [len(main_df) // 2, len(main_df) - 1]:
        selected_rank_curves = [
            rank_curves[i] for i in np.argsort(-main_df.iloc[loc])[:5]
        ]
        lineplot(selected_rank_curves,
                 inverse_y=True,
                 filepath=os.path.join(
                     analysis_dir, "rank_curves_along_epochs_for_ep{}".format(
                         main_df.index[loc])))

    # Rank variance (boxplot), sorted by the final rank
    boxplot(sorted(df_rank.values.T, key=lambda d: d[-1]),
            filepath=os.path.join(
                analysis_dir, "rank_boxplot_along_epochs_sorted_final_rank"),
            inverse_y=True)

    gt_order = np.argsort(-gt)

    # Group info
    np.savetxt(os.path.join(analysis_dir, "group_info_sorted_gt.txt"),
               main_grouping[gt_order], "%d")

    # Rank variance (boxplot), sorted by ground truth
    boxplot([df_rank[i] for i in gt_order if i in df_rank.columns],
            inverse_y=True,
            filepath=os.path.join(analysis_dir,
                                  "rank_boxplot_along_epochs_sorted_gt_rank"))
    boxplot([df_rank[i][-10:] for i in gt_order if i in df_rank.columns],
            inverse_y=True,
            filepath=os.path.join(
                analysis_dir,
                "rank_boxplot_along_epochs_sorted_gt_rank_last_10"))

    # Tau every epoch
    gt_tau_data = get_tau_along_epochs(main_df, gt, main.columns)
    report_mean_std_max_min(analysis_dir, logger, "GT-Tau-In-Window",
                            gt_tau_data)
    lineplot([stack_with_index(main_df.index, gt_tau_data)],
             filepath=os.path.join(analysis_dir, "tau_curve_along_epochs"))

    if finetune:
        # Finetune curves
        for data in finetune:
            try:
                finetune_step = data.finetune_step
                if by == "epochs":
                    finetune_step //= 196
                half_length = len(main_df.loc[main_df.index <= finetune_step])
                finetune_df = data.validation_acc_dataframe(
                    by, cutoff=finetune_step).iloc[:half_length]
                if finetune_step < min(
                        main_df.index) - 1 or finetune_step > max(
                            main_df.index) + 1:
                    continue
                finetune_df.index += finetune_step
                finetune_curves = [
                    np.array([[finetune_step, main_df.loc[finetune_step, i]]] +
                             list(zip(finetune_df.index, finetune_df[i])))
                    for i in main_df.columns
                ]
                finetune_tau_curve = get_tau_along_epochs(
                    finetune_df, gt, data.columns)
                finetune_colors = [
                    gt_rank_color[i] for i in finetune_df.columns
                ]
                logger.info(
                    "Finetune step {}, found {} finetune curves".format(
                        finetune_step, len(finetune_curves)))
                lineplot(
                    [c[:half_length] for c in acc_curves] + finetune_curves,
                    filepath=os.path.join(
                        analysis_dir,
                        "acc_curve_along_epochs_finetune_{}".format(
                            finetune_step)),
                    color=[gt_rank_color[i]
                           for i in main_df.columns] + finetune_colors,
                    alpha=0.7,
                    fmt=["-"] * len(acc_curves) + [":"] * len(finetune_curves))
                lineplot([
                    stack_with_index(main_df.index, gt_tau_data)[:half_length],
                    np.concatenate((np.array([[
                        finetune_step, gt_tau_data[half_length - 1]
                    ]]), stack_with_index(finetune_df.index,
                                          finetune_tau_curve)))
                ],
                         filepath=os.path.join(
                             analysis_dir,
                             "tau_curve_along_epochs_finetune_{}".format(
                                 finetune_step)),
                         color=["tab:blue", "tab:blue"],
                         alpha=1,
                         fmt=["-", ":"])
            except ValueError:
                pass

    # Tau every epoch group by groups
    grouping_info_backup = main.grouping_info.copy()
    divide_group = main.group_number == 1 and len(main.columns) == 64
    for partition_file in [None] + list(os.listdir("assets")):
        suffix = ""
        if partition_file is not None:
            if not partition_file.startswith("partition"):
                continue
            if not divide_group:
                continue
            suffix = "_" + os.path.splitext(partition_file)[0]
            # regrouping
            main.grouping_info = {
                idx: g
                for idx, g in enumerate(
                    np.loadtxt(os.path.join("assets", partition_file),
                               dtype=int))
            }

        tau_curves_by_groups = get_tau_curves_by_groups(
            main_df, gt, main.grouping_numpy, main.groups)
        tau_curves_by_groups_mean = [
            np.mean(tau_curves_by_groups[cur]) for cur in main.groups
        ]
        tau_curves_by_groups_std = [
            np.std(tau_curves_by_groups[cur]) for cur in main.groups
        ]
        report_mean_std_max_min(analysis_dir, logger,
                                "GT-Tau-By-Groups-Mean{}".format(suffix),
                                np.array(tau_curves_by_groups_mean))
        report_mean_std_max_min(analysis_dir, logger,
                                "GT-Tau-By-Groups-Std{}".format(suffix),
                                np.array(tau_curves_by_groups_std))
        tau_curves_by_groups_for_plt = [
            stack_with_index(main_df.index, tau_curves_by_groups[cur])
            for cur in main.groups
        ]

        pd.DataFrame(tau_curves_by_groups,
                     columns=main.groups,
                     index=main_df.index).to_csv(
                         os.path.join(
                             analysis_dir,
                             "tau_curves_by_groups{}.csv".format(suffix)))
        lineplot(tau_curves_by_groups_for_plt,
                 filepath=os.path.join(
                     analysis_dir, "tau_curves_by_groups{}".format(suffix)))

        # Acc curves (by group)
        with MultiPageContext(
                os.path.join(
                    analysis_dir, "acc_curve_along_epochs_group_each{}".format(
                        suffix))) as pdf:
            for g in range(main.group_number):
                subgraphs = np.where(main.grouping_numpy == g)[0]
                gt_rank_group = [gt_rank_color[i] for i in subgraphs]
                subgraph_names = list(
                    map(convert_subgraph_index_to_label, subgraphs))
                subgraph_names_ranks = [
                    "{} (Rank {})".format(name, gt_rank[i])
                    for name, i in zip(subgraph_names, subgraphs)
                ]
                # cannot leverage acc_curves, because it's a list, this can be a subset, which cannot be used as index
                lineplot([
                    np.array(list(zip(main_df.index, main_df[i])))
                    for i in subgraphs
                ] + [
                    stack_with_index(main_df.index,
                                     [gt[i]] * len(main_df.index))
                    for i in subgraphs
                ],
                         context=pdf,
                         color=gt_rank_group * 2,
                         alpha=0.8,
                         labels=subgraph_names_ranks,
                         fmt=["-D"] * len(subgraphs) + ["--"] * len(subgraphs),
                         markers=[subgraph_markers[i]
                                  for i in subgraphs] + [[]] * len(subgraphs),
                         title="Group {}, Subgraph {} -- {}".format(
                             g, "/".join(map(str, subgraphs)),
                             "/".join(subgraph_names)))

    main.grouping_info = grouping_info_backup

    # Tau among steps
    for k in (10, 64):
        max_tau_calc = min(k, len(main_df))
        tau_correlation = np.zeros((max_tau_calc, max_tau_calc))
        for i in range(max_tau_calc):
            for j in range(max_tau_calc):
                tau_correlation[i][j] = stats.kendalltau(
                    main_df.iloc[-i - 1], main_df.iloc[-j - 1])[0]
        heatmap(tau_correlation,
                filepath=os.path.join(analysis_dir,
                                      "tau_correlation_last_{}".format(k)))
        np.savetxt(
            os.path.join(analysis_dir,
                         "tau_correlation_last_{}.txt".format(k)),
            tau_correlation)
        tau_correlation = tau_correlation[np.triu_indices_from(tau_correlation,
                                                               k=1)]
        report_mean_std_max_min(analysis_dir, logger,
                                "Tau-as-Corr-Last-{}".format(k),
                                tau_correlation)

    # Calculate best tau and log
    ref_gt_acc, ref_gt_acc_tau = get_tau_along_epochs_combining_best_groups(
        main_df, gt, main_grouping, main.groups, main.columns)
    pd.DataFrame(ref_gt_acc).to_csv(
        os.path.join(analysis_dir,
                     "acc_epochs_combining_different_epochs_sorted_gt.csv"))
    lineplot(
        [stack_with_index(np.arange(len(ref_gt_acc_tau)), ref_gt_acc_tau)],
        filepath=os.path.join(
            analysis_dir,
            "tau_curve_epochs_sorted_combining_different_epochs"))

    # Show subgraph for each batch
    scatterplot([
        stack_with_index(main_archit.index, main_archit[col])
        for col in main_archit.columns
    ],
                filepath=os.path.join(analysis_dir,
                                      "subgraph_id_for_each_batch_validated"))

    # Substituted with ground truth rank
    scatterplot([
        stack_with_index(main_archit.index, gt_rank[main_archit[col]])
        for col in main_archit.columns
    ],
                filepath=os.path.join(
                    analysis_dir, "subgraph_rank_for_each_batch_validated"),
                inverse_y=True)

    # Top-K-Rank
    top_acc, top_rank = get_top_k_acc_rank(main_df.values, gt)
    plot_top_k_variance_chart(os.path.join(analysis_dir, "top_k_along_epochs"),
                              main_df.index, top_acc, top_rank, gt, (1, 3))

    # Observe last window (for diff. epochs)
    for k in (
            10,
            64,
    ):
        report_mean_std_max_min(analysis_dir, logger,
                                "GT-Tau-In-Window-Last-{}".format(k),
                                gt_tau_data[-k:])
        for v in (1, 3):
            report_mean_std_max_min(analysis_dir, logger,
                                    "Top-{}-Rank-Last-{}".format(v, k),
                                    top_rank[-k:, v - 1])
Example #35
df.drop(["estimation"], axis=1, inplace=True)


# aux = df.diff()  # optionally take first differences to compute correlations on non-stationary series
# df = aux


f, (ax1,ax2) = plt.subplots(2,1, figsize=(6, 12), sharex=False)

corr = df.corr() # Pearson
corrdiff = df.diff().corr()

# corr = df.corr(method='kendall')
# corr = df.corr(method='spearman')
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, k=1)] = True  # k=1 so the diagonal is still shown
with sns.axes_style("white"):
	# ax = sns.heatmap(corr, mask=mask, vmax=.3, square=True)

	sns.heatmap(
	    corr, 
	    mask=mask,
	    vmin=-1, vmax=1, center=0,
	    cmap=sns.diverging_palette(20, 220, n=200),
	    square=True,
	    annot=True,
	    fmt = '.2f',
	    linewidths=.5,
	    ax=ax1
	)
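
# A hedged sketch, not part of the original snippet: `corrdiff` (the correlation
# of the differenced series) is computed above and `ax2` is created, but the
# listing stops before plotting it; the second panel was presumably meant to
# mirror the first.
with sns.axes_style("white"):
    sns.heatmap(
        corrdiff,
        mask=mask,
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True,
        annot=True,
        fmt='.2f',
        linewidths=.5,
        ax=ax2
    )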
Example #36
def scores(key, paths, config, as_dataframe=False, algo_idx=None):
    # print(key, paths)
    # key = 'enettv_0.1_0.5_0.1'
    # paths = ['5cv/cv00/refit/enetgn_0.1_0.9_0.1', '5cv/cv01/refit/enetgn_0.1_0.9_0.1', '5cv/cv02/refit/enetgn_0.1_0.9_0.1', '5cv/cv03/refit/enetgn_0.1_0.9_0.1', '5cv/cv04/refit/enetgn_0.1_0.9_0.1']
    key_parts = key.split("_")
    algo = key_parts[algo_idx] if algo_idx is not None else None
    if algo is not None:
        key_parts.remove(algo)
    params = [None, None, None]
    if len(key_parts) > 0:
        try:
            params = [float(p) for p in key_parts]
        except ValueError:
            params = [None, None, None]
    print(algo, params)
    # fail only if the number of paths matches neither fold count
    if (len(paths) != NFOLDS_INNER) and (len(paths) != NFOLDS_OUTER):
        print("Failed for key %s" % key)
        return None

    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None

    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)

    # balanced accuracy (recall_mean)
    bacc_splits = [recall_score(y_true_splits[f], y_pred_splits[f], average=None).mean() for f in range(len(y_true_splits))]
    auc_splits = [roc_auc_score(y_true_splits[f], prob_pred_splits[f]) for f in range(len(y_true_splits))]

    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))
    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1, alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1, alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0], s[0], 0.5, alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5, alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1], s[0] + s[1], p=0.5, alternative='greater')

    for item in values:
        print(item["beta"].shape)

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :] for item in values]).T
    print(betas.shape)
    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
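    # Note: this is Fisher z averaging, z = arctanh(R) averaged across fold
    # pairs and mapped back with r_bar = tanh(z_bar); the explicit exp form
    # above is algebraically equivalent.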

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
                array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
                for i in range(betas.shape[0])])
        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0]) for j in range(i+1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(float(np.sum((A == B)[(A != 0) & (B != 0)])) / (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except Exception:
        dices = []
        betas_t = np.zeros_like(betas)
        dice_bar = fleiss_kappa_stat = 0

    # Proportion of selection within the support across the CV folds
    support_count = (betas_t != 0).sum(axis=0)
    support_count = support_count[support_count > 0]
    support_prop = support_count / betas_t.shape[0]

    scores = OrderedDict()
    scores['key'] = key
    scores['algo'] = algo
    scores['a'], scores['l1_ratio'], scores['tv_ratio'] = params

    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
                                    float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    scores['beta_support_prop_select_mean'] = support_prop.mean()
    scores['beta_support_prop_select_sd'] = support_prop.std()

    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())], columns=list(scores.keys()))

    return scores
Example #37
def make_symmetric_random(NUM_GENES):
    tmp = np.random.rand(NUM_GENES, NUM_GENES)
    sym = (tmp + tmp.T) / 2
    sym[np.triu_indices_from(sym)] = -sym[np.triu_indices_from(sym)]
    return sym
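
# Usage sketch (illustrative, assuming NumPy is imported as np): negating the
# upper triangle (diagonal included) makes the result anti-symmetric off the
# diagonal, i.e. sym[i, j] == -sym[j, i] for i != j.
m = make_symmetric_random(4)
i_up, j_up = np.triu_indices_from(m, k=1)
assert np.allclose(m[i_up, j_up], -m[j_up, i_up])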
Example #38
def build_ts_matric(df_init, win=20, lag=0, columns=None, rename=None, period='fullyear'):
    #%%
    '''
    period = ['fullyear', 'summer60days', 'pre60days']
    '''
    splits = df_init.index.levels[0]
    dates_full_orig = df_init.loc[0].index
    dates_RV_orig   = df_init.loc[0].index[df_init.loc[0]['RV_mask']==True]
    if columns is None:
        columns = df_init.columns

    df_cols = df_init[columns]
    TrainIsTrue = df_init['TrainIsTrue']

    list_test = []
    for s in range(splits.size):
        TestIsTrue = TrainIsTrue[s]==False
        list_test.append(df_cols.loc[s][TestIsTrue])

    df_test = pd.concat(list_test).sort_index()
    # shift precursor vs. tmax
    for c in df_test.columns[1:]:
        df_test[c] = df_test[c].shift(periods=-lag)

    # bin means
    df_test = df_test.resample(f'{win}D').mean()

    if period=='fullyear':
        dates_sel = dates_full_orig.strftime('%Y-%m-%d')
    elif period == 'summer60days':
        dates_sel = dates_RV_orig.strftime('%Y-%m-%d')
    elif period == 'pre60days':
        dates_sel = (dates_RV_orig - pd.Timedelta(60, unit='d')).strftime('%Y-%m-%d')

    # after resampling, not all selected dates are still present in the index:
    dates_sel =  pd.to_datetime([d for d in dates_sel if d in df_test.index] )
    df_period = df_test.loc[dates_sel, :].dropna()

    if rename is not None:
        df_period = df_period.rename(rename, axis=1)

    corr, sig_mask, pvals = corr_matrix_pval(df_period, alpha=0.01)

    # Generate a mask for the upper triangle
    mask_tri = np.zeros_like(corr, dtype=bool)
    mask_tri[np.triu_indices_from(mask_tri)] = True
    mask_sig = mask_tri.copy()
    mask_sig[sig_mask==False] = True

    # removing meaningless row and column
    cols = corr.columns
    corr = corr.drop(cols[0], axis=0).drop(cols[-1], axis=1)
    mask_sig = mask_sig[1:, :-1]
    mask_tri = mask_tri[1:, :-1]
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(10, 10))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, n=9, l=30, as_cmap=True)

    ax = sns.heatmap(corr, ax=ax, mask=mask_tri, cmap=cmap, vmax=1E99, center=0,
                square=True, linewidths=.5,
                 annot=False, annot_kws={'size':30}, cbar=False)


    sig_bold_labels = sig_bold_annot(corr, mask_sig)
    # Draw the heatmap with the mask and correct aspect ratio
    ax = sns.heatmap(corr, ax=ax, mask=mask_tri, cmap=cmap, vmax=1, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .8},
                 annot=sig_bold_labels, annot_kws={'size':30}, cbar=False, fmt='s')

    ax.tick_params(axis='both', labelsize=15,
                   bottom=True, top=False, left=True, right=False,
                   labelbottom=True, labeltop=False, labelleft=True, labelright=False)

    ax.set_xticklabels(corr.columns, fontdict={'fontweight':'bold',
                                               'fontsize':25})
    ax.set_yticklabels(corr.index, fontdict={'fontweight':'bold',
                                               'fontsize':25}, rotation=0)
    #%%
    return
Example #39
                if filename:
                    with open(filename[0]) as result_file:
                        first_line = result_file.readline()
                        test_size = int(
                            re.search("Size of test set: (\d+)",
                                      first_line).group(1))

                        if t == "total":
                            stable_core[classifier][algorithm][k] = int(
                                re.search("Stable Core: (\d+)",
                                          first_line).group(1)) / test_size

                        array = np.loadtxt(result_file,
                                           dtype=int,
                                           delimiter=",")
                        array = 1.0 - (array[np.triu_indices_from(array, 1)] /
                                       test_size)

                        overlap[t][classifier][algorithm][k] = np.mean(array)
                        overlap_std[t][classifier][algorithm][k] = np.std(
                            array)
                else:
                    print("Missing overlap results for", t, algorithm,
                          classifier, dataset)

                all_datapoints = np.empty(0)
                all_cores = []
                count = 0
                for filename in glob(
                        f"{basedir}/compare_embedding_errors/{t}*/self_{classifier}*{algorithm}*{dataset}*.txt"
                ):
Example #40
def corr(data,
         corr_method='spearman',
         annot=False,
         mask=True,
         line_width=1,
         line_color='black',
         color_grades=5,
         auto_sizing=True,
         palette='default',
         style='astetik',
         dpi=72,
         title='',
         sub_title='',
         x_label='',
         y_label='',
         legend=True,
         x_scale='linear',
         y_scale='linear',
         x_limit=None,
         y_limit=None,
         save=False):

    '''CORRELATION HEATMAP

    This is best used with less than 50 variables in the dataset.
    For best results, column labels should be clear and not too long.

    Inputs: a dataframe with several columns
    Features: Both categorical and continuous features will be used

    1. USE
    ======
    ast.corr(data=patients,
             corr_method='spearman',
             annot=True)

    2. PARAMETERS
    =============
    2.1 INPUT PARAMETERS
    --------------------
    data :: a pandas dataframe

    --------------------
    2.2. PLOT PARAMETERS
    --------------------
    corr_method :: The method that will be used for the correlation:
                    - 'pearson' : standard correlation coefficient
                    - 'kendall' : Kendall Tau correlation coefficient
                    - 'spearman' : Spearman rank correlation

    annot :: True if each cell should be annotated with its value

    mask :: If set to False, the full rectangular matrix is drawn instead
            of the lower triangle.

    line_width :: the width of the white lines between the elements. Keep it
                  small when there are many items.

    line_color :: the color of the lines between the elements e.g. 'black'

    auto_sizing :: If not True, an int value (in inches) used for both the
                   width and the height of the figure.

    color_grades :: The number of colors/shades to use in total. 5 is default.
                    Generally the best results come with 3 or 5 or 7, but looks
                    better with more colors.

    ----------------------
    2.3. COMMON PARAMETERS
    ----------------------
    palette :: One of the astetik palettes:
                'default'
                'colorblind'
                'blue_to_red'
                'blue_to_green'
                'red_to_green'
                'green_to_red'
                'violet_to_blue'
                'brown_to_green'
                'green_to_marine'

                Or use any cmap, seaborn or matplotlib
                color or palette code, or hex value.

    style :: Use one of the three core styles:
                'astetik'     # white
                '538'         # grey
                'solarized'   # sepia

              Or alternatively use any matplotlib or seaborn
              style definition.

    dpi :: the resolution of the plot (int value)

    title :: the title of the plot (string value)

    sub_title :: a secondary title to be shown below the title

    x_label :: string value for x-axis label

    y_label :: string value for y-axis label

    x_scale :: 'linear' or 'log' or 'symlog'

    y_scale :: 'linear' or 'log' or 'symlog'

    x_limit :: int or list with two ints

    y_limit :: int or list with two ints

    outliers :: Remove outliers using either 'zscore' or 'iqr'
    '''

    # # # # # PREP STARTS # # # # #
    data = data.corr(method=corr_method)

    if mask == True:
        mask = np.zeros_like(data)
        mask[np.triu_indices_from(mask)] = True
        line_color = 'white'
    else:
        mask = None
    # # # # # PREP ENDS # # # # #

    # HEADER STARTS >>>
    palette = _header(palette,
                      style,
                      n_colors=color_grades,
                      dpi=dpi)

    if auto_sizing == True:
        size = data.shape[0] / 2 + 5
    else:
        size = auto_sizing

    # PLOT
    p, ax = plt.subplots(figsize=(size, size))

    p = sns.heatmap(data,
                    mask=mask,
                    linewidths=line_width,
                    linecolor=line_color,
                    cmap=palette,
                    annot=annot)

    # HEADER
    _titles(title, sub_title=sub_title)
    _footer(p, x_label, y_label, save=save, tight=False, despine=False)

    p.set_xticklabels(data.columns, rotation=90)
    p.set_yticklabels(data.index, rotation=0)

def feature_corr(Ord, ftlist, taglist, n_class, cross='feature', corr='dtw'):
    ftlist = ftlist.T[Ord].apply(pd.Series).T
    ftrs = {'xa':[], 'ya':[], 'za':[], 'xw':[], 'yw':[], 'zw':[]}
    for i, k in enumerate(ftrs.keys()):
        ftrs[k] = ftlist.iloc[i]

    # only use one axis (zw) for the analysis across activities
    tag_class, feature = {a:[] for a in n_class}, {a:[] for a in n_class}

    for k, v in taglist.items():
        tag_class[v[Ord]] += [k]

    target_df = ftrs['zw']
    feature_no = len(target_df.iloc[0])

    for act, nums in tag_class.items():
        for no in nums:
            feature[act].append(target_df.loc[no])
        feature[act] = np.array(feature[act])

    # correlation across activities, one matrix per feature
    # feature_across_acts = [Mat, ...]: one entry per feature
    # Mat is n_acts x n_acts (n_acts = number of activity classes)

    feature_across_acts = []
    for i in range(feature_no):
        Mat = []
        for act1, mat1 in feature.items():
            mat = []
            for act2, mat2 in feature.items():
                ft_1 = mat1[:, i]
                ft_2 = mat2[:, i]
                corrf = corrolate(ft_1, ft_2, corr=corr)
                mat.append(corrf)
            Mat.append(mat)
        feature_across_acts.append(np.array(Mat))

    feature_across_feature = []
    # mat holds the sliding-window feature series for a specific activity
    for act, mat in feature.items():
        M = []
        for i in range(feature_no):
            m = []
            for j in range(feature_no):
                ft_1 = mat[:, i]
                ft_2 = mat[:, j]
                corrf = corrolate(ft_1, ft_2, corr=corr)
                m.append(corrf)
            M.append(m)
        feature_across_feature.append(np.array(M))

    # print(feature_across_feature[0])
    n = [i for i in range(feature_no)]

    # plot the heatmap of any individual feature
    if cross == 'feature':
        heat = feature_across_feature[0]
    else:
        heat = feature_across_acts[0]
    label = n_class if (heat is feature_across_acts[0]) else n
    sns.set_theme(style="white")
    mask = np.zeros_like(heat)
    mask[np.triu_indices_from(mask)] = True
    sns.heatmap(heat, annot=True, xticklabels=label, yticklabels=label, mask=mask)
Example #42
def pipeline_for_inter_instance(logger, analysis_dir, data, by, gt):
    logger.info("Analysing results for {}".format(analysis_dir))

    data_as_df = [d.validation_acc_dataframe(by) for d in data]
    os.makedirs(analysis_dir, exist_ok=True)
    subgraphs = data[0].columns
    for d in data:
        assert d.columns == subgraphs

    final_acc = np.zeros((len(data), len(subgraphs)))
    for i, df in enumerate(data_as_df):
        final_acc[i] = df.iloc[-1]

    # Consecutive tau (multi)
    lineplot([
        np.array(list(zip(df.index[1:], get_consecutive_rank_tau(df))))
        for df in data_as_df
    ],
             filepath=os.path.join(analysis_dir, "taus_consecutive_epochs"))

    # Final acc distribution
    boxplot(final_acc, filepath=os.path.join(analysis_dir, "final_acc"))

    # Final rank distribution
    final_rank = np.stack([rankdata_greater(row) for row in final_acc])
    boxplot(final_rank,
            filepath=os.path.join(analysis_dir, "final_rank_boxplot"),
            inverse_y=True)

    # GT-Tau
    gt_tau = np.array(
        [stats.kendalltau(row, gt[subgraphs])[0] for row in final_acc])
    np.savetxt(os.path.join(analysis_dir, "inst_gt_tau.txt"), gt_tau)
    report_mean_std_max_min(analysis_dir, logger, "GT-Tau", gt_tau)

    # Tau every epoch
    tau_data = [get_tau_along_epochs(df, gt, subgraphs) for df in data_as_df]
    tau_data_mean_over_instances = np.mean(np.stack(tau_data, axis=0), axis=0)
    report_mean_std_max_min(analysis_dir, logger, "GT-Tau-In-Window",
                            np.concatenate(tau_data))
    tau_curves = [
        stack_with_index(df.index, tau_d)
        for df, tau_d in zip(data_as_df, tau_data)
    ]
    lineplot(tau_curves,
             filepath=os.path.join(analysis_dir, "tau_curve_along_epochs"))
    for k in (10, 64):
        tau_data_clip = [t[-k:] for t in tau_data]
        report_mean_std_max_min(analysis_dir, logger,
                                "GT-Tau-In-Window-Last-{}-Mean".format(k),
                                np.array([np.mean(t) for t in tau_data_clip]))
        report_mean_std_max_min(analysis_dir, logger,
                                "GT-Tau-In-Window-Last-{}-Std".format(k),
                                np.array([np.std(t) for t in tau_data_clip]))
        report_mean_std_max_min(analysis_dir, logger,
                                "GT-Tau-In-Window-Last-{}-Max".format(k),
                                np.array([np.max(t) for t in tau_data_clip]))
        report_mean_std_max_min(analysis_dir, logger,
                                "GT-Tau-In-Window-Last-{}-Min".format(k),
                                np.array([np.min(t) for t in tau_data_clip]))

        acc_data = [np.mean(df.iloc[-k:].values, axis=0) for df in data_as_df]
        report_mean_std_max_min(analysis_dir, logger,
                                "Acc-Mean-In-Window-Last-{}-Mean".format(k),
                                np.array([np.mean(x) for x in acc_data]))
        report_mean_std_max_min(analysis_dir, logger,
                                "Acc-Mean-In-Window-Last-{}-Std".format(k),
                                np.array([np.std(x) for x in acc_data]))

    # S-Tau (pairwise tau between instances, per epoch)
    s_tau = np.zeros((min(map(lambda d: len(d),
                              data_as_df)), len(data), len(data)))
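    # s_tau[k, i, j] is the Kendall tau between instances i and j at epoch k,
    # truncated to the length of the shortest run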
    for k in range(len(s_tau)):
        for i, table1 in enumerate(data_as_df):
            for j, table2 in enumerate(data_as_df):
                s_tau[k][i][j], _ = stats.kendalltau(table1.iloc[k],
                                                     table2.iloc[k])
    np.savetxt(os.path.join(analysis_dir, "inter_inst_s_tau.txt"), s_tau[-1])
    heatmap(s_tau[0],
            filepath=os.path.join(analysis_dir,
                                  "inter_inst_last_s_tau_heatmap"),
            figsize=(10, 10))
    if len(data) > 1:
        upper = np.triu_indices_from(s_tau[0], k=1)
        report_mean_std_max_min(analysis_dir, logger, "S-Tau-Last",
                                s_tau[-1][upper])
        s_tau_mean = np.mean(s_tau[:, upper[0], upper[1]], axis=1)
        s_tau_std = np.std(s_tau[:, upper[0], upper[1]], axis=1)
        report_mean_std_max_min(analysis_dir, logger, "S-Tau-Min",
                                s_tau[np.argmin(s_tau_mean)][upper])
        s_tau_errorbar = np.stack(
            [np.arange(len(s_tau)), s_tau_mean, s_tau_std], axis=1)
        errorbar([s_tau_errorbar],
                 filepath=os.path.join(analysis_dir, "inter_inst_s_tau_curve"))

        # S-Tau (without variance)
        lineplot([s_tau_errorbar[:, :2]],
                 fmt=["-o"],
                 filepath=os.path.join(
                     analysis_dir,
                     "inter_inst_s_tau_curve_along_epochs_without_var"))

        # Compare with GT-Tau
        lineplot(tau_curves + [s_tau_errorbar],
                 fmt=["-"] * len(tau_curves) + [":"],
                 filepath=os.path.join(
                     analysis_dir, "tau_curve_along_epochs_compare_to_s_tau"))

        lineplot([
            np.stack([
                np.arange(len(tau_data_mean_over_instances)),
                tau_data_mean_over_instances
            ],
                     axis=1)
        ] + [s_tau_errorbar],
                 fmt=["-", ":"],
                 filepath=os.path.join(
                     analysis_dir,
                     "tau_curve_along_epochs_mean_compare_to_s_tau"))

    # Final rank dist (sorted by GT)
    gt_rank = sorted(np.arange(len(subgraphs)),
                     key=lambda i: gt[subgraphs[i]],
                     reverse=True)
    final_rank_resorted = final_rank[:, gt_rank]
    boxplot(final_rank_resorted,
            filepath=os.path.join(analysis_dir,
                                  "final_rank_boxplot_sorted_gt"),
            inverse_y=True)

    # Tau sorted
    ref_gt_acc_taus = []
    for df, raw in zip(data_as_df, data):
        _, ref_gt_acc_tau = get_tau_along_epochs_combining_best_groups(
            df, gt, raw.grouping_numpy, raw.groups, subgraphs)
        ref_gt_acc_taus.append(
            stack_with_index(np.arange(len(ref_gt_acc_tau)), ref_gt_acc_tau))
    lineplot(ref_gt_acc_taus,
             filepath=os.path.join(
                 analysis_dir, "tau_curves_sorted_combining_different_epochs"))

    # Top-K-Rank
    top_acc, top_rank = get_top_k_acc_rank(final_acc, gt)
    topk = (1, 3)
    for k in topk:
        report_mean_std_max_min(analysis_dir, logger, "Top-{}-Acc".format(k),
                                top_acc[:, k - 1])
        report_mean_std_max_min(analysis_dir, logger, "Top-{}-Rank".format(k),
                                top_rank[:, k - 1])
    plot_top_k_variance_chart(os.path.join(analysis_dir, "inst_top_k"),
                              np.arange(len(top_acc)), top_acc, top_rank, gt,
                              topk)

    # Average final acc
    avg_acc = np.mean(final_acc, axis=0)
    np.savetxt(os.path.join(analysis_dir, "average_final_acc.txt"), avg_acc)
    std_acc = np.std(final_acc, axis=0)
    np.savetxt(os.path.join(analysis_dir, "std_final_acc.txt"), std_acc)
Example #43
    def FeatureAnalysis(self, issave=True, isfigshow=False):
        """Feature analysis such as illustrations of the monitored costs, correlations and comparison of system and meter measurements."""

        print('monitored cost dataset analysis...')
        label_fontsize = FIGURE_LBL_FONTSIZE_MAIN

        dataset = self.dataset.copy()
        dataset.sort_values(by=['Number of Houses', 'Number of Processes'],
                            ascending=True,
                            inplace=True)

        dataset["ProcTime (" + r'$\Delta$' + ")" +
                units['ProcTime_FromMeter']] = dataset[
                    'ProcTime_FromMeter'] - dataset['ProcTime_FromSystem']
        dataset["ProcTime (" + r'$\Delta$' +
                ") (%)"] = 100 * dataset["ProcTime (" + r'$\Delta$' + ")" +
                                         units['ProcTime_FromMeter']].divide(
                                             dataset["ProcTime_FromMeter"],
                                             axis="index").copy()

        if "ProcTime_FromSystem" in self.targets:
            self.targets.remove("ProcTime_FromSystem")

        # Correlation analysis
        label_vars = self.features[:]
        label_vars.extend(self.targets)
        train_data = dataset[label_vars]
        cm = train_data.corr()
        mask = np.zeros_like(cm, dtype=bool)
        mask[np.triu_indices_from(mask)] = True

        fig, ax = plt.subplots(figsize=(6, 6))
        fmt = '.2f'
        label_vars = [
            var + ' (target)' if var in self.targets else var + ' (feature)'
            for var in label_vars
        ]
        annot_kws = {"size": label_fontsize + 2, "ha": 'center', "va": 'top'}
        sns.heatmap(cm,
                    annot=True,
                    annot_kws=annot_kws,
                    fmt=fmt,
                    square=True,
                    cmap='coolwarm',
                    mask=mask,
                    ax=ax,
                    linewidths=0.1)
        plt.xticks(rotation=60, fontsize=label_fontsize)
        plt.yticks(rotation=0, fontsize=label_fontsize)
        ax.set_xticklabels(label_vars, fontsize=label_fontsize)
        ax.set_yticklabels(label_vars, fontsize=label_fontsize)
        ax.set_xticks(np.arange(0, len(label_vars), 1))
        ax.set_yticks(np.arange(0.5, len(label_vars), 1))
        ax.tick_params(axis='both', which='major', labelsize=label_fontsize)
        filename_plot = "{}/correlation_analysis".format(self.figure_path)
        util.SaveFigure(filename_plot, fig, isshow=isfigshow, issave=issave)

        # %% Feature analysis
        dataset['Number of Houses'] = dataset['Number of Houses'].astype('int')
        dataset['Number of Processes'] = dataset['Number of Processes'].astype(
            'int')

        y_vars = [
            "ProcTime (" + r'$\Delta$' + ")" + units['ProcTime_FromMeter'],
            "ProcTime (" + r'$\Delta$' + ") (%)"
        ]
        x_var = 'Number of Processes'
        # print(dataset[y_vars + [x_var]].groupby([x_var]).describe())
        for i, y_var in enumerate(y_vars):
            fig, ax = plt.subplots()
            fig = sns.catplot(x=x_var,
                              y=y_var,
                              kind="box",
                              data=dataset,
                              **kws_box_2)
            plt.xlabel(x_var, fontsize=label_fontsize)
            plt.ylabel(y_var, fontsize=label_fontsize)
            y_var2 = y_var
            if '%' in y_var:
                y_var2 = y_var.replace("(%)", "_perc")
            y_var2 = y_var2.replace("$", "").replace("\\", "").replace(
                "(", "").replace(")", "")

            plt.tick_params(axis='both',
                            which='major',
                            labelsize=label_fontsize)
            X, Y = dataset[x_var].values, dataset[y_var].values
            util.PlotGridSpacing(X, Y, x_gridno=8, y_gridno=6)
            filename_plot = "{}/{}_vs_{}".format(self.figure_path, x_var,
                                                 y_var2)
            util.SaveFigure(filename_plot,
                            fig,
                            isshow=isfigshow,
                            issave=issave)

        y_vars = self.targets[:]
        x_var = 'Number of Processes'
        hue = 'Number of Houses'
        for i, y_var in enumerate(y_vars):
            fig, ax = plt.subplots()
            fig = sns.catplot(kind="point",
                              x=x_var,
                              y=y_var,
                              data=dataset,
                              hue=hue,
                              linestyles=linestyles,
                              markers=filled_markers,
                              markersize=10,
                              legend=False,
                              **kws_online_2)
            # plt.legend(fontsize=label_fontsize - 1, frameon=True, framealpha=0.5, title=hue, ncol=2, loc='upper right', bbox_to_anchor=(1, 0.99))
            plt.xlabel(x_var, fontsize=label_fontsize)
            plt.ylabel(y_var + units[y_var], fontsize=label_fontsize)
            plt.tick_params(axis='both',
                            which='major',
                            labelsize=label_fontsize)
            X, Y = dataset[x_var].values, dataset[y_var].values
            util.PlotGridSpacing(X, Y, x_gridno=8, y_gridno=6)
            filename_plot = "{}/{}_vs_{}".format(self.figure_path, x_var,
                                                 y_var)
            util.SaveFigure(filename_plot,
                            fig,
                            isshow=isfigshow,
                            issave=issave)

            dataset[y_var + "_puh"] = dataset[y_var].divide(
                dataset["Number of Houses"], axis="index").copy()
            fig = sns.catplot(kind="point",
                              x=x_var,
                              y=y_var + "_puh",
                              data=dataset,
                              hue=hue,
                              linestyles=linestyles,
                              markers=filled_markers,
                              markersize=10,
                              legend=False,
                              **kws_online_2)
            plt.xlabel(x_var, fontsize=label_fontsize)
            plt.ylabel(y_var + units[y_var], fontsize=label_fontsize)
            plt.tick_params(axis='both',
                            which='major',
                            labelsize=label_fontsize)
            plt.legend(fontsize=label_fontsize - 3,
                       frameon=True,
                       framealpha=0.5,
                       title=hue,
                       ncol=2,
                       loc='upper right')
            X, Y = dataset[x_var].values, dataset[y_var + "_puh"].values
            util.PlotGridSpacing(X, Y, x_gridno=8, y_gridno=6)
            filename_plot = "{}/{}_vs_{}_puh".format(self.figure_path, x_var,
                                                     y_var)
            util.SaveFigure(filename_plot,
                            fig,
                            isshow=isfigshow,
                            issave=issave)

        # print(dataset[[y_var +"_puh" for y_var in y_vars] + [x_var]].groupby([x_var]).describe())
        y_vars = [
            'house_energy_mean_kwh', "house_energy_median_kwh",
            'house_energy_max_kwh'
        ]
        x_var = 'Number of Processes'
        hue = 'Number of Houses'
        for i, y_var in enumerate(y_vars):
            fig = sns.catplot(kind="point",
                              x=x_var,
                              y=y_var,
                              data=dataset,
                              hue=hue,
                              linestyles=linestyles,
                              markers=filled_markers,
                              markersize=10,
                              legend=False,
                              **kws_online_2)
            plt.xlabel(x_var, fontsize=label_fontsize)
            plt.ylabel(y_var, fontsize=label_fontsize)
            plt.tick_params(axis='both',
                            which='major',
                            labelsize=label_fontsize)
            plt.legend(fontsize=label_fontsize - 3,
                       frameon=True,
                       framealpha=0.5,
                       title=hue,
                       ncol=4,
                       bbox_to_anchor=(1., 1 + 0.21 * 2))
            X, Y = dataset[x_var].values, dataset[y_var].values
            util.PlotGridSpacing(X, Y, x_gridno=8, y_gridno=6)
            filename_plot = "{}/{}_vs_{}".format(self.figure_path, x_var,
                                                 y_var)
            util.SaveFigure(filename_plot,
                            fig,
                            isshow=isfigshow,
                            issave=issave)
        # print(dataset[y_vars + [x_var]].groupby([x_var]).describe())

        y_vars = ['house_energy_total_kwh']
        x_var = 'Number of Processes'
        hue = 'Number of Houses'
        for i, y_var in enumerate(y_vars):
            dataset[y_var + "_puh"] = dataset[y_var].divide(
                dataset["Number of Houses"], axis="index").copy()
            fig = sns.catplot(kind="point",
                              x=x_var,
                              y=y_var + "_puh",
                              data=dataset,
                              hue=hue,
                              linestyles=linestyles,
                              markers=filled_markers,
                              markersize=10,
                              legend=False,
                              **kws_online_2)
            plt.xlabel(x_var, fontsize=label_fontsize)
            plt.ylabel(y_var, fontsize=label_fontsize)
            plt.tick_params(axis='both',
                            which='major',
                            labelsize=label_fontsize)
            plt.legend(fontsize=label_fontsize - 3,
                       frameon=True,
                       framealpha=0.5,
                       title=hue,
                       ncol=4,
                       bbox_to_anchor=(1., 1 + 0.21 * 2))
            X, Y = dataset[x_var].values, dataset[y_var + "_puh"].values
            util.PlotGridSpacing(X, Y, x_gridno=8, y_gridno=6)
            filename_plot = "{}/{}_vs_{}_puh".format(self.figure_path, x_var,
                                                     y_var)
            util.SaveFigure(filename_plot,
                            fig,
                            isshow=isfigshow,
                            issave=issave)
Example #44
def confounds_correlation_plot(confounds_file,
                               output_file=None,
                               figure=None,
                               reference='global_signal',
                               max_dim=70):
    """
    Parameters
    ----------
    confounds_file: str
        File containing all confound regressors to be included in the
        correlation plot.
    output_file: str or None
        Path where the output figure should be saved. If this is not defined,
        then the plotting axes will be returned instead of the saved figure
        path.
    figure: figure or None
        Existing figure on which to plot.
    reference: str
        `confounds_correlation_plot` prepares a bar plot of the correlations
        of each confound regressor with a reference column. By default, this
        is the global signal (so that collinearities with the global signal
        can readily be assessed).
    max_dim: int
        The maximum number of regressors to be included in the output plot.
        Reductions (e.g., CompCor) of high-dimensional data can yield so many
        regressors that the correlation structure becomes obfuscated. This
        criterion selects the `max_dim` regressors that have the largest
        correlation magnitude with `reference` for inclusion in the plot.

    Returns
    -------
    axes and gridspec
        Plotting axes and gridspec. Returned only if `output_file` is None.
    output_file: str
        The file where the figure is saved.
    """
    confounds_data = pd.read_table(confounds_file)
    confounds_data = confounds_data.loc[:,
                                        np.logical_not(
                                            np.isclose(
                                                confounds_data.var(
                                                    skipna=True), 0))]
    corr = confounds_data.corr()
    np.fill_diagonal(corr.values, 0)
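    # zero out the self-correlations so they do not dominate the colour scale
    # or the reference-based ranking below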

    gscorr = corr.copy()
    gscorr['index'] = gscorr.index
    gscorr[reference] = np.abs(gscorr[reference])
    gs_descending = gscorr.sort_values(by=reference, ascending=False)['index']

    if corr.shape[0] > max_dim:
        gs_descending = gs_descending[:max_dim]
        features = [p for p in corr.columns if p in gs_descending]
        corr = corr.loc[features, features]
    n_vars = corr.shape[0]

    if figure is None:
        plt.figure(figsize=(3 * n_vars * 0.3, n_vars * 0.3))
    gs = mgs.GridSpec(1, 15)
    ax0 = plt.subplot(gs[0, :7])
    ax1 = plt.subplot(gs[0, 7:])

    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    sns.heatmap(corr,
                linewidths=0.5,
                cmap='coolwarm',
                center=0,
                square=True,
                ax=ax0)
    ax0.tick_params(axis='both', which='both', width=0)

    for tick in ax0.xaxis.get_major_ticks():
        tick.label.set_fontsize('small')
    for tick in ax0.yaxis.get_major_ticks():
        tick.label.set_fontsize('small')
    sns.barplot(data=gscorr,
                x='index',
                y=reference,
                ax=ax1,
                order=gs_descending,
                palette='Reds_d',
                saturation=.5)

    ax1.set_xlabel('Confound time series')
    ax1.set_ylabel('Magnitude of correlation with {}'.format(reference))
    ax1.tick_params(axis='x', which='both', width=0)
    ax1.tick_params(axis='y', which='both', width=5, length=5)

    for tick in ax1.xaxis.get_major_ticks():
        tick.label.set_fontsize('small')
        tick.label.set_rotation('vertical')
    for tick in ax1.yaxis.get_major_ticks():
        tick.label.set_fontsize('small')
    for side in ['top', 'right', 'left']:
        ax1.spines[side].set_color('none')
        ax1.spines[side].set_visible(False)

    if output_file is not None:
        figure = plt.gcf()
        figure.savefig(output_file, bbox_inches='tight')
        plt.close(figure)
        figure = None
        return output_file
    return [ax0, ax1], gs
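
# Usage sketch (hedged): "confounds.tsv" stands in for a real tab-separated
# confounds file; with output_file set, the figure is written to disk and its
# path is returned instead of the plotting axes.
out_path = confounds_correlation_plot("confounds.tsv",
                                      output_file="confounds_corr.svg",
                                      reference="global_signal",
                                      max_dim=70)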
Example #45
def mustache(c, chromosome, chromosome2, res, start, end, mask_size,
             distance_in_px, octave_values, st, pt):

    nz = np.logical_and(c != 0, np.triu(c, 4))
    if np.sum(nz) < 50:
        return []
    c[np.tril_indices_from(c, 4)] = 2

    if chromosome == chromosome2:
        c[np.triu_indices_from(c, k=(distance_in_px + 1))] = 2

    pAll = np.ones_like(c[nz]) * 2
    Scales = np.ones_like(pAll)
    vAll = np.zeros_like(pAll)
    s = 10
    #curr_filter = 1
    scales = {}
    for o in octave_values:
        scales[o] = {}
        sigma = o
        w = 2 * math.ceil(2 * sigma) + 1
        t = (((w - 1) / 2) - 0.5) / sigma
        Gp = gaussian_filter(c, o, truncate=t, order=0)
        scales[o][1] = sigma

        sigma = o * 2**((2 - 1) / s)
        w = 2 * math.ceil(2 * sigma) + 1
        t = (((w - 1) / 2) - 0.5) / sigma
        Gc = gaussian_filter(c, sigma, truncate=t, order=0)
        scales[o][2] = sigma

        Lp = Gp - Gc
        Gp = []

        sigma = o * 2**((3 - 1) / s)
        w = 2 * math.ceil(2 * sigma) + 1
        t = (((w - 1) / 2) - 0.5) / sigma
        Gn = gaussian_filter(c, sigma, truncate=t, order=0)
        scales[o][3] = sigma

        #Lp = Gp - Gc
        Lc = Gc - Gn

        locMaxP = maximum_filter(Lp,
                                 footprint=np.ones((3, 3)),
                                 mode='constant')
        locMaxC = maximum_filter(Lc,
                                 footprint=np.ones((3, 3)),
                                 mode='constant')
        for i in range(3, s + 2):
            #curr_filter += 1
            Gc = Gn

            sigma = o * 2**((i) / s)
            w = 2 * math.ceil(2 * sigma) + 1
            t = ((w - 1) / 2 - 0.5) / sigma
            Gn = gaussian_filter(c, sigma, truncate=t, order=0)
            scales[o][i + 1] = sigma

            Ln = Gc - Gn
            dist_params = expon.fit(np.abs(Lc[nz]))
            pval = 1 - expon.cdf(np.abs(Lc[nz]), *dist_params)
            locMaxN = maximum_filter(Ln,
                                     footprint=np.ones((3, 3)),
                                     mode='constant')

            willUpdate = np.logical_and \
                .reduce((Lc[nz] > vAll, Lc[nz] == locMaxC[nz],
                         np.logical_or(Lp[nz] == locMaxP[nz],
                                       Ln[nz] == locMaxN[nz]),
                         Lc[nz] > locMaxP[nz],
                         Lc[nz] > locMaxN[nz]))
            vAll[willUpdate] = Lc[nz][willUpdate]
            Scales[willUpdate] = scales[o][i]
            pAll[willUpdate] = pval[willUpdate]
            Lp = Lc
            Lc = Ln
            locMaxP = locMaxC
            locMaxC = locMaxN

    pFound = pAll != 2
    if len(pFound) < 10000:
        return []
    _, pCorrect, _, _ = multipletests(pAll[pFound], method='fdr_bh')
    pAll[pFound] = pCorrect

    o = np.ones_like(c)
    o[nz] = pAll
    sig_count = np.sum(o < pt)  #change
    x, y = np.unravel_index(np.argsort(o.ravel()), o.shape)
    so = np.ones_like(c)
    so[nz] = Scales
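    # `so` holds, for every retained (non-zero) pixel, the Gaussian scale
    # (sigma) at which its strongest difference-of-Gaussians response occurred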

    x = x[:sig_count]
    y = y[:sig_count]
    xyScales = so[x, y]

    nonsparse = x != 0
    for i in range(len(xyScales)):
        s = math.ceil(xyScales[i])
        c1 = np.sum(nz[x[i]-s:x[i]+s+1, y[i]-s:y[i]+s+1]) / \
            ((2*s+1)**2)
        s = 2 * s
        c2 = np.sum(nz[x[i]-s:x[i]+s+1, y[i]-s:y[i]+s+1]) / \
            ((2*s+1)**2)
        if c1 < st or c2 < 0.6:
            nonsparse[i] = False
    x = x[nonsparse]
    y = y[nonsparse]

    if len(x) == 0:
        return []

    def nz_mean(vals):
        return np.mean(vals[vals != 0])

    def diag_mean(k, map):
        return nz_mean(map[kth_diag_indices(map, k)])

    if chromosome == chromosome2:
        means = np.vectorize(diag_mean, excluded=['map'])(k=y - x, map=c)
        passing_indices = c[x, y] > 2 * means  #change
        if len(passing_indices) == 0 or np.sum(passing_indices) == 0:
            return []
        x = x[passing_indices]
        y = y[passing_indices]

    label_matrix = np.zeros((np.max(y) + 2, np.max(y) + 2), dtype=np.float32)
    label_matrix[x, y] = o[x, y] + 1
    label_matrix[x + 1, y] = 2
    label_matrix[x + 1, y + 1] = 2
    label_matrix[x, y + 1] = 2
    label_matrix[x - 1, y] = 2
    label_matrix[x - 1, y - 1] = 2
    label_matrix[x, y - 1] = 2
    label_matrix[x + 1, y - 1] = 2
    label_matrix[x - 1, y + 1] = 2
    num_features = scipy_measurements.label(label_matrix,
                                            output=label_matrix,
                                            structure=np.ones((3, 3)))

    out = []
    for label in range(1, num_features + 1):
        indices = np.argwhere(label_matrix == label)
        i = np.argmin(o[indices[:, 0], indices[:, 1]])
        _x, _y = indices[i, 0], indices[i, 1]
        out.append([_x + start, _y + start, o[_x, _y], so[_x, _y]])

    return out
Example #46
    def __init__(
            self,
            num_environment,
            num_agents,
            num_managers,
            innoise,
            outnoise,
            fanout,
            envnoise,
            envobsnoise,  #statedim,
            batchsize,
            optimizer,
            env_input,
            env_pattern_input=None,
            agent_type="sigmoid",
            agent_order='linear',
            network_type=None,
            network_prespecified_input=None,
            network_update_method=None,
            dropout_rate=0.0,
            dropout_type='AllIn',
            L1_norm=.0,
            weight_on_cost=0.,
            weight_update=False,
            initializer_type='zeros',
            dunbar_number=2,
            dunbar_function='linear_kth',
            randomSeed=False,
            decay=None,
            tensorboard_filename=None,
            **kwargs):

        self.sess = tf.Session()

        #For Debug
        self.task_loss_list = []
        self.cost_loss_list = []
        self.total_loss_list = []

        self.num_environment = num_environment
        self.num_agents = num_agents
        if num_managers is "AllButOne":
            self.num_managers = num_agents - 1
        else:
            self.num_managers = num_managers

        self.agent_order = agent_order

        self.batchsize = batchsize
        self.envobsnoise = envobsnoise
        self.agents = []
        for i in range(num_agents):
            self.agents.append(
                Agent(innoise,
                      outnoise,
                      i,
                      fanout,
                      batchsize,
                      num_agents,
                      num_environment,
                      dunbar_number,
                      initializer_type=initializer_type))  #, statedim

        self.env_input = env_input
        self.env_pattern_input = env_pattern_input
        with tf.name_scope("Environment"):
            if env_pattern_input is None:
                self.environment = tf.random_normal(
                    [self.batchsize, num_environment],
                    mean=0.0,
                    stddev=1.0,
                    dtype=tf.float32)
                zero = tf.convert_to_tensor(0.0, tf.float32)
                greater = tf.greater(self.environment,
                                     zero,
                                     name="Organization_greater")
                self.environment = tf.where(greater,
                                            tf.ones_like(self.environment),
                                            tf.zeros_like(self.environment),
                                            name="where_env")
            else:
                self.environment = tf.placeholder(
                    tf.float32, shape=[self.batchsize, self.num_environment])
                self.env_pattern = tf.placeholder(tf.float32,
                                                  shape=[self.batchsize, 1])
        with tf.name_scope('Network_prespecified'):
            #num_environment+num_agents times num_agents matrix of binary
            #Includes environment, but not bias
            self.network_prespecified = tf.placeholder(
                tf.float32,
                shape=[
                    self.num_environment + self.num_agents, self.num_agents
                ])
            if network_prespecified_input is None:  #all the edges are possible
                temp = np.zeros(
                    [self.num_environment + self.num_agents, self.num_agents])
                temp[np.triu_indices_from(temp, k=-self.num_environment)] = 1.
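                # with k=-num_environment every environment row is enabled for
                # all agents, while agent row a can feed agent j only when
                # a <= j, i.e. a feed-forward ordering over the agents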
                self.network_prespecified_input = temp
                #self.network_prespecified_input = np.ones([self.num_environment+self.num_agents,self.num_agents])
            else:
                self.network_prespecified_input = network_prespecified_input

        self.network_update_method = network_update_method

        self.dropout_rate = dropout_rate
        self.dropout_type = dropout_type

        self.L1_norm = L1_norm

        if weight_update is False:
            self.weight_on_cost = tf.constant(
                weight_on_cost, dtype=tf.float32
            )  #the weight on the listening cost in loss function
            self.weight_on_cost_val = weight_on_cost
        elif weight_update is True:
            self.weight_on_cost = tf.get_variable(name="weight_on_cost",
                                                  dtype=tf.float32,
                                                  initializer=tf.constant(
                                                      weight_on_cost,
                                                      dtype=tf.float32),
                                                  trainable=False)
            self.weight_on_cost_val = weight_on_cost
            self.assign_weight = tf.assign(self.weight_on_cost,
                                           self.weight_on_cost_val)
        self.weight_update = weight_update
        self.dunbar_number = dunbar_number  #Dunbar number
        self.dunbar_function = dunbar_function

        self.build_org()

        with tf.name_scope("Objective"):
            self.objective_task = self._make_loss_task()
            self.objective_cost = self._make_loss_cost()
            self.objective_L1 = self._make_loss_L1()
            # self.objective = self.loss()
            self.objective = self.weight_on_cost * self.objective_cost + (
                1 -
                self.weight_on_cost) * self.objective_task + self.objective_L1

        with tf.name_scope("Optimizer"):
            self.learning_rate = tf.placeholder(tf.float32)
            #self.optimize =tf.train.AdadeltaOptimizer(self.learning_rate, rho=.9).minimize(self.objective)
            self.optimize = tf.train.AdamOptimizer(
                self.learning_rate).minimize(self.objective)
            #self.optimize =tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.objective)
            self.start_learning_rate = .1  #15.
            self.decay = decay  #None #.01

        if tensorboard_filename is None:
            self.writer = None
        else:
            self.writer = tf.summary.FileWriter(tensorboard_filename,
                                                self.sess.graph)
        self.saver = tf.train.Saver()

        merged = tf.summary.merge_all()
        init = tf.global_variables_initializer()
        self.sess.run(init)
data_df.plot(kind='scatter', x='CRIM', y='PRICE', ax=axs[0], figsize=(16, 8))
data_df.plot(kind='scatter', x='LSTAT', y='PRICE', ax=axs[1])
data_df.plot(kind='scatter', x='AGE', y='PRICE', ax=axs[2])

# Correlation measures the strength of the linear relationship between two independent variables

sns.set(style="white")

# Compute the correlation matrix
corr = data_df.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(
    corr, dtype=bool
)  # an array of zeros with the same shape as the correlation matrix
mask[np.triu_indices_from(
    mask)] = True  # mark the upper-triangle indices so they are hidden

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr,
            mask=mask,
            cmap=cmap,
            vmax=.3,
            center=0,
            square=True,
            linewidths=.5)
Example #48
def scores(key, paths, config):
    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None

    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["prob_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)

    # balanced accuracy (recall_mean)
    bacc_splits = [recall_score(y_true_splits[f], y_pred_splits[f], average=None).mean() for f in range(len(y_true_splits))]
    auc_splits = [roc_auc_score(y_true_splits[f], prob_pred_splits[f]) for f in range(len(y_true_splits))]

    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))
    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1, alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1, alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0], s[0], 0.5, alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5, alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1], s[0] + s[1], p=0.5, alternative='greater')


    # Beta's measures of similarity
    betas = np.hstack([item["beta"][:, penalty_start:].T for item in values]).T
    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
                array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
                for i in range(betas.shape[0])])
        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0]) for j in range(i+1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(float(np.sum((A == B)[(A != 0) & (B != 0)])) / (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except Exception:
        dices = []
        betas_t = np.zeros_like(betas)
        dice_bar = fleiss_kappa_stat = 0

    # Proportion of selection within the support across the CV folds
    support_count = (betas_t != 0).sum(axis=0)
    support_count = support_count[support_count > 0]
    support_prop = support_count / betas_t.shape[0]

    scores = OrderedDict()
    scores['key'] = key
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
                                    float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    scores['beta_support_prop_select_mean'] = support_prop.mean()
    scores['beta_support_prop_select_sd'] = support_prop.std()


    return scores
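
# Hedged sketch (not part of the example above): how the Fisher z-averaging of
# pairwise fold correlations works on toy data. `toy_betas` is invented for
# illustration; only numpy is assumed.
import numpy as np

toy_betas = np.random.randn(5, 100)                 # e.g. 5 CV folds, 100 coefficients each
R_toy = np.corrcoef(toy_betas)                      # 5 x 5 fold-by-fold correlation matrix
r_pairs = R_toy[np.triu_indices_from(R_toy, 1)]     # the 10 distinct fold pairs
z_pairs = 0.5 * np.log((1 + r_pairs) / (1 - r_pairs))   # Fisher z-transform
z_mean = z_pairs.mean()
r_bar_toy = (np.exp(2 * z_mean) - 1) / (np.exp(2 * z_mean) + 1)  # back-transform to r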
Пример #49
0
def GammaGamma_Connectome_thresholding_pFDR(input_file, toolbox_path):

    #Add the toolbox to path
    #toolbox_path = "/Users/alblle/allera_version_controlled_code/One_Dim_Mixture_Models/python/code"
    sys.path.append(os.path.join(os.path.abspath(toolbox_path)))
    from Mixture_Model_1Dim import Mixture_Model_1Dim

    #load input connectivity matrix
    #input_file="/Users/alblle/Dropbox/POSTDOC/Demetrius/dmn_non_normalized.csv"
    #connectivity_matrix = np.loadtxt(input_file, delimiter=',')#, skiprows=1,skipcolumns=1)
    connectivity_matrix = np.genfromtxt(input_file, delimiter=',')

    #get upper-diagonal terms
    updiag_idx = np.triu_indices_from(connectivity_matrix, k=1)
    orig_data_vector = connectivity_matrix[updiag_idx]
    orig_data_vector = orig_data_vector[~np.isnan(orig_data_vector)]
    data_vector = orig_data_vector[orig_data_vector > 0.05]
    scaling_factor = np.mean(data_vector)
    data_vector = np.divide(data_vector, scaling_factor)

    #Define options for the mixture model fit
    Inference = 'Variational Bayes'  # 'Method of moments', 'Maximum Likelihood' or 'Variational Bayes' (ML not included yet)
    Number_of_Components = 2
    Components_Model = ['Gamma', 'InvGamma']  # each component can be Gauss, Gamma, InvGamma, -Gamma or -InvGamma
    maxits = 500
    tol = 0.00001
    good_model = 0
    percentiles = np.array([99, 98.5, 98, 97.5, 97, 96.5, 96, 95.5, 95])
    percentile_idx = -1
    while good_model == 0:
        percentile_idx = percentile_idx + 1
        tail = np.percentile(data_vector, percentiles[percentile_idx])
        init_params = [1, 2, tail, 2]  #,-5,2]
        opts = {
            'Inference': Inference,
            'Number_of_Components': Number_of_Components,
            'Components_Model': Components_Model,
            'init_params': init_params,
            'maxits': maxits,
            'tol': tol
        }
        #Define options for the mixture model fit
        # CALL TO FIT MIXTURE MODEL
        Model = Mixture_Model_1Dim(data_vector, opts)
        # Accept the first fit for now; a quality check such as
        # Model['Mixing Prop.'][0] < .95 could be used to retry with the next percentile.
        good_model = 1

    if 1:
        # Plot the resulting fit on a histogram of the data
        from alb_MM_functions import gam
        my_range = np.linspace(0.01, np.max(data_vector), 10000)
        plt1 = np.multiply(
            Model['Mixing Prop.'][0],
            gam(my_range, Model['shapes'][0], np.divide(1, Model['rates'][0])))
        plt2 = np.multiply(
            Model['Mixing Prop.'][1],
            gam(my_range, Model['shapes'][1], np.divide(1, Model['rates'][1])))

        import matplotlib.pyplot as plt
        plt.hist(data_vector, bins=50, density=True, alpha=1, color='g')
        plt.plot(my_range, plt1, 'k', linewidth=2)
        plt.plot(my_range, plt2, 'k', linewidth=2)
        plt.plot(my_range, plt1 + plt2, 'r', linewidth=2)
        plt.show()
        # Plot the resulting fit on a histogram of the data

    #Compute local FDR
    p0 = Model['Mixing Prop.'][0]
    #f0(x)=gam(x,Model['shapes'][0],np.divide(1,Model['rates'][0])))
    rho = data_vector.shape[0]
    sorted_data_vector = -np.sort(-data_vector)
    all_localFDR = np.ones(rho)
    flag = 0
    k = -1
    while flag == 0:
        k = k + 1
        point = sorted_data_vector[k]
        cdf = scipy.stats.gamma.cdf(point, Model['shapes'][0], 0,
                                    np.divide(1., Model['rates'][0]))
        numerator = np.multiply(float(p0), 1 - cdf)
        denominator = np.divide(float(k + 1), float(rho))
        all_localFDR[k] = np.divide(numerator, denominator)
        pFDR = all_localFDR[k]
        if pFDR > 0.05:
            threshold = np.multiply(sorted_data_vector[k - 1], scaling_factor)
            flag = 1
            print(threshold)

    return threshold, Model
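
# Hedged sketch on synthetic data (not from the source above): applying a chosen
# threshold to binarize a symmetric connectivity matrix via its upper triangle.
# The matrix and the threshold value are invented stand-ins.
import numpy as np

rng = np.random.RandomState(0)
W = rng.rand(10, 10)
W = (W + W.T) / 2.0                      # toy symmetric connectome
np.fill_diagonal(W, 0.0)
threshold_demo = 0.7                     # stand-in for the pFDR-derived threshold
iu = np.triu_indices_from(W, k=1)
adjacency = np.zeros_like(W, dtype=int)
adjacency[iu] = (W[iu] > threshold_demo).astype(int)
adjacency = adjacency + adjacency.T      # mirror back to a symmetric adjacency matrix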
Пример #50
0
def takeAverageList():
    result = []
    for i in range(channels):
        result.append(corr.values[np.triu_indices_from(corr.values, i)].mean())
        # print(str(i + 1) + ' ' + str(corr.values[np.triu_indices_from(corr.values, i)].mean()))
    return result
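
# Hedged sketch clarifying the loop above: the second argument of
# np.triu_indices_from is the diagonal offset k, so k=0 averages the upper
# triangle including the main diagonal, k=1 excludes it, and larger k keeps only
# entries further above the diagonal. (`corr` and `channels` in the example are
# assumed to be a square correlation DataFrame and its number of channels.)
import numpy as np

M = np.arange(16, dtype=float).reshape(4, 4)
for k_offset in range(4):
    vals = M[np.triu_indices_from(M, k_offset)]
    print(k_offset, vals.mean())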
Пример #51
0
def run_ttests(narps, logfile, overwrite=True):
    masker = nilearn.input_data.NiftiMasker(mask_img=narps.dirs.MNI_mask)
    results_dir = narps.dirs.dirs['consensus']

    func_name = sys._getframe().f_code.co_name
    log_to_file(logfile, '%s' % func_name)

    if not os.path.exists(results_dir):
        os.mkdir(results_dir)

    for hyp in hypnums:
        if not overwrite and os.path.exists(
                os.path.join(results_dir, 'hypo%d_1-fdr.nii.gz' % hyp)):
            print('using existing results')
            continue
        print('running consensus analysis for hypothesis', hyp)
        maps = glob.glob(
            os.path.join(narps.dirs.dirs['output'],
                         'zstat/*/hypo%d_unthresh.nii.gz' % hyp))
        maps.sort()
        data = masker.fit_transform(maps)

        # get estimated mean, variance, and correlation for t_corr
        img_mean = numpy.mean(data)
        img_var = numpy.mean(numpy.var(data, 1))
        cc = numpy.corrcoef(data)
        log_to_file(
            logfile, 'mean = %f, var = %f, mean_cc = %f' %
            (img_mean, img_var, numpy.mean(cc[numpy.triu_indices_from(cc,
                                                                      1)])))

        # perform t-test
        tvals, pvals = t_corr(data, res_mean=img_mean, res_var=img_var, Q=cc)

        # move back into image format
        timg = masker.inverse_transform(tvals)
        timg.to_filename(os.path.join(results_dir, 'hypo%d_t.nii.gz' % hyp))
        pimg = masker.inverse_transform(1 - pvals)
        pimg.to_filename(os.path.join(results_dir, 'hypo%d_1-p.nii.gz' % hyp))
        fdr_results = multipletests(pvals[0, :], 0.05, 'fdr_tsbh')
        log_to_file(
            logfile, "%d voxels significant at FDR corrected p<.05" %
            numpy.sum(fdr_results[0]))
        fdrimg = masker.inverse_transform(1 - fdr_results[1])
        fdrimg.to_filename(
            os.path.join(results_dir, 'hypo%d_1-fdr.nii.gz' % hyp))

        # compute tau^2 per Tom's notes in CorrelatedMetaNotes.html
        def tau(data, Q):
            n = data.shape[0]
            R = numpy.eye(n) - numpy.ones((n, 1)).dot(numpy.ones((1, n))) / n
            sampvar_est = numpy.trace(R.dot(Q))
            tau2 = numpy.zeros(data.shape[1])
            for i in range(data.shape[1]):
                Y = data[:, i]
                tau2[i] = (1 / sampvar_est) * Y.T.dot(R).dot(Y)
            return (numpy.sqrt(tau2))

        tau_est = tau(data, cc)
        tauimg = masker.inverse_transform(tau_est)
        tauimg.to_filename(os.path.join(results_dir,
                                        'hypo%d_tau.nii.gz' % hyp))
Пример #52
0
def fill_off_diagonal(x, radius, value=0):
    """Sets all cells of a matrix to a given ``value``
    if they lie outside a constraint region.
    In this case, the constraint region is the
    Sakoe-Chiba band which runs with a fixed ``radius``
    along the main diagonal.
    When ``x.shape[0] != x.shape[1]``, the radius will be
    expanded so that ``x[-1, -1] = 1`` always.

    ``x`` will be modified in place.

    Parameters
    ----------
    x : np.ndarray [shape=(N, M)]
        Input matrix, will be modified in place.
    radius : float
        The band radius (1/2 of the width) will be
        ``int(radius*min(x.shape))``.
    value : int
        ``x[n, m] = value`` when ``(n, m)`` lies outside the band.

    Examples
    --------
    >>> x = np.ones((8, 8))
    >>> fill_off_diagonal(x, 0.25)
    >>> x
    array([[1, 1, 0, 0, 0, 0, 0, 0],
           [1, 1, 1, 0, 0, 0, 0, 0],
           [0, 1, 1, 1, 0, 0, 0, 0],
           [0, 0, 1, 1, 1, 0, 0, 0],
           [0, 0, 0, 1, 1, 1, 0, 0],
           [0, 0, 0, 0, 1, 1, 1, 0],
           [0, 0, 0, 0, 0, 1, 1, 1],
           [0, 0, 0, 0, 0, 0, 1, 1]])
    >>> x = np.ones((8, 12))
    >>> fill_off_diagonal(x, 0.25)
    >>> x
    array([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
           [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
           [0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
           [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
           [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0],
           [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0],
           [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
           [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]])
    """
    nx, ny = x.shape

    # Calculate the radius in indices, rather than proportion
    radius = int(np.round(radius * np.min(x.shape)))

    offset = np.abs(x.shape[0] - x.shape[1])

    if nx < ny:
        idx_u = np.triu_indices_from(x, k=radius + offset)
        idx_l = np.tril_indices_from(x, k=-radius)
    else:
        idx_u = np.triu_indices_from(x, k=radius)
        idx_l = np.tril_indices_from(x, k=-radius - offset)

    # modify input matrix
    x[idx_u] = value
    x[idx_l] = value
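
# Hedged usage sketch for fill_off_diagonal as defined above: on a non-square
# matrix the band radius is widened by the shape difference, so the far corner
# x[-1, -1] stays inside the band and keeps its value.
import numpy as np

x_demo = np.ones((8, 12))
fill_off_diagonal(x_demo, 0.25)     # zero out everything outside the Sakoe-Chiba band
assert x_demo[-1, -1] == 1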
Пример #53
0
def corner_beautiful_plot(data,bestfit,split,bins=50,labels=None,interpolation='nearest',cmap=plt.cm.gray,show=True):
    """
    Create a corner plot from a pandas dataframe input.
    ___
    INPUT:

    data:           pandas dataframe with N columns
    bestfit:        best-fit values to mark in the corner plot as horizontal and vertical
                    lines; a numpy array with N values
    split:          boolean array with N elements selecting which columns keep only the
                    decimal part of their values. Because this plot was designed for
                    exoplanetary transits, this option removes the integer part of the
                    Julian Date.
    bins:           number of bins to use for the 2D histograms shown in the lower
                    (off-diagonal) subplots of the corner plot. Default is 50 bins.
    labels:         labels for each column of the corner plot; a list of N strings. Default
                    is None, in which case the column names of the pandas dataframe are used.
    interpolation:  string, interpolation of the 2D histogram. Default is 'nearest'.
                    Possible options are 'none', 'bilinear', 'bicubic', 'spline16', 'spline36',
                    'hanning', 'hamming', 'hermite', 'kaiser', 'quadric', 'catrom', 'gaussian',
                    'bessel', 'mitchell', 'sinc', 'lanczos'.
    cmap:           `~matplotlib.colors.Colormap`, optional, default: plt.cm.gray. If None,
                    cmap falls back to the rc `image.cmap` value. `cmap` is ignored when `X`
                    has RGB(A) information.
    show:           boolean, default True. If True, the function prints information
                    about each step.


    """

    #Setting the plot default parameters:
    def init_plotting2():
        plt.rcParams['figure.figsize'] = (14.0,14.0)
        plt.rcParams['font.size'] = 14
        #plt.rcParams['font.family'] = 'Times New Roman'
        plt.rcParams['axes.labelsize'] = plt.rcParams['font.size']
        plt.rcParams['axes.titlesize'] = 2*plt.rcParams['font.size']
        plt.rcParams['legend.fontsize'] = 0.65*plt.rcParams['font.size']
        plt.rcParams['xtick.labelsize'] = plt.rcParams['font.size']
        plt.rcParams['ytick.labelsize'] = plt.rcParams['font.size']
        plt.rcParams['xtick.major.size'] = 3
        plt.rcParams['xtick.minor.size'] = 3
        plt.rcParams['xtick.major.width'] = 1
        plt.rcParams['xtick.minor.width'] = 1
        plt.rcParams['ytick.major.size'] = 3
        plt.rcParams['ytick.minor.size'] = 3
        plt.rcParams['ytick.major.width'] = 1
        plt.rcParams['ytick.minor.width'] = 1
        plt.rcParams['legend.frameon'] = True
        plt.rcParams['legend.loc'] = 'best'
        plt.rcParams['axes.linewidth'] = 1
    
    init_plotting2() #initiallizing the plot parameters
    
    #store the shape of the dataframe input in a variable, used both
    #to obtain the data and to set the size of the corner plot
    shape = data.shape #[0]rows, [1]columns
    
    #If split has some value equal to True, then the corresponding column
    #will have its integer part removed.
    for k in range(shape[1]):
        if split[k] == True:
            data.iloc[:,k] = np.modf(data.iloc[:,k].values)[0]
    
    #creating the corner plot structure:
    #f: figure output
    #axarr: matrix of subplots with one row and one column per dataframe column (N x N plots)
    f, axarr = plt.subplots(shape[1], shape[1])#, sharex=True)#, sharey=True)
    
    #if show == True, this routine prints information for each step before starting
    #the procedure to create the plot i x j.
    if show == True:
        print('Creating corner plot, shape = ',shape[1],' per ',shape[1])

    #Creating the plot i X j
    for i in range(shape[1]):
        for j in range(shape[1]):
            #information about what type of plot will be created:
            #if i == j, a histogram of the i-th column of the pandas dataframe
            #input is created;
            #if i != j, a numpy.histogram2d is computed and its output is
            #displayed as an image with the plt.imshow function
            if show == True:
                print('Subplot = ',i,j)
                if i == j:
                    print('Histogram of ',i)
                else:
                    print('Density plot of ',i,' per ',j)
            # i == j, diagonal plots: histograms of the i-th column of the dataframe input.
            if i == j:
                #remove histogram grid.
                axarr[i][j].grid() 
                #create the numpy.histogram variable
                #density=True (normed= was removed from numpy/matplotlib) to normalize
                #the histogram so it matches the plt.imshow image
                #bins set with the sqrt rule, sqrt(len(i-column)), to show an
                #appropriate scale for the data set
                H,bins_edges = np.histogram(data.iloc[:,i].values,density=True,bins='sqrt')
                #plot the histogram under the same conditions as the numpy.histogram H variable
                axarr[i][j].hist(data.iloc[:,i].values,density=True,bins='sqrt')
                #plot a vertical line at the i-th bestfit value, from zero to the
                #maximum of the H variable, and set the x-limits and y-limits
                axarr[i][j].vlines(bestfit[i],0,H.max(),color='red')
                axarr[i][j].set_ylim(0,H.max())
                axarr[i][j].set_xlim(data.iloc[:,i].min(),data.iloc[:,i].max())
                #setting the text with the value of i-bestfit rounded with 4-decimal numbers
                #color setting match with the vertical line
                axarr[i][j].text(bestfit[i],H.mean(),str(round(bestfit[i],4)),color='red')
                #remove the y-axis to keep the corner plot free of unneeded information
                axarr[i][j].get_yaxis().set_visible(False)
                #remove the x-ticks unless this is the last (bottom) histogram;
                #the other histograms sit above other subplots.
                #Otherwise the x-label is set to the column name of the pandas
                #dataframe input, or to the corresponding entry of labels if given.
                if labels is None:
                    if i == int(shape[1]-1):
                        axarr[i][j].set_xlabel(data.columns[i])
                else:
                    if i == int(shape[1]-1):
                        axarr[i][j].set_xlabel(labels[i])
                if i != int(shape[1]-1):
                    axarr[i][j].get_xaxis().set_visible(False)
                if i == int(shape[1]-1):
                    axarr[i][j].get_xaxis().set_visible(True)
                    axarr[i][j].locator_params(axis='x',nbins=6)
            #i != j plots: image plot using plt.imshow
            #We map the numpy.histogram2d of the j-column (x-axis) against the i-column (y-axis).
            #Putting j on the x-axis and i on the y-axis matches the x-labels of the
            #diagonal histograms with the image plots, so the corner axes line up
            #correctly.
            else:
                #Create the numpy.histogram2d object (image) H from the j-column versus the
                #i-column of the data set, and get the x- and y-tick label ranges xedges and
                #yedges, respectively. The number of bins is given by the input parameters;
                #the default is bins = 50.
                H, xedges, yedges = np.histogram2d(data.iloc[:,j].values,data.iloc[:,i].values,bins=bins)
                #Setting the minimum  and maximum  of each x- and y-ticks, rounded with 4-decimal.
                xmin, xmax = round(xedges.min(),4),round(xedges.max(),4)
                ymin, ymax = round(yedges.min(),4),round(yedges.max(),4)
                #remove the grid of the image.
                axarr[i][j].grid(False)
                #plot the image H with the cmap and interpolation given by the input parameters.
                #The color limits are set to the mean of H minus/plus one standard deviation
                #of H. The aspect is set to 'auto' so the image box adjusts to the x- and
                #y-ticks.
                axarr[i][j].imshow(H,cmap=cmap,origin='lower',
                                   extent=[xmin,xmax,ymin,ymax],aspect='auto',
                                   vmin=np.mean(H)-np.std(H),vmax=np.mean(H)+np.std(H),
                                   interpolation=interpolation)
                #Make the contour plot of H.
                axarr[i][j].contour(H,origin='lower',extent=[xmin,xmax,ymin,ymax])
                #Setting the box limits to match with the histograms plots at the diagonal 
                axarr[i][j].set_xlim(xmin,xmax)
                axarr[i][j].set_ylim(ymin,ymax)
                #plot the vertical and horizontal lines with the values of the bestfit i and j
                #values, with red color.
                axarr[i][j].hlines(bestfit[i],xmin,xmax,color='red')
                axarr[i][j].vlines(bestfit[j],ymin,ymax,color='red')
                #set the aspect of the box x- and y-ticks to auto too to match with the 
                #plt.imshow parameter.
                axarr[i][j].set_aspect('auto')
                #Fix the number of values at the array to clean the x- and y-ticks.
                #here, I choose to make only 6 bins, and this will give 5-ticks at maximum
                axarr[i][j].locator_params(axis='x',nbins=6)
                axarr[i][j].locator_params(axis='y',nbins=6)
                #Set the labels of each plot. If the labels are given as one of the input
                #parameters, these will be used instead of the names of the pandas 
                #dataframe input 
                if labels is None:
                    axarr[i][j].set_ylabel(data.columns[i])
                    axarr[i][j].set_xlabel(data.columns[j])
                else:
                    axarr[i][j].set_ylabel(labels[i])
                    axarr[i][j].set_xlabel(labels[j])
                #If the plot is between two other plots,
                #the y-ticks or the x-ticks will be removed.
                if (j > 0):
                    axarr[i][j].get_yaxis().set_visible(False)
                if (i < shape[1]-1):
                    axarr[i][j].get_xaxis().set_visible(False)
    #Remove the subplots above the diagonal;
    #this creates a triangle (corner) plot
    for i, j in zip(*np.triu_indices_from(axarr, 1)): 
        axarr[i, j].set_visible(False)
    return f
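
# Hedged usage sketch (synthetic data): one way the corner plot above might be
# called. The column names, best-fit values and split flags are invented; with
# split[0] = True the first column keeps only its decimal part, so its best-fit
# entry is given as a decimal as well.
import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
samples = pd.DataFrame({'t0': 2458300.5 + rng.normal(0.0, 0.01, 2000),
                        'rp': rng.normal(0.10, 0.005, 2000),
                        'a':  rng.normal(8.0, 0.2, 2000)})
fig_demo = corner_beautiful_plot(samples,
                                 bestfit=np.array([0.5, 0.10, 8.0]),
                                 split=[True, False, False],
                                 bins=30, show=False)
fig_demo.savefig('corner_example.png')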
Пример #54
0
def heatmap(df,
            inline=True,
            filter=None,
            n=0,
            p=0,
            sort=None,
            figsize=(20, 12),
            fontsize=16,
            labels=True,
            cmap='RdBu'):
    """
    Presents a `seaborn` heatmap visualization of nullity correlation in the given DataFrame.
    
    Note that this visualization has no special support for large datasets. For those, try the dendrogram instead.
    

    :param df: The DataFrame whose completeness is being heatmapped.
    :param filter: The filter to apply to the heatmap. Should be one of "top", "bottom", or None (default). See
    `nullity_filter()` for more information.
    :param n: The cap on the number of columns to include in the filtered DataFrame. See  `nullity_filter()` for
    more information.
    :param p: The cap on the percentage fill of the columns in the filtered DataFrame. See  `nullity_filter()` for
    more information.
    :param sort: The sort to apply to the heatmap. Should be one of "ascending", "descending", or None. See
    `nullity_sort()` for more information.
    :param figsize: The size of the figure to display. This is a `matplotlib` parameter which defaults to (20, 12).
    :param fontsize: The figure's font size.
    :param labels: Whether or not to label each matrix entry with its correlation (default is True).
    :param cmap: What `matplotlib` colormap to use. Defaults to `RdBu`.
    :return: Returns the underlying `matplotlib.figure` object.
    """
    # Apply filters and sorts.
    df = nullity_filter(df, filter=filter, n=n, p=p)
    df = nullity_sort(df, sort=sort)

    # Set up the figure.
    fig = plt.figure(figsize=figsize)
    gs = gridspec.GridSpec(1, 1)
    ax0 = plt.subplot(gs[0])

    # Pre-processing: remove completely filled or completely empty variables.
    df = df.iloc[:, [
        i for i, n in enumerate(np.var(df.isnull(), axis='rows')) if n > 0
    ]]

    # Create and mask the correlation matrix.
    corr_mat = df.isnull().corr()
    # corr_mat = corr_mat.replace(np.nan, 1)
    # corr_mat[np.isnan(corr_mat)] = 0
    mask = np.zeros_like(corr_mat)
    mask[np.triu_indices_from(mask)] = True

    # Set fontsize.
    # fontsize = _set_font_size(fig, df, fontsize)

    # Construct the base heatmap.
    if labels:
        sns.heatmap(corr_mat,
                    mask=mask,
                    cmap=cmap,
                    ax=ax0,
                    cbar=False,
                    annot=True,
                    annot_kws={"size": fontsize - 2})
    else:
        sns.heatmap(corr_mat, mask=mask, cmap=cmap, ax=ax0, cbar=False)

    # Apply visual corrections and modifications.
    ax0.set_xticklabels(ax0.xaxis.get_majorticklabels(),
                        rotation=45,
                        ha='left',
                        fontsize=fontsize)
    ax0.set_yticklabels(ax0.yaxis.get_majorticklabels(),
                        fontsize=fontsize,
                        rotation=0)

    ax0.xaxis.tick_top()
    ax0.patch.set_visible(False)

    # Fix up annotation label rendering.
    for text in ax0.texts:
        t = float(text.get_text())
        if 0.95 <= t < 1:
            text.set_text("<1")
        elif -1 < t <= -0.95:
            text.set_text(">-1")
        elif t == 1:
            text.set_text("1")
        elif t == -1:
            text.set_text("-1")
        elif -0.05 < t < 0.05:
            text.set_text("")
        else:
            text.set_text(round(t, 1))

    if inline:
        plt.show()
    else:
        return fig
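
# Hedged usage sketch (toy data): nullity-correlation heatmap for a small
# DataFrame with artificial missing values. Assumes the module-level imports
# used by the function above (numpy, pandas, seaborn, matplotlib, gridspec) and
# the nullity_filter / nullity_sort helpers are available.
import numpy as np
import pandas as pd

rng = np.random.RandomState(2)
toy = pd.DataFrame(rng.randn(200, 4), columns=list('ABCD'))
toy.loc[toy['A'] > 0, 'B'] = np.nan            # B is missing whenever A is positive
toy.loc[rng.rand(200) < 0.2, 'C'] = np.nan     # C is missing at random
fig_toy = heatmap(toy, inline=False)           # columns with no missing values are dropped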
Пример #55
0
def scatterplot_matrix(data, names, min_max_range, originaldf, **kwargs):
    numvars, numdata = data.shape
    #data = data.as_matrix()
    #data = data.astype(str)
    fig, axes = plt.subplots(nrows=len(names),
                             ncols=len(names),
                             figsize=(25, 25),
                             sharex=False,
                             sharey=False)
    fig.subplots_adjust(hspace=0.1, wspace=0.1)
    cmap = cm.get_cmap('Paired')
    norm = plt.Normalize()
    professorsArray = np.array(originaldf['professor_cat'])
    marker = np.array(originaldf['professor_cat'])
    face = []
    for i in range(1, len(professorsArray)):
        marker[i] = marker[i] * 20
        face.append(i)
    colors = cmap(norm(professorsArray))
    lines = []
    for ax in axes.flat:
        # Hide all ticks and labels
        # Set up ticks only on one side for the "edge" subplots...
        if ax.is_first_col():
            ax.yaxis.set_ticks_position('left')
        elif ax.is_last_col():
            ax.yaxis.set_ticks_position('right')
        else:
            ax.yaxis.set_visible(False)
            #ax.xaxis.set_visible(False)
        if ax.is_first_row():
            #ax.yaxis.set_visible(False)
            ax.xaxis.set_ticks_position('top')
        elif ax.is_last_row():
            ax.xaxis.set_ticks_position('bottom')
        else:
            ax.xaxis.set_visible(False)

    # Plot the data.
    k = 0
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for x, y in [(i, j), (j, i)]:
            rowval = names[y]
            colval = names[x]
            x_min_val, x_max_val, x_val_range = min_max_range[rowval]
            y_min_val, y_max_val, y_val_range = min_max_range[colval]
            x_arr = data[rowval]
            y_arr = data[colval]
            x_ticks = 5
            y_ticks = 5
            x_step = x_val_range / float(x_ticks - 1)
            y_step = y_val_range / float(y_ticks - 1)
            x_tick_labels = [
                round(x_min_val + x_step * i, 2) for i in range(x_ticks)
            ]
            y_tick_labels = [
                round(y_min_val + y_step * i, 2) for i in range(y_ticks)
            ]
            x_norm_min = data[rowval].min()
            y_norm_min = data[colval].min()
            x_norm_range = np.ptp(data[rowval])
            y_norm_range = np.ptp(data[colval])
            x_norm_step = x_norm_range / float(x_ticks - 1)
            y_norm_step = y_norm_range / float(y_ticks - 1)
            x_ticks = [
                round(x_norm_min + x_norm_step * i, 2) for i in range(x_ticks)
            ]
            y_ticks = [
                round(y_norm_min + y_norm_step * i, 2) for i in range(y_ticks)
            ]
            # Use a separate counter (idx) so we do not clobber the outer-loop index j.
            if rowval == 'professor_cat':
                idx = 0
                for val in x_tick_labels:
                    val = int(val)
                    professors = np.unique(originaldf['professor'])
                    x_tick_labels[idx] = professors[val]
                    idx = idx + 1
            if colval == 'professor_cat':
                idx = 0
                for val in y_tick_labels:
                    val = int(val)
                    professors = np.unique(originaldf['professor'])
                    y_tick_labels[idx] = professors[val]
                    idx = idx + 1
            if rowval == 'lecture_cat':
                idx = 0
                for val in x_tick_labels:
                    val = int(val)
                    lectures = np.unique(originaldf['lecture'])
                    x_tick_labels[idx] = lectures[val]
                    idx = idx + 1
            if colval == 'lecture_cat':
                idx = 0
                for val in y_tick_labels:
                    val = int(val)
                    lectures = np.unique(originaldf['lecture'])
                    y_tick_labels[idx] = lectures[val]
                    idx = idx + 1
            axes[x, y].yaxis.set_ticks(y_ticks)
            axes[x, y].xaxis.set_ticks(x_ticks)
            axes[x, y].set_xticklabels(x_tick_labels, rotation=75)
            axes[x, y].set_yticklabels(y_tick_labels)

            axes[x, y].scatter(x_arr,
                               y_arr,
                               c=colors,
                               label=np.array(originaldf['professor']))
            k = k + 1

            #plt.setp(axes[x, y].get_xticklabels(), rotation=30, horizontalalignment='right')

        # Label the diagonal subplots...
    for i, label in enumerate(names):
        if label == 'professor_cat':
            label = 'professor'
        if label == 'lecture_cat':
            label = 'lecture'
        axes[i, i].annotate(label, (0.5, 0.5),
                            xycoords='axes fraction',
                            ha='center',
                            va='center')
        axes[i, i].xaxis.set_visible(False)
        axes[i, i].yaxis.set_visible(False)
        # Turn on the proper x or y axes ticks.
    #handles, labels = axes[x, y].data.values(),axes[x, y].data.values()
    #fig.legend(colors,np.unique(professors))
    plt.savefig('scattermatrix.png', dpi=72)
    plt.show()
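
# Hedged sketch of the indexing trick used above: np.triu_indices_from works on
# any square 2-D array, including a grid of matplotlib Axes, which is how the
# function above pairs up the off-diagonal panels. Everything here is a toy
# illustration.
import numpy as np
import matplotlib.pyplot as plt

fig_demo, axes_demo = plt.subplots(3, 3)
upper_pairs = list(zip(*np.triu_indices_from(axes_demo, k=1)))
# upper_pairs holds the index pairs (0, 1), (0, 2) and (1, 2)
for i_ax, j_ax in upper_pairs:
    axes_demo[i_ax, j_ax].set_facecolor('0.9')   # e.g. shade the upper-triangle panels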
Пример #56
0
def ripser(X,
           maxdim=1,
           thresh=np.inf,
           coeff=2,
           metric="euclidean",
           metric_params={},
           weights=None,
           weight_params=None,
           collapse_edges=False,
           n_perm=None):
    """Compute persistence diagrams for X data array using Ripser [1]_.

    If X is not a distance matrix, it will be converted to a distance matrix
    using the chosen metric.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        A numpy array of either data or distance matrix. Can also be a sparse
        distance matrix of type scipy.sparse

    maxdim : int, optional, default: ``1``
        Maximum homology dimension computed. Will compute all dimensions lower
        than and equal to this value. For 1, H_0 and H_1 will be computed.

    thresh : float, optional, default: ``numpy.inf``
        Maximum distances considered when constructing filtration. If
        ``numpy.inf``, compute the entire filtration.

    coeff : int prime, optional, default: ``2``
        Compute homology with coefficients in the prime field Z/pZ for p=coeff.

    metric : string or callable, optional, default: ``'euclidean'``
        The metric to use when calculating distance between instances in a
        feature array. If set to ``'precomputed'``, input data is interpreted
        as a distance matrix or of adjacency matrices of a weighted undirected
        graph. If a string, it must be one of the options allowed by
        :func:`scipy.spatial.distance.pdist` for its metric parameter, or a
        metric listed in
        :obj:`sklearn.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`, including
        ``'euclidean'``, ``'manhattan'`` or ``'cosine'``. If a callable, it
        should take pairs of vectors (1D arrays) as input and, for each two
        vectors in a pair, it should return a scalar indicating the
        distance/dissimilarity between them.

    metric_params : dict, optional, default: ``{}``
        Additional parameters to be passed to the distance function.

    weights : ``"DTM"``, ndarray or None, optional, default: ``None``
        If not ``None``, the persistence of a weighted Vietoris-Rips filtration
        is computed as described in [3]_, and this parameter determines the
        vertex weights in the modified adjacency matrix. ``"DTM"`` denotes the
        empirical distance-to-measure function defined, following [3]_, by

        .. math:: w(x) = 2\\left(\\frac{1}{n+1} \\sum_{k=1}^n
           \\mathrm{dist}(x, x_k)^r \\right)^{1/r}.

        Here, :math:`\\mathrm{dist}` is the distance metric used, :math:`x_k`
        is the :math:`k`-th :math:`\\mathrm{dist}`-nearest neighbour of
        :math:`x` (:math:`x` is not considered a neighbour of itself),
        :math:`n` is the number of nearest neighbors to include, and :math:`r`
        is a parameter (see `weight_params`). If an ndarray is passed, it is
        interpreted as a user-defined list of vertex weights for the modified
        adjacency matrix. In either case, the edge weights
        :math:`\\{w_{ij}\\}_{i, j}` for the modified adjacency matrix are
        computed from the original distances and the new vertex weights
        :math:`\\{w_i\\}_i` as follows:

        .. math:: w_{ij} = \\begin{cases} \\max\\{ w_i, w_j \\}
           &\\text{if } 2\\mathrm{dist}_{ij} \\leq
           |w_i^p - w_j^p|^{\\frac{1}{p}} \\\\
           t &\\text{otherwise} \\end{cases}

        where :math:`t` is the only positive root of

        .. math:: 2 \\mathrm{dist}_{ij} = (t^p - w_i^p)^\\frac{1}{p} +
           (t^p - w_j^p)^\\frac{1}{p}

        and :math:`p` is a parameter specified in `metric_params`.

    weight_params : dict or None, optional, default: ``None``
        Parameters to be used in the case of weighted filtrations, see
        `weights`. In this case, the key ``"p"`` determines the power to be
        used in computing edge weights from vertex weights. It can be one of
        ``1``, ``2`` or ``np.inf`` and defaults to ``1``. If `weights` is
        ``"DTM"``, the additional keys ``"r"`` (default: ``2``) and
        ``"n_neighbors"`` (default: ``3``) are available (see `weights`,
        where the latter corresponds to :math:`n`).


    collapse_edges : bool, optional, default: ``False``
        Whether to use the edge collapse algorithm as described in [2]_ prior
        to calling ``ripser``.

    n_perm : int or None, optional, default: ``None``
        The number of points to subsample in a "greedy permutation", or a
        furthest point sampling of the points. These points will be used in
        lieu of the full point cloud for a faster computation, at the expense
        of some accuracy, which can be bounded as a maximum bottleneck distance
        to all diagrams on the original point set.

    Returns
    -------
    A dictionary holding all of the results of the computation
    {
        'dgms': list (size maxdim) of ndarray (n_pairs, 2)
            A list of persistence diagrams, one for each dimension less
            than maxdim. Each diagram is an ndarray of size (n_pairs, 2)
            with the first column representing the birth time and the
            second column representing the death time of each pair.
        'num_edges': int
            The number of edges added during the computation
        'dperm2all': None or ndarray (n_perm, n_samples)
            ``None`` if n_perm is ``None``. Otherwise, the distance from all
            points in the permutation to all points in the dataset.
        'idx_perm': ndarray(n_perm) if n_perm > 0
            Index into the original point cloud of the points used
            as a subsample in the greedy permutation
        'r_cover': float
            Covering radius of the subsampled points.
            If n_perm <= 0, then the full point cloud was used and this is 0
    }

    Notes
    -----
    `Ripser <https://github.com/Ripser/ripser>`_ is used as a C++ backend
    for computing Vietoris–Rips persistent homology. Python bindings were
    modified for performance from the `ripser.py
    <https://github.com/scikit-tda/ripser.py>`_ package.

    `GUDHI <https://github.com/GUDHI/gudhi-devel>`_ is used as a C++ backend
    for the edge collapse algorithm described in [2]_.

    References
    ----------
    .. [1] U. Bauer, "Ripser: efficient computation of Vietoris–Rips
           persistence barcodes", 2019; `arXiv:1908.02518
           <https://arxiv.org/abs/1908.02518>`_.

    .. [2] J.-D. Boissonnat and S. Pritam, "Edge Collapse and Persistence of
           Flag Complexes"; in *36th International Symposium on Computational
           Geometry (SoCG 2020)*, pp. 19:1–19:15, Schloss
           Dagstuhl-Leibniz–Zentrum für Informatik, 2020;
           `DOI: 10.4230/LIPIcs.SoCG.2020.19
           <https://doi.org/10.4230/LIPIcs.SoCG.2020.19>`_.

    .. [3] H. Anai et al, "DTM-Based Filtrations"; in *Topological Data
           Analysis* (Abel Symposia, vol 15), Springer, 2020;
           `DOI: 10.1007/978-3-030-43408-3_2
           <https://doi.org/10.1007/978-3-030-43408-3_2>`_.

    """
    if n_perm and issparse(X):
        raise Exception("Greedy permutation is not supported for sparse "
                        "distance matrices")
    if n_perm and n_perm > X.shape[0]:
        raise Exception("Number of points in greedy permutation is greater "
                        "than number of points in the point cloud")
    if n_perm and n_perm < 0:
        raise Exception("There should be a strictly positive number of points "
                        "in the greedy permutation")

    idx_perm = np.arange(X.shape[0])
    r_cover = 0.0
    if n_perm:
        idx_perm, lambdas, dperm2all = \
            get_greedy_perm(X, n_perm=n_perm, metric=metric)
        r_cover = lambdas[-1]
        dm = dperm2all[:, idx_perm]
    else:
        if metric == 'precomputed':
            dm = X
        else:
            dm = pairwise_distances(X, metric=metric, **metric_params)
        dperm2all = None

    n_points = max(dm.shape)

    use_sparse_computer = True
    if issparse(dm):
        row, col, data = _resolve_symmetry_conflicts(dm.tocoo())  # Upper diag

        if weights is not None:
            if (dm < 0).nnz:
                raise ValueError("Distance matrix has negative entries. "
                                 "Weighted Rips filtration unavailable.")

            weight_params = {} if weight_params is None else weight_params
            weights_p = weight_params.get("p", 1)

            # Restrict to off-diagonal entries for weights computation since
            # diagonal ones are given by `weights`. Explicitly set the diagonal
            # to 0 -- this is also important for DTM since otherwise
            # kneighbors_graph with include_self=False skips the first true
            # neighbor.
            off_diag = row != col
            row, col, data = (np.hstack([row[off_diag],
                                         np.arange(n_points)]),
                              np.hstack([col[off_diag],
                                         np.arange(n_points)]),
                              np.hstack([data[off_diag],
                                         np.zeros(n_points)]))

            if isinstance(weights, str) and (weights == "DTM"):
                n_neighbors = weight_params.get("n_neighbors", 3)
                weights_r = weight_params.get("r", 2)

                # CSR matrix must be symmetric for kneighbors_graph to give
                # correct results
                dm = csr_matrix((np.hstack([data, data[:-n_points]]),
                                 (np.hstack([row, col[:-n_points]]),
                                  np.hstack([col, row[:-n_points]]))))
                weights = _compute_dtm_weights(dm, n_neighbors, weights_r)
            else:
                weights = _check_weights(weights, n_points)

            data = _weight_filtration_sparse(row, col, data, weights,
                                             weights_p)

        if collapse_edges:
            row, col, data = _collapse_coo(row, col, data, thresh)

    else:
        if weights is not None:
            if (dm < 0).any():
                raise ValueError("Distance matrix has negative entries. "
                                 "Weighted Rips filtration unavailable.")

            weight_params = {} if weight_params is None else weight_params
            weights_p = weight_params.get("p", 1)

            if isinstance(weights, str) and (weights == "DTM"):
                n_neighbors = weight_params.get("n_neighbors", 3)
                weights_r = weight_params.get("r", 2)

                if not np.array_equal(dm, dm.T):
                    dm = np.triu(dm, k=1)
                    dm += dm.T

                weights = _compute_dtm_weights(dm, n_neighbors, weights_r)
            else:
                weights = _check_weights(weights, n_points)

            dm = _weight_filtration_dense(dm, weights, weights_p)
            np.fill_diagonal(dm, weights)

        if (dm.diagonal() != 0).any():
            # Convert to sparse format, because currently that's the only
            # one handling nonzero births
            (row, col) = np.triu_indices_from(dm)
            data = dm[(row, col)]
            if collapse_edges:
                row, col, data = _collapse_coo(row, col, data, thresh)
        elif collapse_edges:
            row, col, data = gtda_collapser.\
                flag_complex_collapse_edges_dense(dm, thresh)
        else:
            use_sparse_computer = False

    if use_sparse_computer:
        res = DRFDMSparse(np.asarray(row, dtype=np.int32, order="C"),
                          np.asarray(col, dtype=np.int32, order="C"),
                          np.asarray(data, dtype=np.float32, order="C"),
                          n_points, maxdim, thresh, coeff)
    else:
        # Only consider strict upper diagonal
        DParam = squareform(dm, checks=False).astype(np.float32)
        # Run garbage collector to free up memory taken by `dm`
        del dm
        gc.collect()
        res = DRFDM(DParam, maxdim, thresh, coeff)

    # Unwrap persistence diagrams
    dgms = res.births_and_deaths_by_dim
    for dim in range(len(dgms)):
        N = int(len(dgms[dim]) / 2)
        dgms[dim] = np.reshape(np.array(dgms[dim]), [N, 2])

    ret = {
        "dgms": dgms,
        "num_edges": res.num_edges,
        "dperm2all": dperm2all,
        "idx_perm": idx_perm,
        "r_cover": r_cover
    }

    return ret
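
# Hedged usage sketch for the ripser function above: persistence diagrams of a
# small random point cloud, assuming the module-level backends imported by this
# file (DRFDM/DRFDMSparse, pairwise_distances, etc.) are available.
import numpy as np

pointcloud = np.random.random((100, 3))
result = ripser(pointcloud, maxdim=1)
h0, h1 = result['dgms']                      # birth/death pairs for H_0 and H_1
print(h0.shape, h1.shape, result['num_edges'])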
Пример #57
0
    count += 1

# plotting the correlation matrix
#http://glowingpython.blogspot.com.es/2012/10/visualizing-correlation-matrices.html
R = corrcoef(transpose(semana))
pcolor(R)
colorbar()
yticks(arange(0, 20), range(0, 20))
xticks(arange(0, 20), range(0, 20))
show()

# http://stanford.edu/~mwaskom/software/seaborn/examples/many_pairwise_correlations.html
# Generate a mask for the upper triangle
sns.set(style="white")
mask = np.zeros_like(R, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(200, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(R,
            mask=mask,
            cmap=cmap,
            vmax=.8,
            square=True,
            xticklabels=2,
            yticklabels=2)
fig = plt.figure(figsize=(10, 9))
data[data.columns[1:24]].corrwith(
    data['default.payment.next.month']).plot.barh(fontsize=20,
                                                  rot=0,
                                                  grid=True)
plt.title("Correlation of Explanatory variables with the targe feature",
          fontsize=20,
          fontweight='bold')
plt.show()

# In[296]:

correlations_exvar = data[data.columns[1:24]].corr()
plt.figure(figsize=(20, 15))
mask1 = np.zeros_like(correlations_exvar, dtype=bool)
mask1[np.triu_indices_from(mask1)] = True
cmap = 'Dark2'  # sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(correlations_exvar,
            cmap=cmap,
            mask=mask1,
            annot=True,
            square=True,
            vmax=.3,
            center=0,
            linewidths=.5,
            cbar_kws={"shrink": 0.7})
plt.title('The correlation among %d Explanatory Variables' %
          len(data[data.columns[1:24]].columns),
          fontsize=20,
          fontweight='bold')
plt.ylabel('The name of %d Explanatory Variable' %
           len(data[data.columns[1:24]].columns),
           fontsize=20)
Пример #59
0
def GaussGammas_Connectome_thresholding_pFDR(input_file, toolbox_path):

    #Add the toolbox to path
    sys.path.append(os.path.join(os.path.abspath(toolbox_path)))
    from Mixture_Model_1Dim import Mixture_Model_1Dim

    #load input connectivity matrix
    #connectivity_matrix = np.loadtxt(input_file, delimiter=',')#, skiprows=1,skipcolumns=1)
    connectivity_matrix = np.genfromtxt(input_file, delimiter=',')

    #get upper-diagonal terms
    updiag_idx = np.triu_indices_from(connectivity_matrix, k=1)
    orig_data_vector = connectivity_matrix[updiag_idx]
    orig_data_vector = orig_data_vector[
        ~np.isnan(orig_data_vector
                  )]  #data_vector=orig_data_vector[orig_data_vector>0.05]

    #demean and divide by the std to allow easy initialization
    mean_factor = np.mean(orig_data_vector)
    scaling_factor = 1.  #np.std(orig_data_vector)
    data_vector = np.divide(orig_data_vector - mean_factor, scaling_factor)

    #Define options for the mixture model fit
    Inference = 'Variational Bayes'  # 'Method of moments', 'Maximum Likelihood' or 'Variational Bayes' (ML not included yet)
    Number_of_Components = 3
    Components_Model = ['Gauss', 'InvGamma', '-InvGamma']  # each component can be Gauss, Gamma, InvGamma, -Gamma or -InvGamma
    maxits = 500
    tol = 0.00001
    # data-driven initialization (a fixed alternative would be [0, 1, 6, 2, -6, 2])
    init_params = [
        0, 1,
        np.percentile(data_vector, 99), 2,
        np.percentile(data_vector, 1), 2
    ]
    opts = {
        'Inference': Inference,
        'Number_of_Components': Number_of_Components,
        'Components_Model': Components_Model,
        'init_params': init_params,
        'maxits': maxits,
        'tol': tol
    }
    # CALL TO FIT MIXTURE MODEL
    Model = Mixture_Model_1Dim(data_vector, opts)
    #if Model['Mixing Prop.'][0]<.95:
    #good_model=1

    # Visualize the fit
    visualize_model_fit = 1

    if visualize_model_fit == 1:

        my_range = np.linspace(-10, 10, 10000)

        plt0 = np.multiply(
            Model['Mixing Prop.'][0],
            norm.pdf(my_range, Model['mu1'][0],
                     np.sqrt(np.divide(1, Model['taus1'][0]))))
        #plt0=np.multiply( Model['Mixing Prop.'][0],norm.pdf(my_range,Model['mu1'][0],np.sqrt(Model['taus1'][0])  ) )
        #plt0=np.multiply( Model['Mixing Prop.'][0],norm.pdf(my_range,Model['mu1'][0],Model['taus1'][0])  )

        if Components_Model[1] == 'InvGamma':
            plt1 = np.multiply(
                Model['Mixing Prop.'][1],
                invgam(my_range, Model['shapes'][1], Model['scales'][1]))
        elif Components_Model[1] == 'Gamma':
            plt1 = np.multiply(
                Model['Mixing Prop.'][1],
                gam(my_range, Model['shapes'][1],
                    np.divide(1, Model['rates'][1])))

        plt1[my_range < 0] = 0

        if Components_Model[2] == '-InvGamma':
            plt2 = np.multiply(
                Model['Mixing Prop.'][2],
                invgam(-my_range, Model['shapes'][2], Model['scales'][2]))
        elif Components_Model[2] == '-Gamma':
            plt2 = np.multiply(
                Model['Mixing Prop.'][2],
                gam(-my_range, Model['shapes'][2],
                    np.divide(1, Model['rates'][2])))

        plt2[my_range > 0] = 0

        import matplotlib.pyplot as plt
        fig = plt.figure()
        #plt.plot(range(10))
        plt.hist(data_vector, bins=50, density=True, alpha=1, color='g')
        plt.plot(my_range, plt0, 'k', linewidth=2)
        plt.plot(my_range, plt1, 'k', linewidth=2)
        plt.plot(my_range, plt2, 'k', linewidth=2)
        plt.plot(my_range, plt0 + plt1 + plt2, 'r', linewidth=2)
        fig.savefig(os.path.expanduser('~/Desktop/temp.png'), dpi=fig.dpi)
        #plt.show()
        # Plot the resulting fit on a histogram of the data

    #Compute local FDR at positive and negative tail
    #f0(x)=gam(x,Model['shapes'][0],np.divide(1,Model['rates'][0])))
    p0 = Model['Mixing Prop.'][0]
    rho = data_vector.shape[0]

    #FDR at positive side
    sorted_data_vector = -np.sort(-data_vector)
    all_localFDR = np.ones(rho)
    flag = 0
    k = -1
    while flag == 0:
        k = k + 1
        point = sorted_data_vector[k]
        cdf = norm.cdf(point, Model['mu1'][0],
                       np.sqrt(np.divide(1, Model['taus1'][0])))
        numerator = np.multiply(float(p0), 1 - cdf)
        denominator = np.divide(float(k + 1), float(rho))
        all_localFDR[k] = np.divide(numerator, denominator)
        pFDR = all_localFDR[k]
        if pFDR > 0.001:
            if k == 0:
                threshold1 = sorted_data_vector[k]
            else:
                threshold1 = sorted_data_vector[k - 1]
                # np.multiply(sorted_data_vector[k-1],scaling_factor)

            flag = 1

            #print threshold1

    #FDR at negative side
    sorted_data_vector = -np.sort(data_vector)
    all_localFDR = np.ones(rho)
    flag = 0
    k = -1
    while flag == 0:
        k = k + 1
        point = sorted_data_vector[k]
        cdf = norm.cdf(-point, Model['mu1'][0],
                       np.sqrt(np.divide(1, Model['taus1'][0])))
        numerator = np.multiply(float(p0), 1 - cdf)
        denominator = np.divide(float(k + 1), float(rho))
        all_localFDR[k] = np.divide(numerator, denominator)
        pFDR = all_localFDR[k]
        if pFDR > 0.001:
            if k == 0:
                threshold2 = -sorted_data_vector[k]
            else:
                threshold2 = -sorted_data_vector[k - 1]
                # np.multiply(sorted_data_vector[k-1],scaling_factor)

            flag = 1

    #Rescale the thresholds using the data mean and std
    threshold1 = np.multiply(threshold1, scaling_factor) + mean_factor
    threshold2 = np.multiply(threshold2, scaling_factor) + mean_factor
    print(threshold1)
    print(threshold2)

    return threshold1, threshold2, Model
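
# Hedged sketch on synthetic data: rebuilding a full symmetric matrix from the
# upper-triangle vector, the inverse of the triu_indices_from extraction used
# above. The two thresholds here are arbitrary stand-ins for threshold1/threshold2.
import numpy as np

rng = np.random.RandomState(3)
C = rng.randn(6, 6)
C = (C + C.T) / 2.0
iu = np.triu_indices_from(C, k=1)
vec = C[iu]
thr_pos, thr_neg = 0.8, -0.8
kept = np.where((vec > thr_pos) | (vec < thr_neg), vec, 0.0)   # keep only suprathreshold edges
C_thr = np.zeros_like(C)
C_thr[iu] = kept
C_thr = C_thr + C_thr.T                      # mirror to restore symmetry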
Пример #60
0
def scores(key, paths, config, as_dataframe=False):
    import mapreduce
    print(key)
    if (len(paths) != NFOLDS_INNER) or (len(paths) != NFOLDS_OUTER):
        print("Failed for key %s" % key)
        return None
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  #area under curve score.

    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0],
                                          s[0],
                                          1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1],
                                          s[1],
                                          prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0],
                                             s[0],
                                             0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1],
                                             s[1],
                                             0.5,
                                             alternative='greater')
    pvalue_recall_mean = binom_test(success[0] + success[1],
                                    s[0] + s[1],
                                    p=0.5,
                                    alternative='greater')

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :] for item in values]).T

    # Correlation
    R = np.corrcoef(betas)
    #print R
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])
        ])
        #print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
        #print(np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)), [0.99]*5,
        #                   rtol=0, atol=1e-02))

        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except Exception:
        # keep the downstream variables defined if thresholding / kappa computation fails
        dice_bar = fleiss_kappa_stat = 0
        dices = []
        betas_t = np.zeros_like(betas)

    scores = OrderedDict()
    scores['key'] = key
    try:
        a, l1, l2, tv = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0: left = 1.
        scores['l1_ratio'] = float(l1) / left
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores["auc"] = auc
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores[
        'pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores[
        'pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_recall_mean'] = pvalue_recall_mean
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
                                    float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar

    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)

    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())],
                              columns=list(scores.keys()))

    return scores
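
# Hedged sketch (toy vectors, not from the example above): the pairwise agreement
# score computed in the try-block, applied to two invented signed support vectors.
# It follows the example's own convention (no factor of 2 in the numerator).
import numpy as np

A_demo = np.sign(np.array([0.0, 0.3, -0.2, 0.0, 0.5]))
B_demo = np.sign(np.array([0.1, 0.4, 0.0, 0.0, -0.5]))
both_nonzero = (A_demo != 0) & (B_demo != 0)
agreement = float(np.sum((A_demo == B_demo)[both_nonzero])) / \
            (np.sum(A_demo != 0) + np.sum(B_demo != 0))
print(agreement)   # 1 agreeing sign over 3 + 3 nonzero entries -> ~0.167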