def add_stars(ax, P_mat, tri=True):
    '''
    Use the p matrix to add stars to the significant cells.
    If tri is True then only put stars in the lower triangle,
    otherwise put them in all the cells.
    '''
    # Import what you need
    import numpy as np

    # Get the indices you need
    if tri:
        i_inds, j_inds = np.triu_indices_from(P_mat, k=0)
    else:
        i_inds, j_inds = np.triu_indices_from(P_mat, k=-P_mat.shape[0])

    # Loop through all the measures and fill the arrays
    for i, j in zip(i_inds, j_inds):
        # Figure out the text you're going to put on the plot
        # (the original first test was `0.01 < p < 0.05`, which left
        # p == 0.01 unstarred; the boundary is closed here)
        star = ''
        if 0.01 <= P_mat[i, j] < 0.05:
            star = '*'
        elif 0.001 <= P_mat[i, j] < 0.01:
            star = '**'
        elif P_mat[i, j] < 0.001:
            star = '***'
        text = ax.text(i, j, star,
                       horizontalalignment='center',
                       verticalalignment='center',
                       color='k')
    return ax
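# Minimal usage sketch for add_stars. The 5x5 p-value matrix and the imshow
# styling below are made up for illustration; they are not from the original.
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
P = rng.uniform(0, 0.08, size=(5, 5))   # fake p-values
fig, ax = plt.subplots()
ax.imshow(np.ones_like(P), cmap='Greys', vmin=0, vmax=2)
add_stars(ax, P, tri=True)
plt.show()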
import numpy as np

def _expected_kid_and_std(real_imgs, gen_imgs, max_block_size=1024):
    n_r, dim = real_imgs.shape
    n_g = gen_imgs.shape[0]

    n_blocks = int(np.ceil(max(n_r, n_g) / max_block_size))

    sizes_r = np.full(n_blocks, n_r // n_blocks)
    to_patch = n_r - n_blocks * (n_r // n_blocks)
    if to_patch > 0:
        sizes_r[-to_patch:] += 1
    inds_r = np.r_[0, np.cumsum(sizes_r)]
    assert inds_r[-1] == n_r

    sizes_g = np.full(n_blocks, n_g // n_blocks)
    to_patch = n_g - n_blocks * (n_g // n_blocks)
    if to_patch > 0:
        sizes_g[-to_patch:] += 1
    inds_g = np.r_[0, np.cumsum(sizes_g)]
    assert inds_g[-1] == n_g

    ests = []
    for i in range(n_blocks):
        r = real_imgs[inds_r[i]:inds_r[i + 1]]
        g = gen_imgs[inds_g[i]:inds_g[i + 1]]

        # cubic polynomial kernel on the feature dot products
        k_rr = (np.dot(r, r.T) / dim + 1)**3
        k_rg = (np.dot(r, g.T) / dim + 1)**3
        k_gg = (np.dot(g, g.T) / dim + 1)**3

        ests.append(-2 * k_rg.mean()
                    + k_rr[np.triu_indices_from(k_rr, k=1)].mean()
                    + k_gg[np.triu_indices_from(k_gg, k=1)].mean())

    var = np.var(ests, ddof=1) if len(ests) > 1 else np.nan
    return np.mean(ests), np.sqrt(var / len(ests))
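# Smoke-test sketch. Assumption: in real use the two arrays would be Inception
# features of real and generated images; random vectors are used here only to
# exercise the function.
rng = np.random.RandomState(0)
real_feats = rng.randn(2048, 64)
gen_feats = rng.randn(2048, 64) + 0.05
kid, kid_std = _expected_kid_and_std(real_feats, gen_feats)
print('KID estimate: {:.4f} +/- {:.4f}'.format(kid, kid_std))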
def get_measurement_polynomials(self, noise=0, seed=0):
    np.random.seed(seed)
    k, d = self.k, self.d
    params = self.get_parameters()
    R = ring([x for x, _ in params], RR)[0]
    names = {str(x): R(x) for x in R.symbols}
    xs = array([[names[self.x(i, j)] for j in range(k)] for i in range(d)])
    params = [(names[x], v) for x, v in params]

    # Second order moments (TODO: 3rd order moments)
    P = zeros((d, d), dtype=object)
    p = zeros((d,), dtype=object)
    for i in range(d):
        p[i] = sum(xs[i, k_] for k_ in range(k))  # / k
        for j in range(i, d):
            P[i, j] = sum(xs[i, k_] * xs[j, k_] for k_ in range(k))  # / k

    # Project and profit
    m = zeros((d,))
    M = zeros((d, d))
    for i in range(d):
        m[i] = p[i].evaluate(params)
        for j in range(i, d):
            M[i, j] = P[i, j].evaluate(params)
    M = M + noise * np.random.randn(d, d)
    m = m + noise * np.random.randn(d)
    # TODO: Something is wrong here
    # m = M.sum(1)

    # Finally return values.
    return R, ([f - f_ for f, f_ in zip(p.flatten(), m.flatten())]
               + [f - f_ for f, f_ in zip(P[triu_indices_from(P)],
                                          M[triu_indices_from(M)])])
def plot_clustering_similarity(results, plot_dir=None, verbose=False, ext='png'):
    HCA = results.HCA
    # get all clustering solutions
    clusterings = HCA.results.items()
    # plot cluster agreement across embedding spaces
    names = [k for k, v in clusterings]
    cluster_similarity = np.zeros((len(clusterings), len(clusterings)))
    cluster_similarity = pd.DataFrame(cluster_similarity, index=names, columns=names)
    distance_similarity = np.zeros((len(clusterings), len(clusterings)))
    distance_similarity = pd.DataFrame(distance_similarity, index=names, columns=names)
    for clustering1, clustering2 in combinations(clusterings, 2):
        name1 = clustering1[0].split('-')[-1]
        name2 = clustering2[0].split('-')[-1]
        # record similarity of distance_df
        dist_corr = np.corrcoef(squareform(clustering1[1]['distance_df']),
                                squareform(clustering2[1]['distance_df']))[1, 0]
        distance_similarity.loc[name1, name2] = dist_corr
        distance_similarity.loc[name2, name1] = dist_corr
        # record similarity of clustering of dendrogram
        # (upper triangle: adjusted Rand, lower triangle: adjusted MI)
        clusters1 = clustering1[1]['labels']
        clusters2 = clustering2[1]['labels']
        rand_score = adjusted_rand_score(clusters1, clusters2)
        MI_score = adjusted_mutual_info_score(clusters1, clusters2)
        cluster_similarity.loc[name1, name2] = rand_score
        cluster_similarity.loc[name2, name1] = MI_score
    with sns.plotting_context(context='notebook', font_scale=1.4):
        clust_fig = plt.figure(figsize=(12, 12))
        sns.heatmap(cluster_similarity, square=True)
        plt.title('Cluster Similarity: TRIL: Adjusted MI, TRIU: Adjusted Rand',
                  y=1.02)
        dist_fig = plt.figure(figsize=(12, 12))
        sns.heatmap(distance_similarity, square=True)
        plt.title('Distance Similarity, metric: %s' % HCA.dist_metric, y=1.02)
    if plot_dir is not None:
        save_figure(clust_fig,
                    path.join(plot_dir, 'cluster_similarity_across_measures.%s' % ext),
                    {'bbox_inches': 'tight'})
        save_figure(dist_fig,
                    path.join(plot_dir, 'distance_similarity_across_measures.%s' % ext),
                    {'bbox_inches': 'tight'})
        plt.close(clust_fig)
        plt.close(dist_fig)
    if verbose:
        # assess relationship between the two agreement measures
        rand_scores = cluster_similarity.values[np.triu_indices_from(cluster_similarity, k=1)]
        MI_scores = cluster_similarity.T.values[np.triu_indices_from(cluster_similarity, k=1)]
        score_consistency = np.corrcoef(rand_scores, MI_scores)[0, 1]
        print('Correlation between measures of cluster consistency: %.2f'
              % score_consistency)
def mat2vec(m, include_diag=False):
    # Hack to be compatible with matlab column-wise instead of row-wise
    if include_diag:
        inddown = np.triu_indices_from(m, 0)
    else:
        inddown = np.triu_indices_from(m, 1)
    inddown = (inddown[1], inddown[0])
    return m[inddown]
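# Illustration of the MATLAB-style (column-wise) ordering mat2vec produces;
# a throwaway 3x3 example, not from the original code.
import numpy as np

m = np.arange(9).reshape(3, 3)
print(mat2vec(m))                      # [3 6 7] == m[1,0], m[2,0], m[2,1]
print(mat2vec(m, include_diag=True))   # [0 3 6 4 7 8], lower triangle column-wise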
def test_simple_hessenberg_trafo():
    # Made up discrete time TF
    G = Transfer([1., -8., 28., -58., 67., -30.],
                 poly([1, 2, 3., 2, 3., 4, 1 + 1j, 1 - 1j]), dt=0.1)
    H, _ = hessenberg_realization(G, compute_T=1, form='c', invert=1)
    assert_(not np.any(H.a[triu_indices_from(H.a, k=2)]))
    assert_(not np.any(H.b[:-1, 0]))
    H = hessenberg_realization(G, form='o', invert=1)
    assert_(not np.any(H.c[0, :-1]))
    assert_(not np.any(H.a.T[triu_indices_from(H.a, k=2)]))
def corr(chroma):
    """Chroma correlation coefficient fingerprints. After [1].

    Args:
        chroma (2d-array): 2d-array containing the chroma features.

    Returns:
        list: 12 fingerprints (1d-array), one for each key.

    [1] Van Balen, J., Bountouridis, D., Wiering, F., & Veltkamp, R. C. (2014).
        Cognition-inspired Descriptors for Scalable Cover Song Retrieval.
        In Proc. International Society for Music Information Retrieval
        Conference.
    """
    fp = np.corrcoef(chroma, rowvar=0)
    fp_12 = [np.roll(np.roll(fp, i, 0), i, 1) for i in range(12)]

    # flatten
    upper = np.triu_indices_from(fp, k=1)
    fp_12 = [fp[upper] for fp in fp_12]

    return fp_12
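# Quick shape check. The random "chroma" is a stand-in; real input would be a
# (frames x 12) chroma feature matrix, since rowvar=0 treats columns as variables.
import numpy as np

chroma = np.random.rand(200, 12)
fps = corr(chroma)
assert len(fps) == 12
assert fps[0].shape == (66,)   # 12 * 11 / 2 upper-triangle entries per key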
def lapointe_random_uniform_ultrametric(n, prng=None):
    """Generate a uniform random ultrametric over n points using the method
    of Lapointe."""
    if prng is None:
        prng = _np.random.RandomState()

    fusion_levels = prng.uniform(0, 1, n - 1)

    ultrametric = _np.zeros((n, n))
    current_diag_inds = _off_diagonal_indices(n, 1)
    ultrametric[current_diag_inds] = fusion_levels

    for j in range(2, n):
        prev_diag_inds = current_diag_inds
        current_diag_inds = _off_diagonal_indices(n, j)
        prev_diag = ultrametric[prev_diag_inds]
        current_diag = _np.maximum(prev_diag[:-1], prev_diag[1:])
        ultrametric[current_diag_inds] = current_diag

    ultrametric = ultrametric + ultrametric.T

    i, j = _np.triu_indices_from(ultrametric)
    shuffle = prng.permutation(_np.arange(n))
    shuffled_ultrametric = _np.zeros_like(ultrametric)
    shuffled_ultrametric[i, j] = ultrametric[shuffle[i], shuffle[j]]

    # return the shuffled matrix in condensed form (the original returned the
    # unshuffled `ultrametric`, leaving the permutation unused)
    return _distance.squareform(shuffled_ultrametric, checks=False)
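# Usage sketch, assuming the module's `_np`/`_distance` aliases and the
# `_off_diagonal_indices` helper are in scope (they are not shown above).
d = lapointe_random_uniform_ultrametric(6, _np.random.RandomState(42))
print(d.shape)   # condensed distance vector: 6 * 5 / 2 == 15 entries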
def test_pairplot(self):

    vars = ["x", "y", "z"]
    g = pairplot(self.df)

    for ax in g.diag_axes:
        nt.assert_equal(len(ax.patches), 10)

    for i, j in zip(*np.triu_indices_from(g.axes, 1)):
        ax = g.axes[i, j]
        x_in = self.df[vars[j]]
        y_in = self.df[vars[i]]
        x_out, y_out = ax.collections[0].get_offsets().T
        npt.assert_array_equal(x_in, x_out)
        npt.assert_array_equal(y_in, y_out)

    for i, j in zip(*np.tril_indices_from(g.axes, -1)):
        ax = g.axes[i, j]
        x_in = self.df[vars[j]]
        y_in = self.df[vars[i]]
        x_out, y_out = ax.collections[0].get_offsets().T
        npt.assert_array_equal(x_in, x_out)
        npt.assert_array_equal(y_in, y_out)

    for i, j in zip(*np.diag_indices_from(g.axes)):
        ax = g.axes[i, j]
        nt.assert_equal(len(ax.collections), 0)

    plt.close("all")
def test_map_diag_and_offdiag(self):

    vars = ["x", "y", "z"]
    g = ag.PairGrid(self.df)
    g.map_offdiag(plt.scatter)
    g.map_diag(plt.hist)

    for ax in g.diag_axes:
        nt.assert_equal(len(ax.patches), 10)

    for i, j in zip(*np.triu_indices_from(g.axes, 1)):
        ax = g.axes[i, j]
        x_in = self.df[vars[j]]
        y_in = self.df[vars[i]]
        x_out, y_out = ax.collections[0].get_offsets().T
        npt.assert_array_equal(x_in, x_out)
        npt.assert_array_equal(y_in, y_out)

    for i, j in zip(*np.tril_indices_from(g.axes, -1)):
        ax = g.axes[i, j]
        x_in = self.df[vars[j]]
        y_in = self.df[vars[i]]
        x_out, y_out = ax.collections[0].get_offsets().T
        npt.assert_array_equal(x_in, x_out)
        npt.assert_array_equal(y_in, y_out)

    for i, j in zip(*np.diag_indices_from(g.axes)):
        ax = g.axes[i, j]
        nt.assert_equal(len(ax.collections), 0)
def plot_corr(file, score, stat, ind_var, brain_type):
    # seaborn
    sns.set(style="white")
    # import the dataframe
    dt = pd.read_csv(file)

    # Compute the correlation matrix
    corr = dt.corr()

    ### Create the matrix figure with seaborn
    # Generate a mask for the upper triangle
    # (np.bool is removed in modern NumPy; plain bool is equivalent)
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(len(ind_var), len(ind_var)))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, annot=False, ax=ax)
    plt.subplots_adjust(left=0.30, bottom=0.30)
    plt.savefig(os.path.join(stat, score,
                             "heatmap_" + score + "_" + stat + "_" + brain_type + ".png"))
    plt.close()
    return corr
def map_upper(self, func, **kwargs):
    """Plot with a bivariate function on the upper diagonal subplots.

    Parameters
    ----------
    func : callable plotting function
        Must take x, y arrays as positional arguments and draw onto the
        "currently active" matplotlib Axes.

    """
    kw_color = kwargs.pop("color", None)
    for i, j in zip(*np.triu_indices_from(self.axes, 1)):
        hue_grouped = self.data.groupby(self.hue_vals)
        for k, (label_k, data_k) in enumerate(hue_grouped):
            ax = self.axes[i, j]
            plt.sca(ax)
            x_var = self.x_vars[j]
            y_var = self.y_vars[i]
            color = self.palette[k] if kw_color is None else kw_color
            func(data_k[x_var], data_k[y_var], label=label_k,
                 color=color, **kwargs)
        self._clean_axis(ax)
        self._update_legend_data(ax)
    if kw_color is not None:
        kwargs["color"] = kw_color
def unpad_randomize_and_flatten(self, cm):
    """
    1. Remove zero padding on Coulomb Matrix
    2. Randomly permute the rows and columns for n_samples
    3. Flatten each sample to upper triangular portion
    Returns list of feature vectors
    """
    max_atom_number = len(cm)
    atom_number = 0
    for i in cm[0]:
        if atom_number == max_atom_number:
            break
        elif i != 0.:
            atom_number += 1
        else:
            break
    upcm = cm[0:atom_number, 0:atom_number]
    row_norms = np.asarray([np.linalg.norm(row) for row in upcm], dtype=float)
    rng = np.random.RandomState(self.seed)
    e = rng.normal(size=row_norms.size)
    p = np.argsort(row_norms + e)
    rcm = upcm[p][:, p]
    rcm = pad_array(rcm, len(cm))
    rcm = rcm[np.triu_indices_from(rcm)]
    return rcm
def LML_se(self, theta, returnGradients=False):
    self.setTheta(theta)
    K, r = self.cov(self.X, retr=True)
    Ky = K.copy()
    Ky += np.eye(self.X.shape[0]) * self.var_n + np.eye(self.X.shape[0]) * 1e-8
    L = self.cholSafe(Ky)
    WlogDet = 2. * np.sum(np.log(np.diag(L)))
    alpha, status = dpotrs(L, self.Y, lower=1)
    dataFit = -np.sum(alpha * self.Y)
    modelComplexity = -self.Y.shape[1] * WlogDet
    normalizer = -self.Y.size * log2pi
    logMarginalLikelihood = 0.5 * (dataFit + modelComplexity + normalizer)
    if not returnGradients:
        return logMarginalLikelihood
    else:
        Wi, status = dpotri(-L, lower=1)
        Wi = np.asarray(Wi)
        # copy bottom triangle to top triangle
        triu = np.triu_indices_from(Wi, k=1)
        Wi[triu] = Wi.T[triu]
        # dL = change in LML, dK is change in Kernel(K)
        dL_dK = 0.5 * (np.dot(alpha, alpha.T) - self.Y.shape[1] * Wi)
        dL_dVarn = np.diag(dL_dK).sum()
        varfGradient = np.sum(K * dL_dK) / self.var_f
        dK_dr = -r * K
        dL_dr = dK_dr * dL_dK
        lengthscaleGradient = -np.sum(dL_dr * r) / self.charLen
        grads = np.array([varfGradient, lengthscaleGradient, dL_dVarn])
        return logMarginalLikelihood, grads
def compute_PR_vectors(corr_mat, batches, verbosity):
    triu_rows, triu_cols = np.triu_indices_from(corr_mat, k=1)
    corr_vec = corr_mat[triu_rows, triu_cols]
    batches_1 = batches[triu_rows]
    batches_2 = batches[triu_cols]
    batches_match = np.array(batches_1 == batches_2, dtype=int)

    # Get the number of true positives so we can use that as recall instead of
    # a fraction from 0 to 1.
    num_true_positives = np.sum(batches_match)

    if verbosity >= 1:
        print('\t\tnumber of NaN correlations: {}'.format(np.sum(np.isnan(corr_vec))))

    precision, recall, thresholds = precision_recall_curve(y_true=batches_match,
                                                           probas_pred=corr_vec)

    # This calculates the "normalized" AUC, since the recall values have not
    # been multiplied by the number of true positives yet.
    norm_AUPR = auc(recall, precision)

    # Reverse the orders so that they can easily be printed to file
    # with the highest recalls first
    return precision[::-1], recall[::-1] * num_true_positives, thresholds[::-1], norm_AUPR
def get_candidate_taus_above_threshold(Ds, thresh, **kwargs):
    upper_tri_Ds = Ds[np.triu_indices_from(Ds, k=1)]

    if "nz_frac" in kwargs:
        nz_frac = float(kwargs["nz_frac"])
        common.print_log("Setting tau so that fraction of distances below threshold = {0}".format(nz_frac))

        all_taus = np.array(sorted(upper_tri_Ds))
        n_all_taus = len(all_taus)
        idx = min(max(int(nz_frac * n_all_taus), 0), n_all_taus - 1)
        tau = all_taus[idx]
        if tau < thresh:
            common.print_log("Parameter tau was set below the minimum value which makes the graph connected. Changing it to {0}".format(thresh))
            tau = thresh
        candidate_taus = np.array([tau])
    else:
        grid_size = int(kwargs.get("grid_size", 20))
        linspace_tau = bool(kwargs.get("linspace_tau", False))
        if linspace_tau:
            candidate_taus = np.linspace(thresh, np.max(Ds[Ds > 0]), grid_size)
        else:
            all_taus = np.array(sorted(upper_tri_Ds[upper_tri_Ds > thresh]))
            n_all_taus = len(all_taus)
            tau_indices = np.asarray(np.concatenate([np.linspace(0, 1, grid_size)]) * (n_all_taus - 1),
                                     dtype=int)
            candidate_taus = sorted(all_taus[tau_indices])

    nz_fracs = [100. * np.sum(upper_tri_Ds <= tau) / len(upper_tri_Ds)
                for tau in candidate_taus]
    common.print_log("Found {0} candidate thresholds:".format(len(candidate_taus)), candidate_taus)
    common.print_log("Percentage of distances below threshold:", nz_fracs)

    return candidate_taus
def test_pairplot(self):

    vars = ["x", "y", "z"]
    g = ag.pairplot(self.df)

    for ax in g.diag_axes:
        assert len(ax.patches) > 1

    for i, j in zip(*np.triu_indices_from(g.axes, 1)):
        ax = g.axes[i, j]
        x_in = self.df[vars[j]]
        y_in = self.df[vars[i]]
        x_out, y_out = ax.collections[0].get_offsets().T
        npt.assert_array_equal(x_in, x_out)
        npt.assert_array_equal(y_in, y_out)

    for i, j in zip(*np.tril_indices_from(g.axes, -1)):
        ax = g.axes[i, j]
        x_in = self.df[vars[j]]
        y_in = self.df[vars[i]]
        x_out, y_out = ax.collections[0].get_offsets().T
        npt.assert_array_equal(x_in, x_out)
        npt.assert_array_equal(y_in, y_out)

    for i, j in zip(*np.diag_indices_from(g.axes)):
        ax = g.axes[i, j]
        nt.assert_equal(len(ax.collections), 0)

    g = ag.pairplot(self.df, hue="a")
    n = len(self.df.a.unique())

    for ax in g.diag_axes:
        assert len(ax.lines) == n
        assert len(ax.collections) == n
def test_pairplot_reg(self):

    vars = ["x", "y", "z"]
    g = ag.pairplot(self.df, diag_kind="hist", kind="reg")

    for ax in g.diag_axes:
        nt.assert_equal(len(ax.patches), 10)

    for i, j in zip(*np.triu_indices_from(g.axes, 1)):
        ax = g.axes[i, j]
        x_in = self.df[vars[j]]
        y_in = self.df[vars[i]]
        x_out, y_out = ax.collections[0].get_offsets().T
        npt.assert_array_equal(x_in, x_out)
        npt.assert_array_equal(y_in, y_out)
        nt.assert_equal(len(ax.lines), 1)
        nt.assert_equal(len(ax.collections), 2)

    for i, j in zip(*np.tril_indices_from(g.axes, -1)):
        ax = g.axes[i, j]
        x_in = self.df[vars[j]]
        y_in = self.df[vars[i]]
        x_out, y_out = ax.collections[0].get_offsets().T
        npt.assert_array_equal(x_in, x_out)
        npt.assert_array_equal(y_in, y_out)
        nt.assert_equal(len(ax.lines), 1)
        nt.assert_equal(len(ax.collections), 2)

    for i, j in zip(*np.diag_indices_from(g.axes)):
        ax = g.axes[i, j]
        nt.assert_equal(len(ax.collections), 0)
def scatterplot_matrix(data, attNames, **kwargs):
    rows, atts = data.shape
    fig, axes = plt.subplots(nrows=atts, ncols=atts, figsize=(30, 30))
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    for ax in axes.flat:
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        if ax.is_first_col():
            ax.yaxis.set_ticks_position('left')
        if ax.is_last_col():
            ax.yaxis.set_ticks_position('right')
        if ax.is_first_row():
            ax.xaxis.set_ticks_position('top')
        if ax.is_last_row():
            ax.xaxis.set_ticks_position('bottom')

    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for x, y in [(i, j), (j, i)]:
            axes[x, y].plot(data[y], data[x], **kwargs)

    # Label the diagonal subplots...
    for i, label in enumerate(attNames):
        axes[i, i].annotate(label, (0.5, 0.5), xycoords='axes fraction',
                            ha='center', va='center')

    for i, j in zip(range(atts), itertools.cycle((-1, 0))):
        axes[j, i].xaxis.set_visible(True)
        axes[i, j].yaxis.set_visible(True)

    return fig
def plot_corr(df, size=10):
    """Function plots a graphical correlation matrix for each pair of columns
    in the dataframe.

    Input:
        df: pandas DataFrame
        size: vertical and horizontal size of the plot"""
    import matplotlib.pyplot as plt
    from matplotlib import cm
    import numpy as np

    corr = df.corr()
    label = df.corr()
    mask = np.tri(corr.shape[0], k=-1)
    corr = np.ma.array(corr, mask=mask)
    mask[np.triu_indices_from(mask)] = True
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    cmap = cm.get_cmap("jet", 10)
    cmap.set_bad("w")
    plt.xticks(range(len(label.columns)), label.columns, rotation=90)
    plt.yticks(range(len(label.columns)), label.columns)
    ax.imshow(corr, interpolation="nearest", cmap=cmap)
    plt.show()
def tangent_space(covmats, Cref):
    """Project a set of covariance matrices in the tangent space according to
    the given reference point Cref

    :param covmats: Covariance matrices set, Ntrials X Nchannels X Nchannels
    :param Cref: The reference covariance matrix
    :returns: the Tangent space, a matrix of
        Ntrials X (Nchannels * (Nchannels + 1) / 2)
    """
    Nt, Ne, Ne = covmats.shape
    Cm12 = invsqrtm(Cref)
    idx = numpy.triu_indices_from(Cref)
    # integer division: Ne * (Ne + 1) / 2 is a float under Python 3
    T = numpy.empty((Nt, Ne * (Ne + 1) // 2))
    coeffs = (numpy.sqrt(2) * numpy.triu(numpy.ones((Ne, Ne)), 1) +
              numpy.eye(Ne))[idx]
    for index in range(Nt):
        tmp = numpy.dot(numpy.dot(Cm12, covmats[index, :, :]), Cm12)
        tmp = logm(tmp)
        T[index, :] = numpy.multiply(coeffs, tmp[idx])
    return T
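# Usage sketch, assuming pyriemann-style helpers invsqrtm/logm are in scope;
# the SPD inputs are built from random data purely for illustration.
import numpy

rng = numpy.random.RandomState(0)
X = rng.randn(10, 8, 100)
covmats = numpy.array([x @ x.T / 100 for x in X])   # 10 SPD 8x8 matrices
Cref = covmats.mean(axis=0)                          # crude reference point
T = tangent_space(covmats, Cref)
print(T.shape)   # (10, 36) since 8 * 9 // 2 == 36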
def scatterplot_matrix(data, names, **kwargs):
    """Plots a scatterplot matrix of subplots.  Each row of "data" is plotted
    against other rows, resulting in a nrows by nrows grid of subplots with the
    diagonal subplots labeled with "names".  Additional keyword arguments are
    passed on to matplotlib's "plot" command.  Returns the matplotlib figure
    object containing the subplot grid."""
    numvars, numdata = data.shape
    fig, axes = plt.subplots(nrows=numvars, ncols=numvars, figsize=(8, 8))
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    for ax in axes.flat:
        # Hide all ticks and labels
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        # Set up ticks only on one side for the "edge" subplots...
        if ax.is_first_col():
            ax.yaxis.set_ticks_position('left')
        if ax.is_last_col():
            ax.yaxis.set_ticks_position('right')
        if ax.is_first_row():
            ax.xaxis.set_ticks_position('top')
        if ax.is_last_row():
            ax.xaxis.set_ticks_position('bottom')

    # Plot the data.
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for x, y in [(i, j), (j, i)]:
            axes[x, y].plot(data[x], data[y], **kwargs)
def convert_file(in_file, out_file, factors=[.25, 1, 4]):
    # Python 3: iteritems/iterkeys and print statements modernized
    with h5py.File(in_file, 'r') as inp:
        func_ks = [(df, k) for df, g in inp.items() if df != '_meta'
                   for k in g.keys()]

    meds = {}
    for df, k in func_ks:
        with h5py.File(in_file, 'r') as inp:
            divs = inp[df][k][()]

        if df in meds:
            med = meds[df]
        else:
            meds[df] = med = np.median(divs[np.triu_indices_from(divs)])

        for factor in factors:
            name = 'median * {}'.format(factor)
            print('/'.join((df, k, name)))

            # open for append; recent h5py defaults to read-only
            with h5py.File(out_file, 'a') as out:
                g = out.require_group(df).require_group(k)
                if name in g:
                    print('\talready there')
                    continue

            km = sdm.sdm.make_km(divs, med * factor)
            with h5py.File(out_file, 'a') as out:
                out[df][k][name] = km
def plot_2_corr_heatmaps(corr1, corr2, labels, title1, title2):
    fig = plt.figure(figsize=(9, 8))
    gs = gridspec.GridSpec(1, 2)
    ax1 = fig.add_subplot(gs[0, 0])
    ax2 = fig.add_subplot(gs[0, 1])
    sns.set(style="white")

    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr1, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr1, mask=mask, cmap=cmap, vmax=.3, square=True,
                xticklabels=labels, yticklabels=labels, linewidths=.5,
                ax=ax1, cbar_kws={"shrink": .3}, annot=True)
    ax1.set_title(title1)
    sns.heatmap(corr2, mask=mask, cmap=cmap, vmax=.3, square=True,
                xticklabels=labels, yticklabels=labels, linewidths=.5,
                ax=ax2, cbar_kws={"shrink": .3}, annot=True)
    ax2.set_title(title2)
    fig.tight_layout()
    plt.show()
def _process(self, data):
    for x in data:
        if data[x][1] not in self.data:
            # prepares the data to visualise the xcor matrix of a specific
            # batch number.
            self.data[data[x][1]] = {}
            self.data[data[x][1]]['matrix'] = numpy.identity(self.size)
            self.data[data[x][1]]['ro_count'] = 0

        self.data[data[x][1]]['matrix'][(data[x][2][1], data[x][2][0])] = data[x][0]
        # self.addToProvState('batch_'+str(data[x][1]), self.data[data[x][1]]['matrix'],
        #                     metadata={'matrix': str(self.data[data[x][1]]['matrix'])},
        #                     dep=['batch_'+str(data[x][1])], ignore_inputs=False)
        self.data[data[x][1]]['ro_count'] += 1

        if self.data[data[x][1]]['ro_count'] == (self.size * (self.size - 1)) // 2:
            matrix = self.data[data[x][1]]['matrix']
            d = pd.DataFrame(data=matrix, columns=range(0, self.size),
                             index=range(0, self.size))

            mask = numpy.zeros_like(d, dtype=bool)
            mask[numpy.triu_indices_from(mask)] = True

            # Set up the matplotlib figure
            f, ax = plt.subplots(figsize=(11, 9))

            # Generate a custom diverging colormap
            cmap = sns.diverging_palette(220, 10, as_cmap=True)

            # Draw the heatmap with the mask and correct aspect ratio
            sns.heatmap(d, mask=mask, cmap=cmap, vmax=1, square=True,
                        linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
            # seaborn no longer re-exports pyplot as sns.plt
            plt.savefig("./plots/" + str(data[x][1]) + "_plot.png")

            self.write('output', (matrix, data[x][1]),
                       metadata={'matrix': str(d), 'batch': str(data[x][1])},
                       dep=['batch_' + str(data[x][1])])
def main():
    # Load list of pointing IDs
    todo_file = rawdata_dir + 'todo_list.ascii.dat'
    ID_list = np.genfromtxt(todo_file, skip_header=1, usecols=[0],
                            unpack=True, dtype=str)
    N_los = len(ID_list)

    # Load bin centers
    bins_file = 'rbins.ascii.dat'
    bin_centers = np.genfromtxt(bins_file, skip_header=1, usecols=[2],
                                unpack=True)
    N_bins = len(bin_centers)

    # Round bin centers to three decimal places
    bin_centers = np.round(bin_centers, 3)

    # Make array of column names for pandas DataFrame
    col_names = []
    for i in range(N_bins):
        name = str(bin_centers[i])
        col_names.append(name)

    # Recast as array
    col_names = np.asarray(col_names)

    # Create list of png's for use in making gif
    png_list = []

    # Calculate correlation matrix for each l.o.s.
    for ID in ID_list:
        # Load counts from 1000 mocks with pandas
        # Each row is a mock, each column is a bin
        counts_filename = counts_dir + 'counts_all_' + ID + '.dat'
        DF = pd.read_csv(counts_filename, sep=r'\s+', names=col_names)

        # Calculate correlation matrix
        corr = DF.corr()

        # plot heatmap of matrix
        plt.clf()
        sns.set(style="white")
        mask = np.zeros_like(corr, dtype=bool)
        mask[np.triu_indices_from(mask)] = True
        f, ax = plt.subplots(figsize=(11, 9))
        cmap = sns.diverging_palette(145, 280, s=85, l=25, n=7, as_cmap=True)
        sns.heatmap(corr, mask=mask, cmap=cmap, square=True, annot=True,
                    xticklabels=col_names, yticklabels=col_names,
                    linewidths=.5, cbar_kws={"shrink": .5}, ax=ax,
                    vmin=-1.0, vmax=1.0)
        plt.title('Correlation Matrix for l.o.s. ' + ID, fontsize=20)
        plt.xlabel('Bin Center (kpc)', fontsize=18)
        plt.ylabel('Bin Center (kpc)', fontsize=18)
        fig_name = plots_dir + 'corr_matrix_' + ID + '.png'
        plt.savefig(fig_name)
        png_list.append(fig_name)

    gif_name = plots_dir + 'corr_matrix.gif'
    GIF_MOVIE(png_list, gif_name)
def find_collinearity_columns(correlation):
    """
    Find the columns responsible for multicollinearity.

    Approach:
    0. Check whether `correlation` has full rank. If not, multicollinearity
       exists.
    1. Find the entry of `correlation` with the largest absolute value, say
       at (row_idx, col_idx), i.e. columns A and C.
    2. Compute the mean absolute correlation of column A and of column C with
       the other columns; drop whichever is more correlated with the rest,
       and record its name.
    Repeat the steps above.

    @params: correlation, correlation matrix (DataFrame).
    @returns: list of column names.
    """
    bad_columns = []
    while True:
        rank = np.linalg.matrix_rank(correlation.values)
        if rank == correlation.shape[0]:
            break

        correlation_copy = correlation.copy()
        correlation = correlation.abs()
        # zero out the upper triangle, including the diagonal
        correlation.values[np.triu_indices_from(correlation.values, 0)] = 0.0
        col_idx, row_idx = correlation.unstack().idxmax()  # (col_idx, row_idx)

        if correlation_copy.loc[row_idx, :].mean() > correlation_copy.loc[:, col_idx].mean():
            bad_column = row_idx
        else:
            bad_column = col_idx
        bad_columns.append(bad_column)

        # remove that column from the rows/columns of the correlation matrix
        correlation_copy.drop(bad_column, axis=0, inplace=True)
        correlation_copy.drop(bad_column, axis=1, inplace=True)
        correlation = correlation_copy
    return bad_columns
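# Hypothetical call: a DataFrame with one exactly collinear column, so the
# correlation matrix is rank-deficient and that column gets flagged.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'a': rng.randn(100), 'b': rng.randn(100)})
df['c'] = df['a'] + df['b']          # exact linear combination
print(find_collinearity_columns(df.corr()))   # expected: ['c']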
def plot_feature_corr(X, f_sz=(11, 9)):
    """
    Purpose: plot a correlation matrix for the features in X

    Inputs:
        X: a pandas dataframe of feature values
        f_sz: a tuple for the figure size

    Output: the correlation matrix of X
    """
    sns.set(style="white")

    # Compute the correlation matrix
    corr = X.corr()

    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=f_sz)

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, square=True,
                linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

    return corr
def __init__(self, master, x_train, y_train, x_test, y_test, evaluator, df, console):
    Tk.Frame.__init__(self, master)
    self.x_train = x_train
    self.y_train = y_train
    self.x_test = x_test
    self.y_test = y_test
    self.evaluator = evaluator
    self.df = df
    self.console = console

    frame_train = Tk.Frame(self)
    frame_train.pack(fill=Tk.BOTH, expand=1, padx=15, pady=15)
    plt.figure(figsize=(12, 20))
    plt.subplot(111)
    # white background
    sns.set(style="white")
    # feature correlation matrix (contains the class label as well as the features)
    corr = df.corr()
    # mask the upper triangle of the matrix
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # draw the heatmap
    f, ax = plt.subplots(figsize=(11, 11))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, square=True,
                linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
    plt.xticks(rotation=-90)
    plt.yticks(rotation=0)
    plt.title("Cardiotocography \"Feature-Feature\" & \"Feature-Label\" Correlations")
    self.attach_figure(plt.gcf(), frame_train)
def threshold_matrix(M, cost):
    '''
    M is the full association matrix.
    cost is the percentage (0 to 100) at which you'd like to threshold.

    threshold_matrix first creates a copy of the input matrix, then sets all
    diagonal values to 0. It next calculates the spanning tree and ensures
    that those edges are *always* included in the thresholded matrix. It then
    sets all values below the appropriate percentile to 0.
    '''
    # Make a copy of the matrix
    thr_M = np.copy(M)

    # Set all diagonal values to -999
    thr_M[np.diag_indices_from(thr_M)] = -999

    # Calculate the spanning tree over the strongest associations.
    # (The original passed weight='weight'*-1, which evaluates to the empty
    # string; the apparent intent was to negate the weights, i.e. a maximum
    # spanning tree.)
    G = nx.from_numpy_matrix(M)
    mst = nx.maximum_spanning_tree(G, weight='weight')

    # Calculate the threshold value
    thr = np.percentile(thr_M[np.triu_indices_from(thr_M, k=1)], cost)

    # Set all values that are less than the threshold to 0
    thr_M[thr_M < thr] = 0

    # Set all values that are not zero to 1
    thr_M[thr_M != 0] = 1

    # Re-insert the spanning tree edges, as promised in the docstring
    # (the original computed mst but never applied it)
    for u, v in mst.edges():
        thr_M[u, v] = thr_M[v, u] = 1

    return thr_M
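# Hypothetical call on a random symmetric association matrix; cost=75 keeps
# roughly the top quarter of edges (plus the spanning-tree edges).
import numpy as np
import networkx as nx

rng = np.random.RandomState(0)
A = rng.rand(20, 20)
A = (A + A.T) / 2
adj = threshold_matrix(A, cost=75)
print(int(adj.sum()))   # number of surviving entries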
def remove_lower_triangle(matrix):
    """ remove all values in the lower triangle of a matrix """
    return matrix[np.triu_indices_from(matrix)].A1
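# Example with np.matrix, which the .A1 attribute above implies (np.matrix is
# discouraged in modern NumPy; this just mirrors what the function expects).
import numpy as np

m = np.matrix([[1, 2], [3, 4]])
print(remove_lower_triangle(m))   # [1 2 4]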
def dynamic_heatmap(df, columns, fontsize=20, annot=False, palette=None,
                    figsize=(15, 10), squaresize=500):
    """Plots a heatmap whose square sizes vary with the correlation strength.

    Adapted from:
    https://towardsdatascience.com/better-heatmaps-and-correlation-matrix-plots-in-python-41445d0f2bec"""
    plt.figure(figsize=figsize)
    corr = df[columns].corr()
    sns.set(style="dark")
    grid_bg_color = sns.axes_style()['axes.facecolor']

    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Unpivot the dataframe, so we can get pairs of arrays for x and y
    corr = pd.melt(corr.reset_index(), id_vars='index')
    corr.columns = ['x', 'y', 'value']
    x = corr['x']
    y = corr['y']
    size = corr['value'].abs()

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=figsize)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                       horizontalalignment='right')

    # Mapping from column names to integer coordinates
    x_labels = [v for v in sorted(x.unique())]
    y_labels = [v for v in sorted(y.unique())]
    x_to_num = {p[1]: p[0] for p in enumerate(x_labels)}
    y_to_num = {p[1]: p[0] for p in enumerate(y_labels)}

    size_scale = squaresize

    if palette:
        n_colors = len(palette)
    else:
        n_colors = 256  # Use 256 colors for the diverging color palette
        palette = sns.diverging_palette(20, 220, n=n_colors)  # Create the palette

    # Range of values that will be mapped to the palette, i.e. min and max
    # possible correlation
    color_min, color_max = [-1, 1]
    color = corr["value"]

    def value_to_color(val):
        # position of value in the input range, relative to its length
        val_position = float((val - color_min)) / (color_max - color_min)
        ind = int(val_position * (n_colors - 1))  # target index in the color palette
        return palette[ind]

    plot_grid = plt.GridSpec(1, 15, hspace=0.2, wspace=0.1)  # Setup a 1x15 grid
    ax = plt.subplot(plot_grid[:, :-1])  # Use the leftmost 14 columns for the main plot

    ax.scatter(
        x=x.map(x_to_num),              # Use mapping for x
        y=y.map(y_to_num),              # Use mapping for y
        s=size * size_scale,            # Vector of square sizes, proportional to size parameter
        c=color.apply(value_to_color),  # Vector of square colors, mapped to color palette
        marker='s'                      # Use square as scatterplot marker
    )

    # Show column labels on the axes
    ax.set_xticks([x_to_num[v] for v in x_labels])
    ax.set_xticklabels(x_labels, rotation=45, horizontalalignment='right')
    ax.set_yticks([y_to_num[v] for v in y_labels])
    ax.set_yticklabels(y_labels)

    # ax.set_fontsize(font_scale)
    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                 ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(fontsize)

    numbers = corr['value'].round(decimals=2)
    if annot:
        for i, txt in enumerate(numbers):
            annot_font_size = int(fontsize * size[i] * annot)
            ax.annotate(txt, (x.map(x_to_num)[i], y.map(x_to_num)[i]),
                        horizontalalignment="center",
                        verticalalignment="center",
                        color=grid_bg_color, fontweight="black",
                        fontsize=annot_font_size)

    ax.grid(False, 'major')
    ax.grid(True, 'minor')
    ax.set_xticks([t + 0.5 for t in ax.get_xticks()], minor=True)
    ax.set_yticks([t + 0.5 for t in ax.get_yticks()], minor=True)
    ax.set_xlim([-0.5, max([v for v in x_to_num.values()]) + 0.5])
    ax.set_ylim([-0.5, max([v for v in y_to_num.values()]) + 0.5])

    # Add color legend on the right side of the plot
    ax = plt.subplot(plot_grid[:, -1])  # Use the rightmost column of the plot

    col_x = [0] * len(palette)  # Fixed x coordinate for the bars
    bar_y = np.linspace(color_min, color_max, n_colors)  # y coordinates of the n_colors bars

    bar_height = bar_y[1] - bar_y[0]
    ax.barh(
        y=bar_y,
        width=[5] * len(palette),  # Make bars 5 units wide
        left=col_x,                # Make bars start at 0
        height=bar_height,
        color=palette,
        linewidth=0
    )
    ax.set_xlim(1, 2)  # Bars go from 0 to 5, so crop the plot somewhere in the middle
    ax.grid(False)     # Hide grid
    ax.set_xticks([])  # Remove horizontal ticks
    ax.set_yticks(np.linspace(min(bar_y), max(bar_y), 3))  # Ticks for min, middle and max
    ax.yaxis.tick_right()  # Show vertical ticks on the right

    plt.show()
def correlation_reduction_worker(corrMatrix, target, threshold, reductionMetrics=''):
    '''
    This function works to drop vars over threshold and heat map a pre and
    post correlation matrix of any sorts. The threshold can also be specified
    through the scenario file by "custom_correlationReduction_thresh"

    Parameters:
        corrMatrix - correlation matrix
        target - dataframe containing variables' correlation with your target variable
        threshold - your threshold for variable reduction
        reductionMetrics - one of four correlation reduction metrics:
            Pearson, Spearmans, EuclideanDistances or LogisticRegression

    Returns:
        droppedInfo - dataFrame of variables that were discarded
        corrmat - post correlation matrix of any sorts

    Writes:
        droppedInfo - dataFrame of variables that were discarded
        corrmat - post correlation matrix of any sorts
    '''
    drop = []
    seaborn.set(context="paper", font="monospace")
    corrmat = abs(corrMatrix.copy(deep=True))

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(10, 8))
    ax.set_yticks([])

    # Draw the heatmap using seaborn
    mask = np.zeros_like(corrmat, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    with seaborn.axes_style("white"):
        seaborn.heatmap(corrmat, vmax=1, mask=mask, square=True,
                        xticklabels=20, yticklabels=20, cmap='Blues')
    ax.set_title("{0} - Original Correlation Matrix".format(reductionMetrics))
    if not os.path.isfile('../outputs/variable_reduction/{}_Original.png'.format(reductionMetrics)):
        f.savefig('../outputs/variable_reduction/{}_Original.png'.format(reductionMetrics))

    # Correlation Threshold
    mid = target.drop(target.index[0], 1).T
    mid['target_abs'] = abs(mid[target.index[0]])
    mid.sort_values('target_abs', inplace=True, ascending=False)
    order = list(mid.index)
    corrMatrix = corrMatrix.reindex(order, axis=1)
    corrMatrix = corrMatrix.reindex(order)

    column_list = corrMatrix.keys()
    for col in column_list:
        # This is a temporary drop list that'll be emptied in each iteration;
        # we use it to store the dropped vars and apply to corrMatrix drop
        drop_tmp = []
        # corrMatrix drops columns and rows in place on each iteration, so we
        # need to check whether col is still in corrMatrix
        if col in corrMatrix.keys():
            for i in range(len(corrMatrix)):  # Iterating through all the vars
                # Make sure col is not compared with itself
                if col != corrMatrix.keys()[i]:
                    # Correlation Threshold Checking
                    if abs(corrMatrix[col][i]) > threshold:
                        # This is a cumulative drop list that'll be used as the
                        # output to show the summary of dropped vars.
                        # Append [dropped_var, col, corr(dropped_var, target),
                        #         corr(col, target)]
                        drop.append([corrMatrix.keys()[i], col,
                                     target[corrMatrix.keys()[i]].values[0],
                                     target[col].values[0]])
                        drop_tmp.append(corrMatrix.keys()[i])  # Append dropped_var
            corrMatrix.drop(drop_tmp, axis=1, inplace=True)  # Drop columns
            corrMatrix.drop(drop_tmp, axis=0, inplace=True)  # Drop rows

    corrmat = abs(corrMatrix.copy(deep=True))
    droppedInfo = pd.DataFrame(drop, columns=['discarded_variable',
                                              'correlated_to',
                                              'discarded_variable_correlation_to_target',
                                              'correlated_to_correlation_to_target'])

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(10, 8))
    ax.set_yticks([])

    # Draw the heatmap using seaborn
    mask = np.zeros_like(corrmat, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    with seaborn.axes_style("white"):
        seaborn.heatmap(corrmat, vmax=1, mask=mask, square=True,
                        xticklabels=20, yticklabels=20, cmap='Blues')
    ax.set_title("{} - Post Reduction Correlation Matrix - {}".format(reductionMetrics, threshold))
    f.savefig('../outputs/variable_reduction/{}_Post_Reduction_Correlation_Matrix_{}.png'
              .format(reductionMetrics, str(threshold).replace('.', '')))

    return corrmat, droppedInfo
def pipeline_for_single_instance(logger, analysis_dir, main: Record,
                                 finetune: List[Record], by: str, gt: np.ndarray):
    logger.info("Analysing results for {}".format(analysis_dir))
    main_df = main.validation_acc_dataframe(by)
    main_archit = main.grouping_subgraph_training_dataframe(by)
    main_grouping = main.grouping_numpy
    os.makedirs(analysis_dir, exist_ok=True)

    # Save raw data
    main_df.to_csv(os.path.join(analysis_dir, "val_acc_all_epochs.csv"), index=True)
    np.savetxt(os.path.join(analysis_dir, "group_info.txt"), main_grouping, "%d")

    # correlation between subgraphs
    corr_matrix = main_df.corr().values
    heatmap(corr_matrix, filepath=os.path.join(analysis_dir, "corr_heatmap"))
    np.savetxt(os.path.join(analysis_dir, "corr_heatmap.txt"), corr_matrix)

    # Consecutive tau (single)
    consecutive_taus = get_consecutive_rank_tau(main_df)
    lineplot([np.array(list(zip(main_df.index[1:], consecutive_taus)))],
             filepath=os.path.join(analysis_dir, "consecutive_tau_single"))

    # GT rank (for color reference)
    gt_rank = rankdata_greater(gt)
    # in some cases, it could be a subset of 64 subgraphs; process this later
    gt_rank_color = 1 - gt_rank / EXPECTED_SUBGRAPH_NUMBER

    # Acc variance (lineplot)
    acc_curves = [np.array(list(zip(main_df.index, main_df[i]))) for i in main_df.columns]
    subgraph_markers = [[] for _ in range(EXPECTED_SUBGRAPH_NUMBER)]
    if len(main.groups) != len(main.columns):  # hide it for ground truth
        for i, (_, row) in enumerate(main_archit.iterrows()):
            for k in filter(lambda k: k >= 0, row.values):
                subgraph_markers[k].append(i)
    else:
        logger.info("Markers hidden because groups == columns")

    lineplot(acc_curves, filepath=os.path.join(analysis_dir, "acc_curve_along_epochs"),
             color=[gt_rank_color[i] for i in main_df.columns], alpha=0.7,
             markers=[subgraph_markers[i] for i in main_df.columns],
             fmt=["-D"] * len(acc_curves))

    # Rank version of df
    df_rank = main_df.apply(rankdata_greater, axis=1, result_type="expand")
    df_rank.columns = main_df.columns

    # Rank variance (lineplot)
    rank_curves = [np.array(list(zip(df_rank.index, df_rank[i]))) for i in df_rank.columns]
    lineplot(rank_curves, filepath=os.path.join(analysis_dir, "rank_curve_along_epochs"),
             color=[gt_rank_color[i] for i in df_rank.columns], alpha=0.7,
             inverse_y=True, markers=subgraph_markers)

    # Rank variance for top-5 subgraphs found at half and end
    # recalculate for original order
    for loc in [len(main_df) // 2, len(main_df) - 1]:
        selected_rank_curves = [rank_curves[i] for i in np.argsort(-main_df.iloc[loc])[:5]]
        lineplot(selected_rank_curves, inverse_y=True,
                 filepath=os.path.join(analysis_dir,
                                       "rank_curves_along_epochs_for_ep{}".format(main_df.index[loc])))

    # Rank variance (boxplot), sorted by the final rank
    boxplot(sorted(df_rank.values.T, key=lambda d: d[-1]),
            filepath=os.path.join(analysis_dir, "rank_boxplot_along_epochs_sorted_final_rank"),
            inverse_y=True)

    gt_order = np.argsort(-gt)

    # Group info
    np.savetxt(os.path.join(analysis_dir, "group_info_sorted_gt.txt"),
               main_grouping[gt_order], "%d")

    # Rank variance (boxplot), sorted by ground truth
    boxplot([df_rank[i] for i in gt_order if i in df_rank.columns], inverse_y=True,
            filepath=os.path.join(analysis_dir, "rank_boxplot_along_epochs_sorted_gt_rank"))
    boxplot([df_rank[i][-10:] for i in gt_order if i in df_rank.columns], inverse_y=True,
            filepath=os.path.join(analysis_dir, "rank_boxplot_along_epochs_sorted_gt_rank_last_10"))

    # Tau every epoch
    gt_tau_data = get_tau_along_epochs(main_df, gt, main.columns)
    report_mean_std_max_min(analysis_dir, logger, "GT-Tau-In-Window", gt_tau_data)
    lineplot([stack_with_index(main_df.index, gt_tau_data)],
             filepath=os.path.join(analysis_dir, "tau_curve_along_epochs"))

    if finetune:
        # Finetune curves
        for data in finetune:
            try:
                finetune_step = data.finetune_step
                if by == "epochs":
                    finetune_step //= 196
                half_length = len(main_df.loc[main_df.index <= finetune_step])
                finetune_df = data.validation_acc_dataframe(by, cutoff=finetune_step).iloc[:half_length]
                if finetune_step < min(main_df.index) - 1 or finetune_step > max(main_df.index) + 1:
                    continue
                finetune_df.index += finetune_step
                finetune_curves = [np.array([[finetune_step, main_df.loc[finetune_step, i]]] +
                                            list(zip(finetune_df.index, finetune_df[i])))
                                   for i in main_df.columns]
                finetune_tau_curve = get_tau_along_epochs(finetune_df, gt, data.columns)
                finetune_colors = [gt_rank_color[i] for i in finetune_df.columns]
                logger.info("Finetune step {}, found {} finetune curves".format(
                    finetune_step, len(finetune_curves)))
                lineplot([c[:half_length] for c in acc_curves] + finetune_curves,
                         filepath=os.path.join(analysis_dir,
                                               "acc_curve_along_epochs_finetune_{}".format(finetune_step)),
                         color=[gt_rank_color[i] for i in main_df.columns] + finetune_colors,
                         alpha=0.7,
                         fmt=["-"] * len(acc_curves) + [":"] * len(finetune_curves))
                lineplot([stack_with_index(main_df.index, gt_tau_data)[:half_length],
                          np.concatenate((np.array([[finetune_step, gt_tau_data[half_length - 1]]]),
                                          stack_with_index(finetune_df.index, finetune_tau_curve)))],
                         filepath=os.path.join(analysis_dir,
                                               "tau_curve_along_epochs_finetune_{}".format(finetune_step)),
                         color=["tab:blue", "tab:blue"], alpha=1, fmt=["-", ":"])
            except ValueError:
                pass

    # Tau every epoch, grouped by groups
    grouping_info_backup = main.grouping_info.copy()
    divide_group = main.group_number == 1 and len(main.columns) == 64
    for partition_file in [None] + list(os.listdir("assets")):
        suffix = ""
        if partition_file is not None:
            if not partition_file.startswith("partition"):
                continue
            if not divide_group:
                continue
            suffix = "_" + os.path.splitext(partition_file)[0]
            # regrouping
            main.grouping_info = {idx: g for idx, g in enumerate(
                np.loadtxt(os.path.join("assets", partition_file), dtype=int))}

        tau_curves_by_groups = get_tau_curves_by_groups(main_df, gt, main.grouping_numpy, main.groups)
        tau_curves_by_groups_mean = [np.mean(tau_curves_by_groups[cur]) for cur in main.groups]
        tau_curves_by_groups_std = [np.std(tau_curves_by_groups[cur]) for cur in main.groups]
        report_mean_std_max_min(analysis_dir, logger, "GT-Tau-By-Groups-Mean{}".format(suffix),
                                np.array(tau_curves_by_groups_mean))
        report_mean_std_max_min(analysis_dir, logger, "GT-Tau-By-Groups-Std{}".format(suffix),
                                np.array(tau_curves_by_groups_std))
        tau_curves_by_groups_for_plt = [stack_with_index(main_df.index, tau_curves_by_groups[cur])
                                        for cur in main.groups]

        pd.DataFrame(tau_curves_by_groups, columns=main.groups, index=main_df.index).to_csv(
            os.path.join(analysis_dir, "tau_curves_by_groups{}.csv".format(suffix)))
        lineplot(tau_curves_by_groups_for_plt,
                 filepath=os.path.join(analysis_dir, "tau_curves_by_groups{}".format(suffix)))

        # Acc curves (by group)
        with MultiPageContext(os.path.join(
                analysis_dir, "acc_curve_along_epochs_group_each{}".format(suffix))) as pdf:
            for g in range(main.group_number):
                subgraphs = np.where(main.grouping_numpy == g)[0]
                gt_rank_group = [gt_rank_color[i] for i in subgraphs]
                subgraph_names = list(map(convert_subgraph_index_to_label, subgraphs))
                subgraph_names_ranks = ["{} (Rank {})".format(name, gt_rank[i])
                                        for name, i in zip(subgraph_names, subgraphs)]
                # cannot leverage acc_curves, because it's a list; this can be
                # a subset, which cannot be used as index
                lineplot([np.array(list(zip(main_df.index, main_df[i]))) for i in subgraphs] +
                         [stack_with_index(main_df.index, [gt[i]] * len(main_df.index))
                          for i in subgraphs],
                         context=pdf, color=gt_rank_group * 2, alpha=0.8,
                         labels=subgraph_names_ranks,
                         fmt=["-D"] * len(subgraphs) + ["--"] * len(subgraphs),
                         markers=[subgraph_markers[i] for i in subgraphs] + [[]] * len(subgraphs),
                         title="Group {}, Subgraph {} -- {}".format(
                             g, "/".join(map(str, subgraphs)), "/".join(subgraph_names)))

    main.grouping_info = grouping_info_backup

    # Tau among steps
    for k in (10, 64):
        max_tau_calc = min(k, len(main_df))
        tau_correlation = np.zeros((max_tau_calc, max_tau_calc))
        for i in range(max_tau_calc):
            for j in range(max_tau_calc):
                tau_correlation[i][j] = stats.kendalltau(main_df.iloc[-i - 1],
                                                         main_df.iloc[-j - 1])[0]
        heatmap(tau_correlation,
                filepath=os.path.join(analysis_dir, "tau_correlation_last_{}".format(k)))
        np.savetxt(os.path.join(analysis_dir, "tau_correlation_last_{}.txt".format(k)),
                   tau_correlation)
        tau_correlation = tau_correlation[np.triu_indices_from(tau_correlation, k=1)]
        report_mean_std_max_min(analysis_dir, logger, "Tau-as-Corr-Last-{}".format(k),
                                tau_correlation)

    # Calculate best tau and log
    ref_gt_acc, ref_gt_acc_tau = get_tau_along_epochs_combining_best_groups(
        main_df, gt, main_grouping, main.groups, main.columns)
    pd.DataFrame(ref_gt_acc).to_csv(
        os.path.join(analysis_dir, "acc_epochs_combining_different_epochs_sorted_gt.csv"))
    lineplot([stack_with_index(np.arange(len(ref_gt_acc_tau)), ref_gt_acc_tau)],
             filepath=os.path.join(analysis_dir,
                                   "tau_curve_epochs_sorted_combining_different_epochs"))

    # Show subgraph for each batch
    scatterplot([stack_with_index(main_archit.index, main_archit[col])
                 for col in main_archit.columns],
                filepath=os.path.join(analysis_dir, "subgraph_id_for_each_batch_validated"))

    # Substituted with ground truth rank
    scatterplot([stack_with_index(main_archit.index, gt_rank[main_archit[col]])
                 for col in main_archit.columns],
                filepath=os.path.join(analysis_dir, "subgraph_rank_for_each_batch_validated"),
                inverse_y=True)

    # Top-K-Rank
    top_acc, top_rank = get_top_k_acc_rank(main_df.values, gt)
    plot_top_k_variance_chart(os.path.join(analysis_dir, "top_k_along_epochs"),
                              main_df.index, top_acc, top_rank, gt, (1, 3))

    # Observe last window (for diff. epochs)
    for k in (10, 64):
        report_mean_std_max_min(analysis_dir, logger, "GT-Tau-In-Window-Last-{}".format(k),
                                gt_tau_data[-k:])
        for v in (1, 3):
            report_mean_std_max_min(analysis_dir, logger, "Top-{}-Rank-Last-{}".format(v, k),
                                    top_rank[-k:, v - 1])
df.drop(["estimation"], axis=1, inplace=True) # aux = df.diff() # apply diff method to compute corr over non-stationary # df=aux f, (ax1,ax2) = plt.subplots(2,1, figsize=(6, 12), sharex=False) corr = df.corr() # Pearson corrdiff = df.diff().corr() # corr = df.corr(method='kendall') # corr = df.corr(method='spearman') mask = np.zeros_like(corr) mask[np.triu_indices_from(mask,k=1)] = True #k=1 p/ mostrar diagonal with sns.axes_style("white"): # ax = sns.heatmap(corr, mask=mask, vmax=.3, square=True) sns.heatmap( corr, mask=mask, vmin=-1, vmax=1, center=0, cmap=sns.diverging_palette(20, 220, n=200), square=True, annot=True, fmt = '.2f', linewidths=.5, ax=ax1 )
def scores(key, paths, config, as_dataframe=False, algo_idx=None):
    # print(key, paths)
    # key = 'enettv_0.1_0.5_0.1'
    # paths = ['5cv/cv00/refit/enetgn_0.1_0.9_0.1', '5cv/cv01/refit/enetgn_0.1_0.9_0.1',
    #          '5cv/cv02/refit/enetgn_0.1_0.9_0.1', '5cv/cv03/refit/enetgn_0.1_0.9_0.1',
    #          '5cv/cv04/refit/enetgn_0.1_0.9_0.1']
    key_parts = key.split("_")
    algo = key_parts[algo_idx] if algo_idx is not None else None
    key_parts.remove(algo)
    if len(key_parts) > 0:
        try:
            params = [float(p) for p in key_parts]
        except:
            params = [None, None, None]
    print(algo, params)
    if (len(paths) != NFOLDS_INNER) or (len(paths) != NFOLDS_OUTER):
        print("Failed for key %s" % key)
        return None

    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None

    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)

    # balanced accuracy (recall_mean)
    bacc_splits = [recall_score(y_true_splits[f], y_pred_splits[f], average=None).mean()
                   for f in range(len(y_true_splits))]
    auc_splits = [roc_auc_score(y_true_splits[f], prob_pred_splits[f])
                  for f in range(len(y_true_splits))]
    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))

    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1, alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1, alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0], s[0], 0.5, alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5, alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1], s[0] + s[1], p=0.5, alternative='greater')

    for item in values:
        print(item["beta"].shape)

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :] for item in values]).T
    print(betas.shape)

    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
                             for i in range(betas.shape[0])])
        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0]) for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                         (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except:
        dice_bar = fleiss_kappa_stat = 0

    # Proportion of selection within the support across the CV
    support_count = (betas_t != 0).sum(axis=0)
    support_count = support_count[support_count > 0]
    support_prop = support_count / betas_t.shape[0]

    scores = OrderedDict()
    scores['key'] = key
    scores['algo'] = algo
    scores['a'], scores['l1_ratio'], scores['tv_ratio'] = params
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    scores['beta_support_prop_select_mean'] = support_prop.mean()
    scores['beta_support_prop_select_sd'] = support_prop.std()

    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())], columns=list(scores.keys()))

    return scores
def make_symmetric_random(NUM_GENES):
    tmp = np.random.rand(NUM_GENES, NUM_GENES)
    sym = (tmp + tmp.T) / 2
    sym[np.triu_indices_from(sym)] = -sym[np.triu_indices_from(sym)]
    return sym
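# Property check: off the diagonal the result is skew-symmetric, i.e.
# sym[i, j] == -sym[j, i] for i != j (a quick throwaway verification).
import numpy as np

A = make_symmetric_random(5)
off_diag = ~np.eye(5, dtype=bool)
assert np.allclose((A + A.T)[off_diag], 0)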
def build_ts_matric(df_init, win=20, lag=0, columns=list, rename=dict, period='fullyear'):
    #%%
    '''
    period = ['fullyear', 'summer60days', 'pre60days']
    '''
    splits = df_init.index.levels[0]
    dates_full_orig = df_init.loc[0].index
    dates_RV_orig = df_init.loc[0].index[df_init.loc[0]['RV_mask'] == True]

    if columns is None:
        columns = df_init.columns
    df_cols = df_init[columns]

    TrainIsTrue = df_init['TrainIsTrue']
    list_test = []
    for s in range(splits.size):
        TestIsTrue = TrainIsTrue[s] == False
        list_test.append(df_cols.loc[s][TestIsTrue])
    df_test = pd.concat(list_test).sort_index()

    # shift precursor vs. tmax
    for c in df_test.columns[1:]:
        df_test[c] = df_test[c].shift(periods=-lag)

    # bin means
    df_test = df_test.resample(f'{win}D').mean()

    if period == 'fullyear':
        dates_sel = dates_full_orig.strftime('%Y-%m-%d')
    elif period == 'summer60days':
        dates_sel = dates_RV_orig.strftime('%Y-%m-%d')
    elif period == 'pre60days':
        dates_sel = (dates_RV_orig - pd.Timedelta(60, unit='d')).strftime('%Y-%m-%d')

    # after resampling, not all dates are still present:
    dates_sel = pd.to_datetime([d for d in dates_sel if d in df_test.index])
    df_period = df_test.loc[dates_sel, :].dropna()

    if rename is not None:
        df_period = df_period.rename(rename, axis=1)

    corr, sig_mask, pvals = corr_matrix_pval(df_period, alpha=0.01)

    # Generate a mask for the upper triangle
    mask_tri = np.zeros_like(corr, dtype=bool)
    mask_tri[np.triu_indices_from(mask_tri)] = True
    mask_sig = mask_tri.copy()
    mask_sig[sig_mask == False] = True

    # removing meaningless row and column
    cols = corr.columns
    corr = corr.drop(cols[0], axis=0).drop(cols[-1], axis=1)
    mask_sig = mask_sig[1:, :-1]
    mask_tri = mask_tri[1:, :-1]

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(10, 10))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, n=9, l=30, as_cmap=True)

    ax = sns.heatmap(corr, ax=ax, mask=mask_tri, cmap=cmap, vmax=1E99,
                     center=0, square=True, linewidths=.5, annot=False,
                     annot_kws={'size': 30}, cbar=False)
    sig_bold_labels = sig_bold_annot(corr, mask_sig)

    # Draw the heatmap with the mask and correct aspect ratio
    ax = sns.heatmap(corr, ax=ax, mask=mask_tri, cmap=cmap, vmax=1, center=0,
                     square=True, linewidths=.5, cbar_kws={"shrink": .8},
                     annot=sig_bold_labels, annot_kws={'size': 30},
                     cbar=False, fmt='s')

    ax.tick_params(axis='both', labelsize=15,
                   bottom=True, top=False, left=True, right=False,
                   labelbottom=True, labeltop=False, labelleft=True, labelright=False)
    ax.set_xticklabels(corr.columns, fontdict={'fontweight': 'bold', 'fontsize': 25})
    ax.set_yticklabels(corr.index, fontdict={'fontweight': 'bold', 'fontsize': 25},
                       rotation=0)
    #%%
    return
if filename:
    with open(filename[0]) as result_file:
        first_line = result_file.readline()
        test_size = int(re.search(r"Size of test set: (\d+)", first_line).group(1))
        if t == "total":
            stable_core[classifier][algorithm][k] = int(
                re.search(r"Stable Core: (\d+)", first_line).group(1)) / test_size
        array = np.loadtxt(result_file, dtype=int, delimiter=",")
        array = 1.0 - (array[np.triu_indices_from(array, 1)] / test_size)
        overlap[t][classifier][algorithm][k] = np.mean(array)
        overlap_std[t][classifier][algorithm][k] = np.std(array)
else:
    print("Missing overlap results for", t, algorithm, classifier, dataset)

all_datapoints = np.empty(0)
all_cores = []
count = 0
for filename in glob(
        f"{basedir}/compare_embedding_errors/{t}*/self_{classifier}*{algorithm}*{dataset}*.txt"):
def corr(data,
         corr_method='spearman',
         annot=False,
         mask=True,
         line_width=1,
         line_color='black',
         color_grades=5,
         auto_sizing=True,
         palette='default',
         style='astetik',
         dpi=72,
         title='',
         sub_title='',
         x_label='',
         y_label='',
         legend=True,
         x_scale='linear',
         y_scale='linear',
         x_limit=None,
         y_limit=None,
         save=False):

    '''CORRELATION HEATMAP

    This is best used with less than 50 variables in the dataset.
    For best results, column labels should be clear and not too long.

    Inputs: a dataframe with several columns
    Features: Both categorical and continuous features will be used

    1. USE
    ======
    ast.compare(data=patients,
                x='hospital_stays',
                y=['died_hospital', 'died_out'],
                label_col='religion',
                transform=True)

    1. USE
    ======
    ast.box(data=patients, x='insurance', y='age', hue='expired')

    2. PARAMETERS
    =============
    2.1 INPUT PARAMETERS
    --------------------
    data :: a pandas dataframe

    --------------------
    2.2. PLOT PARAMETERS
    --------------------
    corr_method :: The method that will be used for the correlation:
                   - 'pearson'  : standard correlation coefficient
                   - 'kendall'  : Kendall Tau correlation coefficient
                   - 'spearman' : Spearman rank correlation

    annot :: True if each cell will be annotated with the value

    mask :: If set to False, a rectangular shape will be drawn instead
            of a triangular shape.

    line_width :: the width of the white lines between each element.
                  Better to set small when there are really many items.

    line_color :: the color of the lines between the elements e.g. 'black'

    auto_sizing :: If not True, then should be an int value in inches
                   which is used for both width and height.

    color_grades :: The number of colors/shades to use in total. 5 is
                    default. Generally the best results come with 3 or 5
                    or 7, but looks better with more colors.

    ----------------------
    2.3. COMMON PARAMETERS
    ----------------------
    palette :: One of the astetik palettes:
               'default', 'colorblind', 'blue_to_red', 'blue_to_green',
               'red_to_green', 'green_to_red', 'violet_to_blue',
               'brown_to_green', 'green_to_marine'

               Or use any cmap, seaborn or matplotlib color or palette
               code, or hex value.

    style :: Use one of the three core styles:
             'astetik'    # white
             '538'        # grey
             'solarized'  # sepia

             Or alternatively use any matplotlib or seaborn style
             definition.

    dpi :: the resolution of the plot (int value)
    title :: the title of the plot (string value)
    sub_title :: a secondary title to be shown below the title
    x_label :: string value for x-axis label
    y_label :: string value for y-axis label
    x_scale :: 'linear' or 'log' or 'symlog'
    y_scale :: 'linear' or 'log' or 'symlog'
    x_limit :: int or list with two ints
    y_limit :: int or list with two ints
    outliers :: Remove outliers using either 'zscore' or 'iqr'
    '''

    # # # # # PREP STARTS # # # # #
    data = data.corr(method=corr_method)

    if mask == True:
        mask = np.zeros_like(data)
        mask[np.triu_indices_from(mask)] = True
        line_color = 'white'
    else:
        mask = None
    # # # # # PREP ENDS # # # # #

    # HEADER STARTS >>>
    palette = _header(palette, style, n_colors=color_grades, dpi=dpi)

    if auto_sizing == True:
        size = data.shape[0] / 2 + 5
    else:
        # docstring: a non-True auto_sizing is an int size in inches
        # (the original left `size` undefined on this path)
        size = auto_sizing

    # PLOT
    p, ax = plt.subplots(figsize=(size, size))
    p = sns.heatmap(data, mask=mask, linewidths=line_width,
                    linecolor=line_color, cmap=palette, annot=annot)

    # HEADER
    _titles(title, sub_title=sub_title)
    _footer(p, x_label, y_label, save=save, tight=False, despine=False)

    p.set_xticklabels(data, rotation=90)
    p.set_yticklabels(data, rotation=0)
def feature_corr(Ord, ftlist, taglist, n_class, cross='feature', corr='dtw'): ftlist = ftlist.T[Ord].apply(pd.Series).T ftrs = {'xa':[], 'ya':[], 'za':[], 'xw':[], 'yw':[], 'zw':[]} for i, k in enumerate(ftrs.keys()): ftrs[k] = ftlist.iloc[i] # only do one axis (zw) for the sake of analysis across activities tag_class, feature = {a:[] for a in n_class}, {a:[] for a in n_class} for k, v in taglist.items(): tag_class[v[Ord]] += [k] target_df = ftrs['zw'] feature_no = len(target_df.iloc[0]) for act, nums in tag_class.items(): for no in nums: feature[act].append(target_df.loc[no]) feature[act] = np.array(feature[act]) # corr across acts for each feature # feature_across_acts = [Mat]: length is the number of features # Mat is m*m (m = number of activities) feature_across_acts = [] for i in range(feature_no): Mat = [] for act1, mat1 in feature.items(): mat = [] for act2, mat2 in feature.items(): ft_1 = mat1[:, i] ft_2 = mat2[:, i] corrf = corrolate(ft_1, ft_2, corr=corr) mat.append(corrf) Mat.append(mat) feature_across_acts.append(np.array(Mat)) feature_across_feature = [] # mat is the sliding window series of features under a specific act for act, mat in feature.items(): M = [] for i in range(feature_no): m = [] for j in range(feature_no): ft_1 = mat[:, i] ft_2 = mat[:, j] corrf = corrolate(ft_1, ft_2, corr=corr) m.append(corrf) M.append(m) feature_across_feature.append(np.array(M)) # print(feature_across_feature[0]) n = [i for i in range(feature_no)] # plot the heatmap of any individual feature if cross == 'feature': heat = feature_across_feature[0] else: heat = feature_across_acts[0] label = n_class if (heat is feature_across_acts[0]) else n sns.set_theme(style="white") mask = np.zeros_like(heat) mask[np.triu_indices_from(mask)] = True sns.heatmap(heat, annot=True, xticklabels=label, yticklabels=label, mask=mask)
def pipeline_for_inter_instance(logger, analysis_dir, data, by, gt): logger.info("Analysing results for {}".format(analysis_dir)) data_as_df = [d.validation_acc_dataframe(by) for d in data] os.makedirs(analysis_dir, exist_ok=True) subgraphs = data[0].columns for d in data: assert list(d.columns) == list(subgraphs) final_acc = np.zeros((len(data), len(subgraphs))) for i, df in enumerate(data_as_df): final_acc[i] = df.iloc[-1] # Consecutive tau (multi) lineplot([ np.array(list(zip(df.index[1:], get_consecutive_rank_tau(df)))) for df in data_as_df ], filepath=os.path.join(analysis_dir, "taus_consecutive_epochs")) # Final acc distribution boxplot(final_acc, filepath=os.path.join(analysis_dir, "final_acc")) # Final rank distribution final_rank = np.stack([rankdata_greater(row) for row in final_acc]) boxplot(final_rank, filepath=os.path.join(analysis_dir, "final_rank_boxplot"), inverse_y=True) # GT-Tau gt_tau = np.array( [stats.kendalltau(row, gt[subgraphs])[0] for row in final_acc]) np.savetxt(os.path.join(analysis_dir, "inst_gt_tau.txt"), gt_tau) report_mean_std_max_min(analysis_dir, logger, "GT-Tau", gt_tau) # Tau every epoch tau_data = [get_tau_along_epochs(df, gt, subgraphs) for df in data_as_df] tau_data_mean_over_instances = np.mean(np.stack(tau_data, axis=0), axis=0) report_mean_std_max_min(analysis_dir, logger, "GT-Tau-In-Window", np.concatenate(tau_data)) tau_curves = [ stack_with_index(df.index, tau_d) for df, tau_d in zip(data_as_df, tau_data) ] lineplot(tau_curves, filepath=os.path.join(analysis_dir, "tau_curve_along_epochs")) for k in (10, 64): tau_data_clip = [t[-k:] for t in tau_data] report_mean_std_max_min(analysis_dir, logger, "GT-Tau-In-Window-Last-{}-Mean".format(k), np.array([np.mean(t) for t in tau_data_clip])) report_mean_std_max_min(analysis_dir, logger, "GT-Tau-In-Window-Last-{}-Std".format(k), np.array([np.std(t) for t in tau_data_clip])) report_mean_std_max_min(analysis_dir, logger, "GT-Tau-In-Window-Last-{}-Max".format(k), np.array([np.max(t) for t in tau_data_clip])) report_mean_std_max_min(analysis_dir, logger, "GT-Tau-In-Window-Last-{}-Min".format(k), np.array([np.min(t) for t in tau_data_clip])) acc_data = [np.mean(df.iloc[-k:].values, axis=0) for df in data_as_df] report_mean_std_max_min(analysis_dir, logger, "Acc-Mean-In-Window-Last-{}-Mean".format(k), np.array([np.mean(x) for x in acc_data])) report_mean_std_max_min(analysis_dir, logger, "Acc-Mean-In-Window-Last-{}-Std".format(k), np.array([np.std(x) for x in acc_data])) # S-Tau (pairwise Kendall tau between instances, computed at every epoch) s_tau = np.zeros((min(map(lambda d: len(d), data_as_df)), len(data), len(data))) for k in range(len(s_tau)): for i, table1 in enumerate(data_as_df): for j, table2 in enumerate(data_as_df): s_tau[k][i][j], _ = stats.kendalltau(table1.iloc[k], table2.iloc[k]) np.savetxt(os.path.join(analysis_dir, "inter_inst_s_tau.txt"), s_tau[-1]) heatmap(s_tau[-1], filepath=os.path.join(analysis_dir, "inter_inst_last_s_tau_heatmap"), figsize=(10, 10)) if len(data) > 1: upper = np.triu_indices_from(s_tau[0], k=1) report_mean_std_max_min(analysis_dir, logger, "S-Tau-Last", s_tau[-1][upper]) s_tau_mean = np.mean(s_tau[:, upper[0], upper[1]], axis=1) s_tau_std = np.std(s_tau[:, upper[0], upper[1]], axis=1) report_mean_std_max_min(analysis_dir, logger, "S-Tau-Min", s_tau[np.argmin(s_tau_mean)][upper]) s_tau_errorbar = np.stack( [np.arange(len(s_tau)), s_tau_mean, s_tau_std], axis=1) errorbar([s_tau_errorbar], filepath=os.path.join(analysis_dir, "inter_inst_s_tau_curve")) # S-Tau (without variance) lineplot([s_tau_errorbar[:, :2]], fmt=["-o"], 
filepath=os.path.join( analysis_dir, "inter_inst_s_tau_curve_along_epochs_without_var")) # Compare with GT-Tau lineplot(tau_curves + [s_tau_errorbar], fmt=["-"] * len(tau_curves) + [":"], filepath=os.path.join( analysis_dir, "tau_curve_along_epochs_compare_to_s_tau")) lineplot([ np.stack([ np.arange(len(tau_data_mean_over_instances)), tau_data_mean_over_instances ], axis=1) ] + [s_tau_errorbar], fmt=["-", ":"], filepath=os.path.join( analysis_dir, "tau_curve_along_epochs_mean_compare_to_s_tau")) # Final rank dist (sorted by GT) gt_rank = sorted(np.arange(len(subgraphs)), key=lambda i: gt[subgraphs[i]], reverse=True) final_rank_resorted = final_rank[:, gt_rank] boxplot(final_rank_resorted, filepath=os.path.join(analysis_dir, "final_rank_boxplot_sorted_gt"), inverse_y=True) # Tau sorted ref_gt_acc_taus = [] for df, raw in zip(data_as_df, data): _, ref_gt_acc_tau = get_tau_along_epochs_combining_best_groups( df, gt, raw.grouping_numpy, raw.groups, subgraphs) ref_gt_acc_taus.append( stack_with_index(np.arange(len(ref_gt_acc_tau)), ref_gt_acc_tau)) lineplot(ref_gt_acc_taus, filepath=os.path.join( analysis_dir, "tau_curves_sorted_combining_different_epochs")) # Top-K-Rank top_acc, top_rank = get_top_k_acc_rank(final_acc, gt) topk = (1, 3) for k in topk: report_mean_std_max_min(analysis_dir, logger, "Top-{}-Acc".format(k), top_acc[:, k - 1]) report_mean_std_max_min(analysis_dir, logger, "Top-{}-Rank".format(k), top_rank[:, k - 1]) plot_top_k_variance_chart(os.path.join(analysis_dir, "inst_top_k"), np.arange(len(top_acc)), top_acc, top_rank, gt, topk) # Average final acc avg_acc = np.mean(final_acc, axis=0) np.savetxt(os.path.join(analysis_dir, "average_final_acc.txt"), avg_acc) std_acc = np.std(final_acc, axis=0) np.savetxt(os.path.join(analysis_dir, "std_final_acc.txt"), std_acc)
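# Standalone sketch of the S-Tau statistic used above (toy accuracies):
# Kendall's tau between two instances' per-subgraph accuracies at one epoch.
import numpy as np
from scipy import stats
acc_a = np.array([0.71, 0.69, 0.75, 0.80])
acc_b = np.array([0.70, 0.72, 0.74, 0.81])
tau, _ = stats.kendalltau(acc_a, acc_b)
print(tau)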
def FeatureAnalysis(self, issave=True, isfigshow=False): """Feature analysis such as illustrations of the monitored costs, correlations and comparison of system and meter measurements.""" print('monitored cost dataset analysis...') label_fontsize = FIGURE_LBL_FONTSIZE_MAIN dataset = self.dataset.copy() dataset.sort_values(by=['Number of Houses', 'Number of Processes'], ascending=True, inplace=True) dataset["ProcTime (" + r'$\Delta$' + ")" + units['ProcTime_FromMeter']] = dataset[ 'ProcTime_FromMeter'] - dataset['ProcTime_FromSystem'] dataset["ProcTime (" + r'$\Delta$' + ") (%)"] = 100 * dataset["ProcTime (" + r'$\Delta$' + ")" + units['ProcTime_FromMeter']].divide( dataset["ProcTime_FromMeter"], axis="index").copy() if "ProcTime_FromSystem" in self.targets: self.targets.remove("ProcTime_FromSystem") # Correlation analysis label_vars = self.features[:] label_vars.extend(self.targets) train_data = dataset[label_vars] cm = train_data.corr() mask = np.zeros_like(cm, dtype=bool) mask[np.triu_indices_from(mask)] = True fig, ax = plt.subplots(figsize=(6, 6)) fmt = '.2f' label_vars = [ var + ' (target)' if var in self.targets else var + ' (feature)' for var in label_vars ] annot_kws = {"size": label_fontsize + 2, "ha": 'center', "va": 'top'} sns.heatmap(cm, annot=True, annot_kws=annot_kws, fmt=fmt, square=True, cmap='coolwarm', mask=mask, ax=ax, linewidths=0.1) plt.xticks(rotation=60, fontsize=label_fontsize) plt.yticks(rotation=0, fontsize=label_fontsize) ax.set_xticklabels(label_vars, fontsize=label_fontsize) ax.set_yticklabels(label_vars, fontsize=label_fontsize) ax.set_xticks(np.arange(0, len(label_vars), 1)) ax.set_yticks(np.arange(0.5, len(label_vars), 1)) ax.tick_params(axis='both', which='major', labelsize=label_fontsize) filename_plot = "{}/correlation_analysis".format(self.figure_path) util.SaveFigure(filename_plot, fig, isshow=isfigshow, issave=issave) # %% Feature analysis dataset['Number of Houses'] = dataset['Number of Houses'].astype('int') dataset['Number of Processes'] = dataset['Number of Processes'].astype( 'int') y_vars = [ "ProcTime (" + r'$\Delta$' + ")" + units['ProcTime_FromMeter'], "ProcTime (" + r'$\Delta$' + ") (%)" ] x_var = 'Number of Processes' # print(dataset[y_vars + [x_var]].groupby([x_var]).describe()) for i, y_var in enumerate(y_vars): fig, ax = plt.subplots() fig = sns.catplot(x=x_var, y=y_var, kind="box", data=dataset, **kws_box_2) plt.xlabel(x_var, fontsize=label_fontsize) plt.ylabel(y_var, fontsize=label_fontsize) y_var2 = y_var if '%' in y_var: y_var2 = y_var.replace("(%)", "_perc") y_var2 = y_var2.replace("$", "").replace("\\", "").replace( "(", "").replace(")", "") plt.tick_params(axis='both', which='major', labelsize=label_fontsize) X, Y = dataset[x_var].values, dataset[y_var].values util.PlotGridSpacing(X, Y, x_gridno=8, y_gridno=6) filename_plot = "{}/{}_vs_{}".format(self.figure_path, x_var, y_var2) util.SaveFigure(filename_plot, fig, isshow=isfigshow, issave=issave) y_vars = self.targets[:] x_var = 'Number of Processes' hue = 'Number of Houses' for i, y_var in enumerate(y_vars): fig, ax = plt.subplots() fig = sns.catplot(kind="point", x=x_var, y=y_var, data=dataset, hue=hue, linestyles=linestyles, markers=filled_markers, markersize=10, legend=False, **kws_online_2) # plt.legend(fontsize=label_fontsize - 1, frameon=True, framealpha=0.5, title=hue, ncol=2, loc='upper right', bbox_to_anchor=(1, 0.99)) plt.xlabel(x_var, fontsize=label_fontsize) plt.ylabel(y_var + units[y_var], fontsize=label_fontsize) plt.tick_params(axis='both', which='major', 
labelsize=label_fontsize) X, Y = dataset[x_var].values, dataset[y_var].values util.PlotGridSpacing(X, Y, x_gridno=8, y_gridno=6) filename_plot = "{}/{}_vs_{}".format(self.figure_path, x_var, y_var) util.SaveFigure(filename_plot, fig, isshow=isfigshow, issave=issave) dataset[y_var + "_puh"] = dataset[y_var].divide( dataset["Number of Houses"], axis="index").copy() fig = sns.catplot(kind="point", x=x_var, y=y_var + "_puh", data=dataset, hue=hue, linestyles=linestyles, markers=filled_markers, markersize=10, legend=False, **kws_online_2) plt.xlabel(x_var, fontsize=label_fontsize) plt.ylabel(y_var + units[y_var], fontsize=label_fontsize) plt.tick_params(axis='both', which='major', labelsize=label_fontsize) plt.legend(fontsize=label_fontsize - 3, frameon=True, framealpha=0.5, title=hue, ncol=2, loc='upper right') X, Y = dataset[x_var].values, dataset[y_var + "_puh"].values util.PlotGridSpacing(X, Y, x_gridno=8, y_gridno=6) filename_plot = "{}/{}_vs_{}_puh".format(self.figure_path, x_var, y_var) util.SaveFigure(filename_plot, fig, isshow=isfigshow, issave=issave) # print(dataset[[y_var +"_puh" for y_var in y_vars] + [x_var]].groupby([x_var]).describe()) y_vars = [ 'house_energy_mean_kwh', "house_energy_median_kwh", 'house_energy_max_kwh' ] x_var = 'Number of Processes' hue = 'Number of Houses' for i, y_var in enumerate(y_vars): fig = sns.catplot(kind="point", x=x_var, y=y_var, data=dataset, hue=hue, linestyles=linestyles, markers=filled_markers, markersize=10, legend=False, **kws_online_2) plt.xlabel(x_var, fontsize=label_fontsize) plt.ylabel(y_var, fontsize=label_fontsize) plt.tick_params(axis='both', which='major', labelsize=label_fontsize) plt.legend(fontsize=label_fontsize - 3, frameon=True, framealpha=0.5, title=hue, ncol=4, bbox_to_anchor=(1., 1 + 0.21 * 2)) X, Y = dataset[x_var].values, dataset[y_var].values util.PlotGridSpacing(X, Y, x_gridno=8, y_gridno=6) filename_plot = "{}/{}_vs_{}".format(self.figure_path, x_var, y_var) util.SaveFigure(filename_plot, fig, isshow=isfigshow, issave=issave) # print(dataset[y_vars + [x_var]].groupby([x_var]).describe()) y_vars = ['house_energy_total_kwh'] x_var = 'Number of Processes' hue = 'Number of Houses' for i, y_var in enumerate(y_vars): dataset[y_var + "_puh"] = dataset[y_var].divide( dataset["Number of Houses"], axis="index").copy() fig = sns.catplot(kind="point", x=x_var, y=y_var + "_puh", data=dataset, hue=hue, linestyles=linestyles, markers=filled_markers, markersize=10, legend=False, **kws_online_2) plt.xlabel(x_var, fontsize=label_fontsize) plt.ylabel(y_var, fontsize=label_fontsize) plt.tick_params(axis='both', which='major', labelsize=label_fontsize) plt.legend(fontsize=label_fontsize - 3, frameon=True, framealpha=0.5, title=hue, ncol=4, bbox_to_anchor=(1., 1 + 0.21 * 2)) X, Y = dataset[x_var].values, dataset[y_var + "_puh"].values util.PlotGridSpacing(X, Y, x_gridno=8, y_gridno=6) filename_plot = "{}/{}_vs_{}_puh".format(self.figure_path, x_var, y_var) util.SaveFigure(filename_plot, fig, isshow=isfigshow, issave=issave)
def confounds_correlation_plot(confounds_file, output_file=None, figure=None, reference='global_signal', max_dim=70): """ Parameters ---------- confounds_file: str File containing all confound regressors to be included in the correlation plot. output_file: str or None Path where the output figure should be saved. If this is not defined, then the plotting axes will be returned instead of the saved figure path. figure: figure or None Existing figure on which to plot. reference: str `confounds_correlation_plot` prepares a bar plot of the correlations of each confound regressor with a reference column. By default, this is the global signal (so that collinearities with the global signal can readily be assessed). max_dim: int The maximum number of regressors to be included in the output plot. Reductions (e.g., CompCor) of high-dimensional data can yield so many regressors that the correlation structure becomes obfuscated. This criterion selects the `max_dim` regressors that have the largest correlation magnitude with `reference` for inclusion in the plot. Returns ------- axes and gridspec Plotting axes and gridspec. Returned only if `output_file` is None. output_file: str The file where the figure is saved. """ confounds_data = pd.read_table(confounds_file) confounds_data = confounds_data.loc[:, np.logical_not( np.isclose( confounds_data.var( skipna=True), 0))] corr = confounds_data.corr() np.fill_diagonal(corr.values, 0) gscorr = corr.copy() gscorr['index'] = gscorr.index gscorr[reference] = np.abs(gscorr[reference]) gs_descending = gscorr.sort_values(by=reference, ascending=False)['index'] if corr.shape[0] > max_dim: gs_descending = gs_descending[:max_dim] features = [p for p in corr.columns if p in gs_descending] corr = corr.loc[features, features] n_vars = corr.shape[0] if figure is None: plt.figure(figsize=(3 * n_vars * 0.3, n_vars * 0.3)) gs = mgs.GridSpec(1, 15) ax0 = plt.subplot(gs[0, :7]) ax1 = plt.subplot(gs[0, 7:]) mask = np.zeros_like(corr, dtype=bool) mask[np.triu_indices_from(mask)] = True sns.heatmap(corr, mask=mask, linewidths=0.5, cmap='coolwarm', center=0, square=True, ax=ax0) ax0.tick_params(axis='both', which='both', width=0) for tick in ax0.xaxis.get_major_ticks(): tick.label.set_fontsize('small') for tick in ax0.yaxis.get_major_ticks(): tick.label.set_fontsize('small') sns.barplot(data=gscorr, x='index', y=reference, ax=ax1, order=gs_descending, palette='Reds_d', saturation=.5) ax1.set_xlabel('Confound time series') ax1.set_ylabel('Magnitude of correlation with {}'.format(reference)) ax1.tick_params(axis='x', which='both', width=0) ax1.tick_params(axis='y', which='both', width=5, length=5) for tick in ax1.xaxis.get_major_ticks(): tick.label.set_fontsize('small') tick.label.set_rotation('vertical') for tick in ax1.yaxis.get_major_ticks(): tick.label.set_fontsize('small') for side in ['top', 'right', 'left']: ax1.spines[side].set_color('none') ax1.spines[side].set_visible(False) if output_file is not None: figure = plt.gcf() figure.savefig(output_file, bbox_inches='tight') plt.close(figure) figure = None return output_file return [ax0, ax1], gs
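# Minimal usage sketch (hypothetical confounds TSV path with a
# 'global_signal' column, as produced by fMRIPrep-style pipelines):
#
#   confounds_correlation_plot('sub-01_task-rest_desc-confounds_timeseries.tsv',
#                              output_file='confounds_corr.svg',
#                              reference='global_signal', max_dim=20)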
def mustache(c, chromosome, chromosome2, res, start, end, mask_size, distance_in_px, octave_values, st, pt): nz = np.logical_and(c != 0, np.triu(c, 4)) if np.sum(nz) < 50: return [] c[np.tril_indices_from(c, 4)] = 2 if chromosome == chromosome2: c[np.triu_indices_from(c, k=(distance_in_px + 1))] = 2 pAll = np.ones_like(c[nz]) * 2 Scales = np.ones_like(pAll) vAll = np.zeros_like(pAll) s = 10 #curr_filter = 1 scales = {} for o in octave_values: scales[o] = {} sigma = o w = 2 * math.ceil(2 * sigma) + 1 t = (((w - 1) / 2) - 0.5) / sigma Gp = gaussian_filter(c, o, truncate=t, order=0) scales[o][1] = sigma sigma = o * 2**((2 - 1) / s) w = 2 * math.ceil(2 * sigma) + 1 t = (((w - 1) / 2) - 0.5) / sigma Gc = gaussian_filter(c, sigma, truncate=t, order=0) scales[o][2] = sigma Lp = Gp - Gc Gp = [] sigma = o * 2**((3 - 1) / s) w = 2 * math.ceil(2 * sigma) + 1 t = (((w - 1) / 2) - 0.5) / sigma Gn = gaussian_filter(c, sigma, truncate=t, order=0) scales[o][3] = sigma #Lp = Gp - Gc Lc = Gc - Gn locMaxP = maximum_filter(Lp, footprint=np.ones((3, 3)), mode='constant') locMaxC = maximum_filter(Lc, footprint=np.ones((3, 3)), mode='constant') for i in range(3, s + 2): #curr_filter += 1 Gc = Gn sigma = o * 2**((i) / s) w = 2 * math.ceil(2 * sigma) + 1 t = ((w - 1) / 2 - 0.5) / sigma Gn = gaussian_filter(c, sigma, truncate=t, order=0) scales[o][i + 1] = sigma Ln = Gc - Gn dist_params = expon.fit(np.abs(Lc[nz])) pval = 1 - expon.cdf(np.abs(Lc[nz]), *dist_params) locMaxN = maximum_filter(Ln, footprint=np.ones((3, 3)), mode='constant') willUpdate = np.logical_and \ .reduce((Lc[nz] > vAll, Lc[nz] == locMaxC[nz], np.logical_or(Lp[nz] == locMaxP[nz], Ln[nz] == locMaxN[nz]), Lc[nz] > locMaxP[nz], Lc[nz] > locMaxN[nz])) vAll[willUpdate] = Lc[nz][willUpdate] Scales[willUpdate] = scales[o][i] pAll[willUpdate] = pval[willUpdate] Lp = Lc Lc = Ln locMaxP = locMaxC locMaxC = locMaxN pFound = pAll != 2 if len(pFound) < 10000: return [] _, pCorrect, _, _ = multipletests(pAll[pFound], method='fdr_bh') pAll[pFound] = pCorrect o = np.ones_like(c) o[nz] = pAll sig_count = np.sum(o < pt) #change x, y = np.unravel_index(np.argsort(o.ravel()), o.shape) so = np.ones_like(c) so[nz] = Scales x = x[:sig_count] y = y[:sig_count] xyScales = so[x, y] nonsparse = x != 0 for i in range(len(xyScales)): s = math.ceil(xyScales[i]) c1 = np.sum(nz[x[i]-s:x[i]+s+1, y[i]-s:y[i]+s+1]) / \ ((2*s+1)**2) s = 2 * s c2 = np.sum(nz[x[i]-s:x[i]+s+1, y[i]-s:y[i]+s+1]) / \ ((2*s+1)**2) if c1 < st or c2 < 0.6: nonsparse[i] = False x = x[nonsparse] y = y[nonsparse] if len(x) == 0: return [] def nz_mean(vals): return np.mean(vals[vals != 0]) def diag_mean(k, map): return nz_mean(map[kth_diag_indices(map, k)]) if chromosome == chromosome2: means = np.vectorize(diag_mean, excluded=['map'])(k=y - x, map=c) passing_indices = c[x, y] > 2 * means #change if len(passing_indices) == 0 or np.sum(passing_indices) == 0: return [] x = x[passing_indices] y = y[passing_indices] label_matrix = np.zeros((np.max(y) + 2, np.max(y) + 2), dtype=np.float32) label_matrix[x, y] = o[x, y] + 1 label_matrix[x + 1, y] = 2 label_matrix[x + 1, y + 1] = 2 label_matrix[x, y + 1] = 2 label_matrix[x - 1, y] = 2 label_matrix[x - 1, y - 1] = 2 label_matrix[x, y - 1] = 2 label_matrix[x + 1, y - 1] = 2 label_matrix[x - 1, y + 1] = 2 num_features = scipy_measurements.label(label_matrix, output=label_matrix, structure=np.ones((3, 3))) out = [] for label in range(1, num_features + 1): indices = np.argwhere(label_matrix == label) i = np.argmin(o[indices[:, 0], indices[:, 1]]) _x, _y = indices[i, 0], 
indices[i, 1] out.append([_x + start, _y + start, o[_x, _y], so[_x, _y]]) return out
def __init__( self, num_environment, num_agents, num_managers, innoise, outnoise, fanout, envnoise, envobsnoise, #statedim, batchsize, optimizer, env_input, env_pattern_input=None, agent_type="sigmoid", agent_order='linear', network_type=None, network_prespecified_input=None, network_update_method=None, dropout_rate=0.0, dropout_type='AllIn', L1_norm=.0, weight_on_cost=0., weight_update=False, initializer_type='zeros', dunbar_number=2, dunbar_function='linear_kth', randomSeed=False, decay=None, tensorboard_filename=None, **kwargs): self.sess = tf.Session() #For Debug self.task_loss_list = [] self.cost_loss_list = [] self.total_loss_list = [] self.num_environment = num_environment self.num_agents = num_agents if num_managers == "AllButOne": self.num_managers = num_agents - 1 else: self.num_managers = num_managers self.agent_order = agent_order self.batchsize = batchsize self.envobsnoise = envobsnoise self.agents = [] for i in range(num_agents): self.agents.append( Agent(innoise, outnoise, i, fanout, batchsize, num_agents, num_environment, dunbar_number, initializer_type=initializer_type)) #, statedim self.env_input = env_input self.env_pattern_input = env_pattern_input with tf.name_scope("Environment"): if env_pattern_input is None: self.environment = tf.random_normal( [self.batchsize, num_environment], mean=0.0, stddev=1.0, dtype=tf.float32) zero = tf.convert_to_tensor(0.0, tf.float32) greater = tf.greater(self.environment, zero, name="Organization_greater") self.environment = tf.where(greater, tf.ones_like(self.environment), tf.zeros_like(self.environment), name="where_env") else: self.environment = tf.placeholder( tf.float32, shape=[self.batchsize, self.num_environment]) self.env_pattern = tf.placeholder(tf.float32, shape=[self.batchsize, 1]) with tf.name_scope('Network_prespecified'): #(num_environment + num_agents) x num_agents binary matrix #Includes environment, but not bias self.network_prespecified = tf.placeholder( tf.float32, shape=[ self.num_environment + self.num_agents, self.num_agents ]) if network_prespecified_input is None: #all the edges are possible temp = np.zeros( [self.num_environment + self.num_agents, self.num_agents]) temp[np.triu_indices_from(temp, k=-self.num_environment)] = 1. 
self.network_prespecified_input = temp #self.network_prespecified_input = np.ones([self.num_environment+self.num_agents,self.num_agents]) else: self.network_prespecified_input = network_prespecified_input self.network_update_method = network_update_method self.dropout_rate = dropout_rate self.dropout_type = dropout_type self.L1_norm = L1_norm if weight_update is False: self.weight_on_cost = tf.constant( weight_on_cost, dtype=tf.float32 ) #the weight on the listening cost in loss function self.weight_on_cost_val = weight_on_cost elif weight_update is True: self.weight_on_cost = tf.get_variable(name="weight_on_cost", dtype=tf.float32, initializer=tf.constant( weight_on_cost, dtype=tf.float32), trainable=False) self.weight_on_cost_val = weight_on_cost self.assign_weight = tf.assign(self.weight_on_cost, self.weight_on_cost_val) self.weight_update = weight_update self.dunbar_number = dunbar_number #Dunbar number self.dunbar_function = dunbar_function self.build_org() with tf.name_scope("Objective"): self.objective_task = self._make_loss_task() self.objective_cost = self._make_loss_cost() self.objective_L1 = self._make_loss_L1() # self.objective = self.loss() self.objective = self.weight_on_cost * self.objective_cost + ( 1 - self.weight_on_cost) * self.objective_task + self.objective_L1 with tf.name_scope("Optimizer"): self.learning_rate = tf.placeholder(tf.float32) #self.optimize =tf.train.AdadeltaOptimizer(self.learning_rate, rho=.9).minimize(self.objective) self.optimize = tf.train.AdamOptimizer( self.learning_rate).minimize(self.objective) #self.optimize =tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.objective) self.start_learning_rate = .1 #15. self.decay = decay #None #.01 if tensorboard_filename is None: self.writer = None else: self.writer = tf.summary.FileWriter(tensorboard_filename, self.sess.graph) self.saver = tf.train.Saver() merged = tf.summary.merge_all() init = tf.global_variables_initializer() self.sess.run(init)
data_df.plot(kind='scatter', x='CRIM', y='PRICE', ax=axs[0], figsize=(16, 8)) data_df.plot(kind='scatter', x='LSTAT', y='PRICE', ax=axs[1]) data_df.plot(kind='scatter', x='AGE', y='PRICE', ax=axs[2]) # Correlation measures the strength of the linear relationship between two variables sns.set(style="white") # Compute the correlation matrix corr = data_df.corr() # Generate a mask for the upper triangle mask = np.zeros_like( corr, dtype=bool ) # Return an array of zeros with the same shape and type as a given array mask[np.triu_indices_from( mask)] = True # Return the indices for the upper-triangle of arr # Set up the matplotlib figure f, ax = plt.subplots(figsize=(11, 9)) # Generate a custom diverging colormap cmap = sns.diverging_palette(220, 10, as_cmap=True) # Draw the heatmap with the mask and correct aspect ratio sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5})
def scores(key, paths, config): values = [mapreduce.OutputCollector(p) for p in paths] try: values = [item.load() for item in values] except Exception as e: print(e) return None y_true_splits = [item["y_true"].ravel() for item in values] y_pred_splits = [item["y_pred"].ravel() for item in values] y_true = np.concatenate(y_true_splits) y_pred = np.concatenate(y_pred_splits) prob_pred_splits = [item["prob_pred"].ravel() for item in values] prob_pred = np.concatenate(prob_pred_splits) # Prediction performances p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) auc = roc_auc_score(y_true, prob_pred) # balanced accuracy (recall_mean) bacc_splits = [recall_score(y_true_splits[f], y_pred_splits[f], average=None).mean() for f in range(len(y_true_splits))] auc_splits = [roc_auc_score(y_true_splits[f], prob_pred_splits[f]) for f in range(len(y_true_splits))] print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits))) # P-values success = r * s success = success.astype('int') prob_class1 = np.count_nonzero(y_true) / float(len(y_true)) pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1, alternative='greater') pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1, alternative='greater') pvalue_recall0_unknown_prob = binom_test(success[0], s[0], 0.5, alternative='greater') pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5, alternative='greater') pvalue_bacc = binom_test(success[0] + success[1], s[0] + s[1], p=0.5, alternative='greater') # Betas' measures of similarity betas = np.hstack([item["beta"][:, penalty_start:].T for item in values]).T # Correlation R = np.corrcoef(betas) R = R[np.triu_indices_from(R, 1)] # Fisher z-transformation / average z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R))) # back-transform r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1) # threshold betas to compute fleiss_kappa and DICE try: betas_t = np.vstack([ array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0] for i in range(betas.shape[0])]) # Compute Fleiss kappa statistics beta_signed = np.sign(betas_t) table = np.zeros((beta_signed.shape[1], 3)) table[:, 0] = np.sum(beta_signed == 0, 0) table[:, 1] = np.sum(beta_signed == 1, 0) table[:, 2] = np.sum(beta_signed == -1, 0) fleiss_kappa_stat = fleiss_kappa(table) # Pairwise Dice coefficient ij = [[i, j] for i in range(betas.shape[0]) for j in range(i+1, betas.shape[0])] dices = list() for idx in ij: A, B = beta_signed[idx[0], :], beta_signed[idx[1], :] dices.append(float(np.sum((A == B)[(A != 0) & (B != 0)])) / (np.sum(A != 0) + np.sum(B != 0))) dice_bar = np.mean(dices) except: # fall back to neutral values so the summary below still runs dice_bar = fleiss_kappa_stat = 0 dices = [] betas_t = np.zeros_like(betas) # Proportion of selection within the support across the CV support_count = (betas_t != 0).sum(axis=0) support_count = support_count[support_count > 0] support_prop = support_count / betas_t.shape[0] scores = OrderedDict() scores['key'] = key scores['recall_0'] = r[0] scores['recall_1'] = r[1] scores['bacc'] = r.mean() scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits)) scores["auc"] = auc scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits)) scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob scores['pvalue_recall0_unknown_prob_one_sided'] = pvalue_recall0_unknown_prob scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob scores['pvalue_bacc_mean'] = pvalue_bacc scores['prop_non_zeros_mean'] = 
float(np.count_nonzero(betas_t)) / \ float(np.prod(betas.shape)) scores['beta_r_bar'] = r_bar scores['beta_fleiss_kappa'] = fleiss_kappa_stat scores['beta_dice_bar'] = dice_bar scores['beta_dice'] = str(dices) scores['beta_r'] = str(R) scores['beta_support_prop_select_mean'] = support_prop.mean() scores['beta_support_prop_select_sd'] = support_prop.std() return scores
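# Worked sketch of the Fisher averaging used above (standalone, toy
# correlation values): z-transform each r, average, then back-transform.
import numpy as np
R = np.array([0.2, 0.5, 0.7])
z = 0.5 * np.log((1 + R) / (1 - R))                         # Fisher z-transform
z_bar = z.mean()
r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)   # back-transform
print(r_bar)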
def GammaGamma_Connectome_thresholding_pFDR(input_file, toolbox_path): #Add the toolbox to path #toolbox_path = "/Users/alblle/allera_version_controlled_code/One_Dim_Mixture_Models/python/code" sys.path.append(os.path.join(os.path.abspath(toolbox_path))) from Mixture_Model_1Dim import Mixture_Model_1Dim #load input connectivity matrix #input_file="/Users/alblle/Dropbox/POSTDOC/Demetrius/dmn_non_normalized.csv" #connectivity_matrix = np.loadtxt(input_file, delimiter=',')#, skiprows=1,skipcolumns=1) connectivity_matrix = np.genfromtxt(input_file, delimiter=',') #get upper-diagonal terms updiag_idx = np.triu_indices_from(connectivity_matrix, k=1) orig_data_vector = connectivity_matrix[updiag_idx] orig_data_vector = orig_data_vector[~np.isnan(orig_data_vector)] data_vector = orig_data_vector[orig_data_vector > 0.05] scaling_factor = np.mean(data_vector) data_vector = np.divide(data_vector, scaling_factor) #Define options for the mixture model fit Inference = 'Variational Bayes' #'Method of moments' OR 'Maximum Likelihood' OR 'Variational Bayes' ML NOT INCLUDED YET Number_of_Components = 2 Components_Model = [ 'Gamma', 'InvGamma' ] #,'-Gamma'] #Each component can be Gauss, Gamma, InvGamma, -Gamma, -InvGamma maxits = 500 tol = 0.00001 good_model = 0 percentiles = np.array([99, 98.5, 98, 97.5, 97, 96.5, 96, 95.5, 95]) percentile_idx = -1 while good_model == 0: percentile_idx = percentile_idx + 1 tail = np.percentile(data_vector, percentiles[percentile_idx]) init_params = [1, 2, tail, 2] #,-5,2] opts = { 'Inference': Inference, 'Number_of_Components': Number_of_Components, 'Components_Model': Components_Model, 'init_params': init_params, 'maxits': maxits, 'tol': tol } #Define options for the mixture model fit # CALL TO FIT MIXTURE MODEL Model = Mixture_Model_1Dim(data_vector, opts) #if Model['Mixing Prop.'][0]<.95: good_model = 1 # CALL TO FIT MIXTURE MODEL if 1: # Plot the resulting fit on a histogram of the data from alb_MM_functions import gam my_range = np.linspace(0.01, np.max(data_vector), 10000) plt1 = np.multiply( Model['Mixing Prop.'][0], gam(my_range, Model['shapes'][0], np.divide(1, Model['rates'][0]))) plt2 = np.multiply( Model['Mixing Prop.'][1], gam(my_range, Model['shapes'][1], np.divide(1, Model['rates'][1]))) import matplotlib.pyplot as plt plt.hist(data_vector, bins=50, density=True, alpha=1, color='g') plt.plot(my_range, plt1, 'k', linewidth=2) plt.plot(my_range, plt2, 'k', linewidth=2) plt.plot(my_range, plt1 + plt2, 'r', linewidth=2) plt.show() # Plot the resulting fit on a histogram of the data #Compute local FDR p0 = Model['Mixing Prop.'][0] #f0(x)=gam(x,Model['shapes'][0],np.divide(1,Model['rates'][0]))) rho = data_vector.shape[0] sorted_data_vector = -np.sort(-data_vector) all_localFDR = np.ones(rho) flag = 0 k = -1 while flag == 0: k = k + 1 point = sorted_data_vector[k] cdf = scipy.stats.gamma.cdf(point, Model['shapes'][0], 0, np.divide(1., Model['rates'][0])) numerator = np.multiply(float(p0), 1 - cdf) denominator = np.divide(float(k + 1), float(rho)) all_localFDR[k] = np.divide(numerator, denominator) pFDR = all_localFDR[k] if pFDR > 0.05: threshold = np.multiply(sorted_data_vector[k - 1], scaling_factor) flag = 1 print(threshold) return threshold, Model
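# Standalone sketch of the positive-FDR estimate used above (toy values, not
# a fitted model): for the k-th largest value x_(k),
# pFDR(k) ~= p0 * (1 - F0(x_(k))) / (k / n), where F0 is the null
# (first-component) gamma CDF and p0 its mixing proportion.
import numpy as np, scipy.stats
vals = np.sort(np.random.gamma(2.0, 1.0, 500))[::-1]   # descending toy data
p0, shape, rate = 0.9, 2.0, 1.0                        # hypothetical null fit
k = np.arange(1, vals.size + 1)
pfdr = p0 * (1 - scipy.stats.gamma.cdf(vals, shape, 0, 1.0 / rate)) / (k / vals.size)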
def takeAverageList(): # assumes `corr` (a channels x channels correlation DataFrame) and `channels` (int) exist in the enclosing scope result = [] for i in range(channels): result.append(corr.values[np.triu_indices_from(corr.values, i)].mean()) # print(str(i + 1) + ' ' + str(corr.values[np.triu_indices_from(corr.values, i)].mean())) return result
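# Standalone sketch of the per-offset averaging above (toy matrix): the mean
# is taken over the upper triangle starting at diagonal offset k.
import numpy as np
corr_vals = np.array([[1.0, 0.8, 0.3], [0.8, 1.0, 0.5], [0.3, 0.5, 1.0]])
for k in range(corr_vals.shape[0]):
    print(k, corr_vals[np.triu_indices_from(corr_vals, k)].mean())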
def run_ttests(narps, logfile, overwrite=True): masker = nilearn.input_data.NiftiMasker(mask_img=narps.dirs.MNI_mask) results_dir = narps.dirs.dirs['consensus'] func_name = sys._getframe().f_code.co_name log_to_file(logfile, '%s' % func_name) if not os.path.exists(results_dir): os.mkdir(results_dir) for hyp in hypnums: if not overwrite and os.path.exists( os.path.join(results_dir, 'hypo%d_1-fdr.nii.gz' % hyp)): print('using existing results') continue print('running consensus analysis for hypothesis', hyp) maps = glob.glob( os.path.join(narps.dirs.dirs['output'], 'zstat/*/hypo%d_unthresh.nii.gz' % hyp)) maps.sort() data = masker.fit_transform(maps) # get estimated mean, variance, and correlation for t_corr img_mean = numpy.mean(data) img_var = numpy.mean(numpy.var(data, 1)) cc = numpy.corrcoef(data) log_to_file( logfile, 'mean = %f, var = %f, mean_cc = %f' % (img_mean, img_var, numpy.mean(cc[numpy.triu_indices_from(cc, 1)]))) # perform t-test tvals, pvals = t_corr(data, res_mean=img_mean, res_var=img_var, Q=cc) # move back into image format timg = masker.inverse_transform(tvals) timg.to_filename(os.path.join(results_dir, 'hypo%d_t.nii.gz' % hyp)) pimg = masker.inverse_transform(1 - pvals) pimg.to_filename(os.path.join(results_dir, 'hypo%d_1-p.nii.gz' % hyp)) fdr_results = multipletests(pvals[0, :], 0.05, 'fdr_tsbh') log_to_file( logfile, "%d voxels significant at FDR corrected p<.05" % numpy.sum(fdr_results[0])) fdrimg = masker.inverse_transform(1 - fdr_results[1]) fdrimg.to_filename( os.path.join(results_dir, 'hypo%d_1-fdr.nii.gz' % hyp)) # compute tau^2 per Tom's notes in CorrelatedMetaNotes.html def tau(data, Q): n = data.shape[0] R = numpy.eye(n) - numpy.ones((n, 1)).dot(numpy.ones((1, n))) / n sampvar_est = numpy.trace(R.dot(Q)) tau2 = numpy.zeros(data.shape[1]) for i in range(data.shape[1]): Y = data[:, i] tau2[i] = (1 / sampvar_est) * Y.T.dot(R).dot(Y) return (numpy.sqrt(tau2)) tau_est = tau(data, cc) tauimg = masker.inverse_transform(tau_est) tauimg.to_filename(os.path.join(results_dir, 'hypo%d_tau.nii.gz' % hyp))
def fill_off_diagonal(x, radius, value=0): """Sets all cells of a matrix to a given ``value`` if they lie outside a constraint region. In this case, the constraint region is the Sakoe-Chiba band which runs with a fixed ``radius`` along the main diagonal. When ``x.shape[0] != x.shape[1]``, the radius will be expanded so that ``x[-1, -1] = 1`` always. ``x`` will be modified in place. Parameters ---------- x : np.ndarray [shape=(N, M)] Input matrix, will be modified in place. radius : float The band radius (1/2 of the width) will be ``int(radius*min(x.shape))``. value : int ``x[n, m] = value`` when ``(n, m)`` lies outside the band. Examples -------- >>> x = np.ones((8, 8)) >>> fill_off_diagonal(x, 0.25) >>> x array([[1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, 0, 0], [0, 0, 1, 1, 1, 0, 0, 0], [0, 0, 0, 1, 1, 1, 0, 0], [0, 0, 0, 0, 1, 1, 1, 0], [0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 0, 0, 1, 1]]) >>> x = np.ones((8, 12)) >>> fill_off_diagonal(x, 0.25) >>> x array([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0], [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]) """ nx, ny = x.shape # Calculate the radius in indices, rather than proportion radius = int(np.round(radius * np.min(x.shape))) offset = np.abs((x.shape[0] - x.shape[1])) if nx < ny: idx_u = np.triu_indices_from(x, k=radius + offset) idx_l = np.tril_indices_from(x, k=-radius) else: idx_u = np.triu_indices_from(x, k=radius) idx_l = np.tril_indices_from(x, k=-radius - offset) # modify input matrix x[idx_u] = value x[idx_l] = value
def corner_beautiful_plot(data,bestfit,split,bins=50,labels=None,interpolation='nearest',cmap=plt.cm.gray,show=True): """ Create a corner plot from a pandas dataframe input. ___ INPUT: data: pandas dataframe with N columns bestfit: best fit values to mark in the corner plot as horizontal and vertical lines with N-values in a numpy array split: boolean array with N-elements. This sets whether only the decimal part of the values is kept. Because this plot is intended for exoplanetary transits, this option was created to remove the integer part of the Julian Date bins: the number of bins that you want to show at the 2D-histogram of the data at the lower subplots of the corner plot. Default is 50 bins. labels: labels of each column of the corner plot. N-list of strings. Default is None, and the code will use the names of the columns in the pandas dataframe input. interpolation: string, interpolation of the 2D-histogram. Default is 'nearest'. Possible options are 'none', 'bilinear', 'bicubic', 'spline16', 'spline36', 'hanning', 'hamming', 'hermite', 'kaiser', 'quadric', 'catrom', 'gaussian', 'bessel', 'mitchell', 'sinc','lanczos'. cmap: `~matplotlib.colors.Colormap`, optional, default: plt.cm.gray. If None, cmap defaults to the rc `image.cmap` value. `cmap` is ignored when `X` has RGB(A) information. show: boolean value: True or False. Default is True. If True, the function will print information for each step. """ #Setting the plot default parameters: def init_plotting2(): plt.rcParams['figure.figsize'] = (14.0,14.0) plt.rcParams['font.size'] = 14 #plt.rcParams['font.family'] = 'Times New Roman' plt.rcParams['axes.labelsize'] = plt.rcParams['font.size'] plt.rcParams['axes.titlesize'] = 2*plt.rcParams['font.size'] plt.rcParams['legend.fontsize'] = 0.65*plt.rcParams['font.size'] plt.rcParams['xtick.labelsize'] = plt.rcParams['font.size'] plt.rcParams['ytick.labelsize'] = plt.rcParams['font.size'] plt.rcParams['xtick.major.size'] = 3 plt.rcParams['xtick.minor.size'] = 3 plt.rcParams['xtick.major.width'] = 1 plt.rcParams['xtick.minor.width'] = 1 plt.rcParams['ytick.major.size'] = 3 plt.rcParams['ytick.minor.size'] = 3 plt.rcParams['ytick.major.width'] = 1 plt.rcParams['ytick.minor.width'] = 1 plt.rcParams['legend.frameon'] = True plt.rcParams['legend.loc'] = 'best' plt.rcParams['axes.linewidth'] = 1 init_plotting2() #initializing the plot parameters #store the shape of the dataframe input in a variable used #to obtain the data and set the size of the corner plot shape = data.shape #[0]rows, [1]columns #If split has some value equal to True, then that column #will have the integer part removed. for k in range(shape[1]): if split[k] == True: data.iloc[:,k] = np.modf(data.iloc[:,k].values)[0] #creating the corner plot structure: #f: figure output #axarr: array-plot, matrix plot with the size of the columns of dataframe per #columns of the dataframe N X N plots f, axarr = plt.subplots(shape[1], shape[1])#, sharex=True)#, sharey=True) #if show == True, this routine will print the information for each step before starting #the procedure to create the plot i X j. if show == True: print('Creating corner plot, shape = ',shape[1],' per ',shape[1]) #Creating the plot i X j for i in range(shape[1]): for j in range(shape[1]): #information about what type of plot will be created #If i == j, a histogram of the i-th column of the #pandas dataframe input will be created. 
#If i != j, a numpy.histogram2d variable will be created, and #its output will be used as an image to be rendered with the plt.imshow function if show == True: print('Subplot = ',i,j) if i == j: print('Histogram of ',i) else: print('Density plot of ',i,' per ',j) # i == j, Diagonal plots: Histogram plots of the i-columns of the dataframe input. if i == j: #remove histogram grid. axarr[i][j].grid(False) #create the numpy.histogram variable #density=True to match with the plt.imshow image #bins set to the sqrt-scale, sqrt(len(i-column)), to show #an appropriate scale of the data set. H,bins_edges = np.histogram(data.iloc[:,i].values,density=True,bins='sqrt') #plot the histogram under the same conditions as the numpy.histogram H variable axarr[i][j].hist(data.iloc[:,i].values,density=True,bins='sqrt') #plot a vertical line at the bestfit i-value from zero to the maximum #of the H-variable and set the x-limits and y-limits axarr[i][j].vlines(bestfit[i],0,H.max(),color='red') axarr[i][j].set_ylim(0,H.max()) axarr[i][j].set_xlim(data.iloc[:,i].min(),data.iloc[:,i].max()) #setting the text with the value of the i-bestfit rounded to 4 decimal places #color set to match the vertical line axarr[i][j].text(bestfit[i],H.mean(),str(round(bestfit[i],4)),color='red') #remove y-axis to clean the corner plot of unneeded information axarr[i][j].get_yaxis().set_visible(False) #remove x-ticks if the plots are not max(j) per max(j). #Those plots represent histograms that are above other plots #Else, the x-ticks will be created, with the name of the column in the pandas #dataframe input or, if labels are given, those will be used. if labels == None: if i == int(shape[1]-1): axarr[i][j].set_xlabel(data.columns[i]) else: if i == int(shape[1]-1): axarr[i][j].set_xlabel(labels[i]) if i != int(shape[1]-1): axarr[i][j].get_xaxis().set_visible(False) if i == int(shape[1]-1): axarr[i][j].get_xaxis().set_visible(True) axarr[i][j].locator_params(axis='x',nbins=6) #i != j plots: image plot from plt.imshow #We will map the numpy.histogram2d from the j-column (x-axis) per i-column (y-axis) #The change of j to x-axis and i to y-axis is made to match the x-label of the #histograms in the diagonal plots with the image plots. So, this will set #the corner axes correctly. else: #Create the numpy.histogram2d object (image) H from the data set j-column per #i-column, and set the x- and y-ticks label ranges xedges and yedges, respectively. #We will use the number of bins given in the input parameters. The default is #bins = 50. H, xedges, yedges = np.histogram2d(data.iloc[:,j].values,data.iloc[:,i].values,bins=bins) #Setting the minimum and maximum of each x- and y-tick, rounded to 4 decimal places. xmin, xmax = round(xedges.min(),4),round(xedges.max(),4) ymin, ymax = round(yedges.min(),4),round(yedges.max(),4) #remove the grid of the image. axarr[i][j].grid(False) #plotting the image H with cmap and interpolation given by input parameters. #the minimum and maximum of the map are set to the mean value of H #minus or plus the standard deviation of H, respectively. #The aspect is set to auto to adjust the box of the image with the x- and #y-ticks. axarr[i][j].imshow(H,cmap=cmap,origin='lower', extent=[xmin,xmax,ymin,ymax],aspect='auto', vmin=np.mean(H)-np.std(H),vmax=np.mean(H)+np.std(H), interpolation=interpolation) #Make the contour plot of H. 
axarr[i][j].contour(H,origin='lower',extent=[xmin,xmax,ymin,ymax]) #Setting the box limits to match the diagonal histogram plots axarr[i][j].set_xlim(xmin,xmax) axarr[i][j].set_ylim(ymin,ymax) #plot the vertical and horizontal lines at the bestfit i and j #values, in red. axarr[i][j].hlines(bestfit[i],xmin,xmax,color='red') axarr[i][j].vlines(bestfit[j],ymin,ymax,color='red') #set the aspect of the box x- and y-ticks to auto too, to match the #plt.imshow parameter. axarr[i][j].set_aspect('auto') #Fix the number of values at the array to clean the x- and y-ticks. #here, I choose to make only 6 bins, which gives 5 ticks at most axarr[i][j].locator_params(axis='x',nbins=6) axarr[i][j].locator_params(axis='y',nbins=6) #Set the labels of each plot. If the labels are given as one of the input #parameters, these will be used instead of the names of the pandas #dataframe input if labels == None: axarr[i][j].set_ylabel(data.columns[i]) axarr[i][j].set_xlabel(data.columns[j]) else: axarr[i][j].set_ylabel(labels[i]) axarr[i][j].set_xlabel(labels[j]) #If the plot is between two other plots, #the y-ticks or the x-ticks will be removed. if (j > 0): axarr[i][j].get_yaxis().set_visible(False) if (i < shape[1]-1): axarr[i][j].get_xaxis().set_visible(False) #Remove the plots above the diagonal #this creates a triangle plot for i, j in zip(*np.triu_indices_from(axarr, 1)): axarr[i, j].set_visible(False) return f
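# Minimal usage sketch for corner_beautiful_plot() above (hypothetical MCMC chain):
#
#   import numpy as np, pandas as pd
#   chain = pd.DataFrame({'t0': 2455000.5 + 1e-3 * np.random.randn(5000),
#                         'depth': 0.01 + 1e-4 * np.random.randn(5000)})
#   fig = corner_beautiful_plot(chain, bestfit=np.array([2455000.5, 0.01]),
#                               split=[True, False], bins=40, show=False)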
def heatmap(df, inline=True, filter=None, n=0, p=0, sort=None, figsize=(20, 12), fontsize=16, labels=True, cmap='RdBu'): """ Presents a `seaborn` heatmap visualization of nullity correlation in the given DataFrame. Note that this visualization has no special support for large datasets. For those, try the dendrogram instead. :param df: The DataFrame whose completeness is being heatmapped. :param filter: The filter to apply to the heatmap. Should be one of "top", "bottom", or None (default). See `nullity_filter()` for more information. :param n: The cap on the number of columns to include in the filtered DataFrame. See `nullity_filter()` for more information. :param p: The cap on the percentage fill of the columns in the filtered DataFrame. See `nullity_filter()` for more information. :param sort: The sort to apply to the heatmap. Should be one of "ascending", "descending", or None. See `nullity_sort()` for more information. :param figsize: The size of the figure to display. This is a `matplotlib` parameter which defaults to (20, 12). :param fontsize: The figure's font size. :param labels: Whether or not to label each matrix entry with its correlation (default is True). :param cmap: What `matplotlib` colormap to use. Defaults to `RdBu`. :return: Returns the underlying `matplotlib.figure` object. """ # Apply filters and sorts. df = nullity_filter(df, filter=filter, n=n, p=p) df = nullity_sort(df, sort=sort) # Set up the figure. fig = plt.figure(figsize=figsize) gs = gridspec.GridSpec(1, 1) ax0 = plt.subplot(gs[0]) # Pre-processing: remove completely filled or completely empty variables. df = df.iloc[:, [ i for i, n in enumerate(np.var(df.isnull(), axis='rows')) if n > 0 ]] # Create and mask the correlation matrix. corr_mat = df.isnull().corr() # corr_mat = corr_mat.replace(np.nan, 1) # corr_mat[np.isnan(corr_mat)] = 0 mask = np.zeros_like(corr_mat) mask[np.triu_indices_from(mask)] = True # Set fontsize. # fontsize = _set_font_size(fig, df, fontsize) # Construct the base heatmap. if labels: sns.heatmap(corr_mat, mask=mask, cmap=cmap, ax=ax0, cbar=False, annot=True, annot_kws={"size": fontsize - 2}) else: sns.heatmap(corr_mat, mask=mask, cmap=cmap, ax=ax0, cbar=False) # Apply visual corrections and modifications. ax0.set_xticklabels(ax0.xaxis.get_majorticklabels(), rotation=45, ha='left', fontsize=fontsize) ax0.set_yticklabels(ax0.yaxis.get_majorticklabels(), rotation=0, fontsize=fontsize) ax0.xaxis.tick_top() ax0.patch.set_visible(False) # Fix up annotation label rendering. for text in ax0.texts: t = float(text.get_text()) if 0.95 <= t < 1: text.set_text("<1") elif -1 < t <= -0.95: text.set_text(">-1") elif t == 1: text.set_text("1") elif t == -1: text.set_text("-1") elif -0.05 < t < 0.05: text.set_text("") else: text.set_text(round(t, 1)) if inline: plt.show() else: return fig
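# Minimal usage sketch for the nullity heatmap above (hypothetical DataFrame
# with injected missingness; the nullity_filter/nullity_sort helpers it calls
# are assumed importable):
#
#   import numpy as np, pandas as pd
#   df = pd.DataFrame(np.random.randn(100, 4), columns=list('abcd'))
#   df[df > 1.5] = np.nan
#   heatmap(df, inline=True)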
def scatterplot_matrix(data, names, min_max_range, originaldf, **kwargs): numvars, numdata = data.shape #data = data.as_matrix() #data = data.astype(str) fig, axes = plt.subplots(nrows=len(names), ncols=len(names), figsize=(25, 25), sharex=False, sharey=False) fig.subplots_adjust(hspace=0.1, wspace=0.1) cmap = cm.get_cmap('Paired') norm = plt.Normalize() professorsArray = np.array(originaldf['professor_cat']) marker = np.array(originaldf['professor_cat']) face = [] for i in range(1, len(professorsArray)): marker[i] = marker[i] * 20 face.append(i) colors = cmap(norm(professorsArray)) lines = [] for ax in axes.flat: # Hide all ticks and labels # Set up ticks only on one side for the "edge" subplots... if ax.get_subplotspec().is_first_col(): ax.yaxis.set_ticks_position('left') elif ax.get_subplotspec().is_last_col(): ax.yaxis.set_ticks_position('right') else: ax.yaxis.set_visible(False) #ax.xaxis.set_visible(False) if ax.get_subplotspec().is_first_row(): #ax.yaxis.set_visible(False) ax.xaxis.set_ticks_position('top') elif ax.get_subplotspec().is_last_row(): ax.xaxis.set_ticks_position('bottom') else: ax.xaxis.set_visible(False) # Plot the data. k = 0 for i, j in zip(*np.triu_indices_from(axes, k=1)): for x, y in [(i, j), (j, i)]: rowval = names[y] colval = names[x] x_min_val, x_max_val, x_val_range = min_max_range[rowval] y_min_val, y_max_val, y_val_range = min_max_range[colval] x_arr = data[rowval] y_arr = data[colval] x_ticks = 5 y_ticks = 5 x_step = x_val_range / float(x_ticks - 1) y_step = y_val_range / float(y_ticks - 1) x_tick_labels = [ round(x_min_val + x_step * i, 2) for i in range(x_ticks) ] y_tick_labels = [ round(y_min_val + y_step * i, 2) for i in range(y_ticks) ] x_norm_min = data[rowval].min() y_norm_min = data[colval].min() x_norm_range = np.ptp(data[rowval]) y_norm_range = np.ptp(data[colval]) x_norm_step = x_norm_range / float(x_ticks - 1) y_norm_step = y_norm_range / float(y_ticks - 1) x_ticks = [ round(x_norm_min + x_norm_step * i, 2) for i in range(x_ticks) ] y_ticks = [ round(y_norm_min + y_norm_step * i, 2) for i in range(y_ticks) ] if rowval == 'professor_cat': j = 0 for val in x_tick_labels: val = int(val) professors = np.unique(originaldf['professor']) x_tick_labels[j] = professors[val] j = j + 1 if colval == 'professor_cat': j = 0 for val in y_tick_labels: val = int(val) professors = np.unique(originaldf['professor']) y_tick_labels[j] = professors[val] j = j + 1 if rowval == 'lecture_cat': j = 0 for val in x_tick_labels: val = int(val) lectures = np.unique(originaldf['lecture']) x_tick_labels[j] = lectures[val] j = j + 1 if colval == 'lecture_cat': j = 0 for val in y_tick_labels: val = int(val) lectures = np.unique(originaldf['lecture']) y_tick_labels[j] = lectures[val] j = j + 1 axes[x, y].yaxis.set_ticks(y_ticks) axes[x, y].xaxis.set_ticks(x_ticks) axes[x, y].set_xticklabels(x_tick_labels, rotation=75) axes[x, y].set_yticklabels(y_tick_labels) axes[x, y].scatter(x_arr, y_arr, c=colors, label=np.array(originaldf['professor'])) k = k + 1 #plt.setp(axes[x, y].get_xticklabels(), rotation=30, horizontalalignment='right') # Label the diagonal subplots... for i, label in enumerate(names): if label == 'professor_cat': label = 'professor' if label == 'lecture_cat': label = 'lecture' axes[i, i].annotate(label, (0.5, 0.5), xycoords='axes fraction', ha='center', va='center') axes[i, i].xaxis.set_visible(False) axes[i, i].yaxis.set_visible(False) # Turn on the proper x or y axes ticks. #handles, labels = axes[x, y].data.values(),axes[x, y].data.values() #fig.legend(colors,np.unique(professors)) plt.savefig('scattermatrix.png', dpi=72) plt.show()
def ripser(X, maxdim=1, thresh=np.inf, coeff=2, metric="euclidean", metric_params={}, weights=None, weight_params=None, collapse_edges=False, n_perm=None): """Compute persistence diagrams for X data array using Ripser [1]_. If X is not a distance matrix, it will be converted to a distance matrix using the chosen metric. Parameters ---------- X : ndarray of shape (n_samples, n_features) A numpy array of either data or distance matrix. Can also be a sparse distance matrix of type scipy.sparse maxdim : int, optional, default: ``1`` Maximum homology dimension computed. Will compute all dimensions lower than and equal to this value. For 1, H_0 and H_1 will be computed. thresh : float, optional, default: ``numpy.inf`` Maximum distances considered when constructing filtration. If ``numpy.inf``, compute the entire filtration. coeff : int prime, optional, default: ``2`` Compute homology with coefficients in the prime field Z/pZ for p=coeff. metric : string or callable, optional, default: ``'euclidean'`` The metric to use when calculating distance between instances in a feature array. If set to ``'precomputed'``, input data is interpreted as a distance matrix or as the adjacency matrix of a weighted undirected graph. If a string, it must be one of the options allowed by :func:`scipy.spatial.distance.pdist` for its metric parameter, or a metric listed in :obj:`sklearn.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`, including ``'euclidean'``, ``'manhattan'`` or ``'cosine'``. If a callable, it should take pairs of vectors (1D arrays) as input and, for each two vectors in a pair, it should return a scalar indicating the distance/dissimilarity between them. metric_params : dict, optional, default: ``{}`` Additional parameters to be passed to the distance function. weights : ``"DTM"``, ndarray or None, optional, default: ``None`` If not ``None``, the persistence of a weighted Vietoris-Rips filtration is computed as described in [3]_, and this parameter determines the vertex weights in the modified adjacency matrix. ``"DTM"`` denotes the empirical distance-to-measure function defined, following [3]_, by .. math:: w(x) = 2\\left(\\frac{1}{n+1} \\sum_{k=1}^n \\mathrm{dist}(x, x_k)^r \\right)^{1/r}. Here, :math:`\\mathrm{dist}` is the distance metric used, :math:`x_k` is the :math:`k`-th :math:`\\mathrm{dist}`-nearest neighbour of :math:`x` (:math:`x` is not considered a neighbour of itself), :math:`n` is the number of nearest neighbors to include, and :math:`r` is a parameter (see `weight_params`). If an ndarray is passed, it is interpreted as a user-defined list of vertex weights for the modified adjacency matrix. In either case, the edge weights :math:`\\{w_{ij}\\}_{i, j}` for the modified adjacency matrix are computed from the original distances and the new vertex weights :math:`\\{w_i\\}_i` as follows: .. math:: w_{ij} = \\begin{cases} \\max\\{ w_i, w_j \\} &\\text{if } 2\\mathrm{dist}_{ij} \\leq |w_i^p - w_j^p|^{\\frac{1}{p}} \\\\ t &\\text{otherwise} \\end{cases} where :math:`t` is the only positive root of .. math:: 2 \\mathrm{dist}_{ij} = (t^p - w_i^p)^\\frac{1}{p} + (t^p - w_j^p)^\\frac{1}{p} and :math:`p` is a parameter specified in `weight_params`. weight_params : dict or None, optional, default: ``None`` Parameters to be used in the case of weighted filtrations, see `weights`. In this case, the key ``"p"`` determines the power to be used in computing edge weights from vertex weights. It can be one of ``1``, ``2`` or ``np.inf`` and defaults to ``1``. 
If `weights` is ``"DTM"``, the additional keys ``"r"`` (default: ``2``) and ``"n_neighbors"`` (default: ``3``) are available (see `weights`, where the latter corresponds to :math:`n`). collapse_edges : bool, optional, default: ``False`` Whether to use the edge collapse algorithm as described in [2]_ prior to calling ``ripser``. n_perm : int or None, optional, default: ``None`` The number of points to subsample in a "greedy permutation", or a furthest point sampling of the points. These points will be used in lieu of the full point cloud for a faster computation, at the expense of some accuracy, which can be bounded as a maximum bottleneck distance to all diagrams on the original point set. Returns ------- A dictionary holding all of the results of the computation { 'dgms': list (size maxdim) of ndarray (n_pairs, 2) A list of persistence diagrams, one for each dimension less than maxdim. Each diagram is an ndarray of size (n_pairs, 2) with the first column representing the birth time and the second column representing the death time of each pair. 'num_edges': int The number of edges added during the computation 'dperm2all': None or ndarray (n_perm, n_samples) ``None`` if n_perm is ``None``. Otherwise, the distance from all points in the permutation to all points in the dataset. 'idx_perm': ndarray(n_perm) if n_perm > 0 Index into the original point cloud of the points used as a subsample in the greedy permutation 'r_cover': float Covering radius of the subsampled points. If n_perm <= 0, then the full point cloud was used and this is 0 } Notes ----- `Ripser <https://github.com/Ripser/ripser>`_ is used as a C++ backend for computing Vietoris–Rips persistent homology. Python bindings were modified for performance from the `ripser.py <https://github.com/scikit-tda/ripser.py>`_ package. `GUDHI <https://github.com/GUDHI/gudhi-devel>`_ is used as a C++ backend for the edge collapse algorithm described in [2]_. References ---------- .. [1] U. Bauer, "Ripser: efficient computation of Vietoris–Rips persistence barcodes", 2019; `arXiv:1908.02518 <https://arxiv.org/abs/1908.02518>`_. .. [2] J.-D. Boissonnat and S. Pritam, "Edge Collapse and Persistence of Flag Complexes"; in *36th International Symposium on Computational Geometry (SoCG 2020)*, pp. 19:1–19:15, Schloss Dagstuhl-Leibniz–Zentrum für Informatik, 2020; `DOI: 10.4230/LIPIcs.SoCG.2020.19 <https://doi.org/10.4230/LIPIcs.SoCG.2020.19>`_. .. [3] H. Anai et al, "DTM-Based Filtrations"; in *Topological Data Analysis* (Abel Symposia, vol 15), Springer, 2020; `DOI: 10.1007/978-3-030-43408-3_2 <https://doi.org/10.1007/978-3-030-43408-3_2>`_. 
""" if n_perm and issparse(X): raise Exception("Greedy permutation is not supported for sparse " "distance matrices") if n_perm and n_perm > X.shape[0]: raise Exception("Number of points in greedy permutation is greater " "than number of points in the point cloud") if n_perm and n_perm < 0: raise Exception("There should be a strictly positive number of points " "in the greedy permutation") idx_perm = np.arange(X.shape[0]) r_cover = 0.0 if n_perm: idx_perm, lambdas, dperm2all = \ get_greedy_perm(X, n_perm=n_perm, metric=metric) r_cover = lambdas[-1] dm = dperm2all[:, idx_perm] else: if metric == 'precomputed': dm = X else: dm = pairwise_distances(X, metric=metric, **metric_params) dperm2all = None n_points = max(dm.shape) use_sparse_computer = True if issparse(dm): row, col, data = _resolve_symmetry_conflicts(dm.tocoo()) # Upper diag if weights is not None: if (dm < 0).nnz: raise ValueError("Distance matrix has negative entries. " "Weighted Rips filtration unavailable.") weight_params = {} if weight_params is None else weight_params weights_p = weight_params.get("p", 1) # Restrict to off-diagonal entries for weights computation since # diagonal ones are given by `weights`. Explicitly set the diagonal # to 0 -- this is also important for DTM since otherwise # kneighbors_graph with include_self=False skips the first true # neighbor. off_diag = row != col row, col, data = (np.hstack([row[off_diag], np.arange(n_points)]), np.hstack([col[off_diag], np.arange(n_points)]), np.hstack([data[off_diag], np.zeros(n_points)])) if isinstance(weights, str) and (weights == "DTM"): n_neighbors = weight_params.get("n_neighbors", 3) weights_r = weight_params.get("r", 2) # CSR matrix must be symmetric for kneighbors_graph to give # correct results dm = csr_matrix((np.hstack([data, data[:-n_points]]), (np.hstack([row, col[:-n_points]]), np.hstack([col, row[:-n_points]])))) weights = _compute_dtm_weights(dm, n_neighbors, weights_r) else: weights = _check_weights(weights, n_points) data = _weight_filtration_sparse(row, col, data, weights, weights_p) if collapse_edges: row, col, data = _collapse_coo(row, col, data, thresh) else: if weights is not None: if (dm < 0).any(): raise ValueError("Distance matrix has negative entries. 
" "Weighted Rips filtration unavailable.") weight_params = {} if weight_params is None else weight_params weights_p = weight_params.get("p", 1) if isinstance(weights, str) and (weights == "DTM"): n_neighbors = weight_params.get("n_neighbors", 3) weights_r = weight_params.get("r", 2) if not np.array_equal(dm, dm.T): dm = np.triu(dm, k=1) dm += dm.T weights = _compute_dtm_weights(dm, n_neighbors, weights_r) else: weights = _check_weights(weights, n_points) dm = _weight_filtration_dense(dm, weights, weights_p) np.fill_diagonal(dm, weights) if (dm.diagonal() != 0).any(): # Convert to sparse format, because currently that's the only # one handling nonzero births (row, col) = np.triu_indices_from(dm) data = dm[(row, col)] if collapse_edges: row, col, data = _collapse_coo(row, col, data, thresh) elif collapse_edges: row, col, data = gtda_collapser.\ flag_complex_collapse_edges_dense(dm, thresh) else: use_sparse_computer = False if use_sparse_computer: res = DRFDMSparse(np.asarray(row, dtype=np.int32, order="C"), np.asarray(col, dtype=np.int32, order="C"), np.asarray(data, dtype=np.float32, order="C"), n_points, maxdim, thresh, coeff) else: # Only consider strict upper diagonal DParam = squareform(dm, checks=False).astype(np.float32) # Run garbage collector to free up memory taken by `dm` del dm gc.collect() res = DRFDM(DParam, maxdim, thresh, coeff) # Unwrap persistence diagrams dgms = res.births_and_deaths_by_dim for dim in range(len(dgms)): N = int(len(dgms[dim]) / 2) dgms[dim] = np.reshape(np.array(dgms[dim]), [N, 2]) ret = { "dgms": dgms, "num_edges": res.num_edges, "dperm2all": dperm2all, "idx_perm": idx_perm, "r_cover": r_cover } return ret
# Plotting the correlation matrix
# http://glowingpython.blogspot.com.es/2012/10/visualizing-correlation-matrices.html
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

R = np.corrcoef(semana.T)  # correlation matrix of the columns of `semana`
plt.pcolor(R)
plt.colorbar()
plt.yticks(np.arange(0, 20), range(0, 20))
plt.xticks(np.arange(0, 20), range(0, 20))
plt.show()

# http://stanford.edu/~mwaskom/software/seaborn/examples/many_pairwise_correlations.html
sns.set(style="white")

# Generate a mask for the upper triangle
mask = np.zeros_like(R, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(200, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio; the trailing
# keyword arguments follow the seaborn example linked above
sns.heatmap(R, mask=mask, cmap=cmap, vmax=.8, square=True,
            xticklabels=2, yticklabels=2, linewidths=.5,
            cbar_kws={"shrink": .5}, ax=ax)
plt.show()
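# Equivalent mask construction (an illustrative aside, not from the original
# snippet): np.triu on a boolean ones-array marks the same upper-triangle
# cells, diagonal included, in a single call.
import numpy as np

R_demo = np.corrcoef(np.random.default_rng(0).normal(size=(50, 20)).T)
mask_alt = np.triu(np.ones_like(R_demo, dtype=bool))
assert mask_alt[np.triu_indices_from(mask_alt)].all()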
fig = plt.figure(figsize=(10, 9))
data[data.columns[1:24]].corrwith(
    data['default.payment.next.month']).plot.barh(fontsize=20, rot=0,
                                                  grid=True)
plt.title("Correlation of explanatory variables with the target feature",
          fontsize=20, fontweight='bold')
plt.show()

correlations_exvar = data[data.columns[1:24]].corr()
plt.figure(figsize=(20, 15))
mask1 = np.zeros_like(correlations_exvar, dtype=bool)
mask1[np.triu_indices_from(mask1)] = True
cmap = 'Dark2'  # sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(correlations_exvar, cmap=cmap, mask=mask1, annot=True,
            square=True, vmax=.3, center=0, linewidths=.5,
            cbar_kws={"shrink": 0.7})
plt.title('The correlation among %d explanatory variables' %
          len(data[data.columns[1:24]].columns),
          fontsize=20, fontweight='bold')
plt.ylabel('The names of the %d explanatory variables' %
           len(data[data.columns[1:24]].columns), fontsize=20)
plt.show()
import os
import sys

import numpy as np
from scipy.stats import norm


def GaussGammas_Connectome_thresholding_pFDR(input_file, toolbox_path):
    # Add the toolbox to path
    sys.path.append(os.path.join(os.path.abspath(toolbox_path)))
    from Mixture_Model_1Dim import Mixture_Model_1Dim

    # Load the input connectivity matrix
    # connectivity_matrix = np.loadtxt(input_file, delimiter=',')
    connectivity_matrix = np.genfromtxt(input_file, delimiter=',')

    # Get the upper-diagonal terms
    updiag_idx = np.triu_indices_from(connectivity_matrix, k=1)
    orig_data_vector = connectivity_matrix[updiag_idx]
    orig_data_vector = orig_data_vector[~np.isnan(orig_data_vector)]
    # data_vector = orig_data_vector[orig_data_vector > 0.05]

    # Demean and divide by the std to allow easy initialization
    mean_factor = np.mean(orig_data_vector)
    scaling_factor = 1.  # np.std(orig_data_vector)
    data_vector = np.divide(orig_data_vector - mean_factor, scaling_factor)

    # Define options for the mixture model fit. Inference can be
    # 'Method of moments', 'Maximum Likelihood' or 'Variational Bayes'
    # (ML not included yet).
    Inference = 'Variational Bayes'
    Number_of_Components = 3
    # Each component can be Gauss, Gamma, InvGamma, -Gamma or -InvGamma
    Components_Model = ['Gauss', 'InvGamma', '-InvGamma']
    maxits = 500
    tol = 0.00001
    # init_params = [0, 1, 6, 2, -6, 2]  # fixed-value alternative
    init_params = [
        0, 1,
        np.percentile(data_vector, 99), 2,
        np.percentile(data_vector, 1), 2
    ]
    opts = {
        'Inference': Inference,
        'Number_of_Components': Number_of_Components,
        'Components_Model': Components_Model,
        'init_params': init_params,
        'maxits': maxits,
        'tol': tol
    }

    # Call to fit the mixture model
    Model = Mixture_Model_1Dim(data_vector, opts)
    # if Model['Mixing Prop.'][0] < .95: good_model = 1

    # Visualize the fit: plot the fitted component densities on a histogram
    # of the data. `invgam` and `gam` are density helpers assumed to be
    # defined alongside this function.
    visualize_model_fit = 1
    if visualize_model_fit == 1:
        my_range = np.linspace(-10, 10, 10000)
        plt0 = np.multiply(
            Model['Mixing Prop.'][0],
            norm.pdf(my_range, Model['mu1'][0],
                     np.sqrt(np.divide(1, Model['taus1'][0]))))
        if Components_Model[1] == 'InvGamma':
            plt1 = np.multiply(
                Model['Mixing Prop.'][1],
                invgam(my_range, Model['shapes'][1], Model['scales'][1]))
        elif Components_Model[1] == 'Gamma':
            plt1 = np.multiply(
                Model['Mixing Prop.'][1],
                gam(my_range, Model['shapes'][1],
                    np.divide(1, Model['rates'][1])))
        plt1[my_range < 0] = 0
        if Components_Model[2] == '-InvGamma':
            plt2 = np.multiply(
                Model['Mixing Prop.'][2],
                invgam(-my_range, Model['shapes'][2], Model['scales'][2]))
        elif Components_Model[2] == '-Gamma':
            plt2 = np.multiply(
                Model['Mixing Prop.'][2],
                gam(-my_range, Model['shapes'][2],
                    np.divide(1, Model['rates'][2])))
        plt2[my_range > 0] = 0

        import matplotlib.pyplot as plt
        fig = plt.figure()
        plt.hist(data_vector, bins=50, density=True, alpha=1, color='g')
        plt.plot(my_range, plt0, 'k', linewidth=2)
        plt.plot(my_range, plt1, 'k', linewidth=2)
        plt.plot(my_range, plt2, 'k', linewidth=2)
        plt.plot(my_range, plt0 + plt1 + plt2, 'r', linewidth=2)
        fig.savefig(os.path.expanduser('~/Desktop/temp.png'), dpi=fig.dpi)
        # plt.show()

    # Compute the local FDR at the positive and negative tails
    # f0(x) = gam(x, Model['shapes'][0], 1 / Model['rates'][0])
    p0 = Model['Mixing Prop.'][0]
    rho = data_vector.shape[0]

    # FDR at the positive tail
    sorted_data_vector = -np.sort(-data_vector)
    all_localFDR = np.ones(rho)
    flag = 0
    k = -1
    while flag == 0:
        k = k + 1
        point = sorted_data_vector[k]
        cdf = norm.cdf(point, Model['mu1'][0],
                       np.sqrt(np.divide(1, Model['taus1'][0])))
        numerator = np.multiply(float(p0), 1 - cdf)
        denominator = np.divide(float(k + 1), float(rho))
        all_localFDR[k] = np.divide(numerator, denominator)
        pFDR = all_localFDR[k]
        if pFDR > 0.001:
            if k == 0:
                threshold1 = sorted_data_vector[k]
            else:
                threshold1 = sorted_data_vector[k - 1]
            flag = 1

    # FDR at the negative tail
    sorted_data_vector = -np.sort(data_vector)
    all_localFDR = np.ones(rho)
    flag = 0
    k = -1
    while flag == 0:
        k = k + 1
        point = sorted_data_vector[k]
        cdf = norm.cdf(-point, Model['mu1'][0],
                       np.sqrt(np.divide(1, Model['taus1'][0])))
        numerator = np.multiply(float(p0), 1 - cdf)
        denominator = np.divide(float(k + 1), float(rho))
        all_localFDR[k] = np.divide(numerator, denominator)
        pFDR = all_localFDR[k]
        if pFDR > 0.001:
            if k == 0:
                threshold2 = -sorted_data_vector[k]
            else:
                threshold2 = -sorted_data_vector[k - 1]
            flag = 1

    # Rescale the thresholds using the data mean and std
    threshold1 = np.multiply(threshold1, scaling_factor) + mean_factor
    threshold2 = np.multiply(threshold2, scaling_factor) + mean_factor
    print(threshold1)
    print(threshold2)

    return threshold1, threshold2, Model
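# A self-contained sketch (an illustrative re-implementation, not the
# original toolbox code) of the positive-tail pFDR scan above: walking down
# the sorted data, pFDR(k) = p0 * (1 - F0(x_(k))) / ((k + 1) / n) under a
# Gaussian null F0, and the threshold is the last value whose estimated pFDR
# stayed below alpha. The function name and defaults are assumptions.
import numpy as np
from scipy.stats import norm


def pfdr_threshold(data, p0, mu0, sigma0, alpha=0.001):
    """Return the positive-tail threshold at the given pFDR level."""
    x = np.sort(data)[::-1]  # descending order
    n = x.shape[0]
    for k in range(n):
        survival = 1.0 - norm.cdf(x[k], mu0, sigma0)  # null tail probability
        pfdr = p0 * survival / ((k + 1) / n)
        if pfdr > alpha:
            # First point that fails: keep the previous (stricter) value
            return x[k] if k == 0 else x[k - 1]
    return x[-1]  # everything passed: threshold at the smallest value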
def scores(key, paths, config, as_dataframe=False):
    import mapreduce
    print(key)
    # The path count must match either the inner- or the outer-CV fold count
    if (len(paths) != NFOLDS_INNER) and (len(paths) != NFOLDS_OUTER):
        print("Failed for key %s" % key)
        return None
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  # area under the ROC curve

    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknown_prob = binom_test(success[0], s[0], 0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5,
                                             alternative='greater')
    pvalue_recall_mean = binom_test(success[0] + success[1], s[0] + s[1],
                                    p=0.5, alternative='greater')

    # Betas' measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :]
                       for item in values]).T

    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # Back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # Threshold betas to compute Fleiss' kappa and Dice
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])
        ])
        # Compute Fleiss' kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pairwise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except Exception:
        dice_bar = fleiss_kappa_stat = 0

    scores = OrderedDict()
    scores['key'] = key
    try:
        a, l1, l2, tv = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0:
            left = 1.
        scores['l1_ratio'] = float(l1) / left
    except Exception:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores["auc"] = auc
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknown_prob_one_sided'] = \
        pvalue_recall0_unknown_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = \
        pvalue_recall1_unknown_prob
    scores['pvalue_recall_mean'] = pvalue_recall_mean
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
        float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())],
                              columns=list(scores.keys()))
    return scores
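# A compact sketch (illustrative, not part of the original script) of the
# Fisher z-transform averaging applied above to the pairwise correlations
# between fold-wise beta maps; np.arctanh(r) equals (1/2) * log((1+r)/(1-r)).
import numpy as np


def mean_correlation_fisher(betas):
    """Average pairwise Pearson correlations of rows via Fisher's z."""
    R = np.corrcoef(betas)               # (n_folds, n_folds) correlations
    r = R[np.triu_indices_from(R, k=1)]  # strict upper triangle
    z_bar = np.mean(np.arctanh(r))       # transform, then average
    return np.tanh(z_bar)                # back-transform to an r value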