def ori_correc(self, overwrite=False):
    # Outputs corrected orientations based on the input directions.
    # Expects two arrays of numbers (orientations and directions).
    correc = convert180to360(self.orientations)  # Put orientations in a 360-degree format
    flipped = flip(correc)  # Flip orientations 180 degrees
    minus = flip(correc, -90)
    plus = flip(correc, 90)

    # Calculate the correlation for the 4 possibilities
    correl = dict()
    correl['flipped'] = ma.corrcoef(ma.masked_invalid(flipped),
                                    ma.masked_invalid(self.directions))[1, 0]
    correl['minus'] = ma.corrcoef(ma.masked_invalid(minus),
                                  ma.masked_invalid(self.directions))[1, 0]
    correl['plus'] = ma.corrcoef(ma.masked_invalid(plus),
                                 ma.masked_invalid(self.directions))[1, 0]
    correl['correc'] = ma.corrcoef(ma.masked_invalid(correc),
                                   ma.masked_invalid(self.directions))[1, 0]

    # Get the option with the highest correlation
    opti = max(correl.items(), key=operator.itemgetter(1))[0]

    # Output the best corrected orientations
    if opti == 'flipped':
        print("Orientations have been corrected and flipped 180 degrees.")
        print("Correlation is", correl['flipped'])
        if overwrite:
            self.orientations = flipped
        else:
            return flipped
    elif opti == 'minus':
        print("Orientations have been corrected and flipped -90 degrees.")
        print("Correlation is", correl['minus'])
        if overwrite:
            self.orientations = minus
        else:
            return minus
    elif opti == 'plus':
        print("Orientations have been corrected and flipped +90 degrees.")
        print("Correlation is", correl['plus'])
        if overwrite:
            self.orientations = plus
        else:
            return plus
    elif opti == 'correc':
        print("Orientations have been corrected only.")
        print("Correlation is", correl['correc'])
        if overwrite:
            self.orientations = correc
        else:
            return correc
def pears_corr_obs(corr_obs_ts, corr_ts, use_log):
    """
    Pearson correlation of modeled and observed damages.

    Parameters
    ----------
    corr_obs_ts : pd.Series
        observed damages (``.replace`` is used below, so a pandas Series is expected)
    corr_ts : np.array
        damages to be correlated
    use_log : bool
        if True, correlate in log space

    Returns
    -------
    np.ma.MaskedArray
        2x2 correlation matrix
    """
    if use_log:
        a = ma.masked_invalid(np.log10(corr_obs_ts).replace([-np.inf, np.inf],
                                                            [np.nan, np.nan]))
        b = ma.masked_invalid(np.log10(corr_ts))
    else:
        a = ma.masked_invalid(corr_obs_ts.replace([-np.inf, np.inf],
                                                  [np.nan, np.nan]))
        b = ma.masked_invalid(corr_ts)
    msk = (~a.mask & ~b.mask)
    corrcoef = ma.corrcoef(a[msk], b[msk])
    # corrcoef = stats.spearmanr(a[msk], b[msk])
    return corrcoef
def do_analysis(medians_df, voa_pred_df, p533_pred_df):
    # Build a masked array of median values for each hour. The mask hides missing UTC values.
    medians_ma = ma.masked_values(
        [medians_df['median_pwr'].get(utc, 1.e20) for utc in range(0, 24)],
        1.e20)
    #print(type(medians_ma))
    p533_corr = ma.corrcoef(medians_ma, np.array(p533_pred_df['rx_pwr']))[0, 1]
    p533_rmse = get_rmse(medians_ma, np.array(p533_pred_df['rx_pwr']))
    voa_corr = ma.corrcoef(medians_ma, np.array(voa_pred_df['rx_pwr']))[0, 1]
    voa_rmse = get_rmse(medians_ma, np.array(voa_pred_df['rx_pwr']))

    voacap_residuals = voa_pred_df['rx_pwr'].subtract(medians_df['median_pwr'])
    p533_residuals = p533_pred_df['rx_pwr'].subtract(medians_df['median_pwr'])

    # OR the mask with a mask for prob muf <= 0.03 (1 day) (True = value is masked.)
    medians_ma.mask = medians_ma.mask | [
        False if x > 0.03 else True for x in voa_pred_df['muf_day'].tolist()
    ]
    muf_day = np.array(voa_pred_df['muf_day'].tolist())

    p533_corr_gt_1d = ma.corrcoef(medians_ma, np.array(p533_pred_df['rx_pwr']))[0, 1]
    p533_rmse_gt_1d = get_rmse(medians_ma, np.array(p533_pred_df['rx_pwr']))
    voa_corr_gt_1d = ma.corrcoef(medians_ma, np.array(voa_pred_df['rx_pwr']))[0, 1]
    voa_rmse_gt_1d = get_rmse(medians_ma, np.array(voa_pred_df['rx_pwr']))

    #p533_residuals_gt_1d = p533_pred_df['rx_pwr'].subtract(medians_df['median_pwr'])
    voacap_residuals_gt_1d = np.ma.masked_where(muf_day <= 0.03, voacap_residuals).compressed()
    #voacap_residuals_gt_1d = voa_pred_df['rx_pwr'].subtract(medians_df['median_pwr'])
    p533_residuals_gt_1d = np.ma.masked_where(muf_day <= 0.03, p533_residuals).compressed()

    voa_residual_mean_gt_1d = np.mean(voacap_residuals_gt_1d)
    voa_residual_sd_gt_1d = np.std(voacap_residuals_gt_1d)
    p533_residual_mean_gt_1d = np.mean(p533_residuals_gt_1d)
    p533_residual_sd_gt_1d = np.std(p533_residuals_gt_1d)
    #print(voacap_residuals)

    return ({
        "p533_rmse": p533_rmse,
        "p533_corr": p533_corr,
        "voa_rmse": voa_rmse,
        "voa_corr": voa_corr,
        "p533_rmse_gt_1d": p533_rmse_gt_1d,
        "p533_corr_gt_1d": p533_corr_gt_1d,
        "voa_rmse_gt_1d": voa_rmse_gt_1d,
        "voa_corr_gt_1d": voa_corr_gt_1d
    }, voacap_residuals, p533_residuals, voacap_residuals_gt_1d,
            p533_residuals_gt_1d)
def crosscorrcoef(x, y):
    """Wrap numpy.ma.corrcoef, which handles missing data, and return the
    cross-correlation element of the resulting matrix."""
    # return the off-diagonal (cross-correlation) element
    return corrcoef(x, y)[0, 1]
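# Hypothetical usage sketch (not from the original source). The docstring above
# says ``corrcoef`` is numpy.ma.corrcoef, so NaNs can be handled by masking them first:
import numpy as np
import numpy.ma as ma

x = ma.masked_invalid(np.array([0.1, 0.4, np.nan, 0.9]))
y = ma.masked_invalid(np.array([0.2, 0.5, 0.7, 1.0]))
print(crosscorrcoef(x, y))  # off-diagonal element of the 2x2 correlation matrix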
def autocorr_sA_sB(self, sep=1):
    """Compute the autocorrelation across windows separated by a distance.

    NOTE: this is meant to be a faster alternative to the monte-carlo
    sampling approach.
    """
    assert self.chrom_total_dict is not None
    corrs = []
    rec_dists = []
    for c in self.chrom_total_dict:
        # Grab the current version of the data for this chromosome
        cur_tot_data = self.chrom_total_dict[c]
        win_vars = cur_tot_data[0, :]
        rec_midpts = cur_tot_data[2, :]
        mask_weights = cur_tot_data[3, :]
        # Weight according to a mask - if it exists
        x = mask_weights * win_vars
        # Compute the empirical correlation here
        x1s = x[sep:]
        x2s = x[:-sep]
        # Setting the mask here
        a = ma.masked_invalid(x1s)
        b = ma.masked_invalid(x2s)
        corr_est = ma.corrcoef(a, b)[0, 1]
        # Should it be the mean or something else here?
        rec_dist_mean = np.nanmean(rec_midpts[sep:] - rec_midpts[:-sep])
        corrs.append(corr_est)
        rec_dists.append(rec_dist_mean)
    corrs = np.array(corrs, dtype=np.float32)
    rec_dists = np.array(rec_dists, dtype=np.float32)
    return (rec_dists, corrs)
def correlate_all(M):
    """Return the all-pairs correlation matrix.

    Note: dot-product optimizations cannot be readily applied because each
    pair discards a different set of missing values.
    """
    return ma.corrcoef(M)
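# Hypothetical usage sketch (not from the original source): rows of M are the
# variables, and invalid cells are masked so they are dropped pairwise.
import numpy as np
import numpy.ma as ma

M = ma.masked_invalid(np.array([[1.0, 2.0, np.nan, 4.0],
                                [2.0, 4.1, 6.0, 8.2],
                                [np.nan, 1.0, 0.5, 0.2]]))
C = correlate_all(M)  # 3x3 masked correlation matrix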
def get_mutual_info(x: np.array, y: np.array, n_bins: int = None, normalize: bool = False) -> float:
    """
    Get mutual info score for x and y as described in
    https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes (p.16).

    :param x: (np.array) x vector.
    :param y: (np.array) y vector.
    :param n_bins: (int) number of bins for discretization; if None, the number of bins is
        derived from the correlation coefficient.
    :param normalize: (bool) True to normalize the result to [0, 1].
    :return: (float) mutual info score.
    """
    good_indices = ~(np.isnan(x) | np.isnan(y) | np.isinf(x) | np.isinf(y))
    if n_bins is None:
        import numpy.ma as ma
        corr_coef = ma.corrcoef(ma.masked_invalid(x), ma.masked_invalid(y))[0][1]
        n_bins = get_optimal_number_of_bins(x[good_indices].shape[0], corr_coef=corr_coef)

    contingency = np.histogram2d(x[good_indices], y[good_indices], n_bins)[0]
    mutual_info = mutual_info_score(None, None, contingency=contingency)  # Mutual information
    if normalize is True:
        marginal_x = ss.entropy(np.histogram(x[good_indices], n_bins)[0])  # Marginal for x
        marginal_y = ss.entropy(np.histogram(y[good_indices], n_bins)[0])  # Marginal for y
        mutual_info /= min(marginal_x, marginal_y)
    return mutual_info
def corr_coefficient(predictions, targets, bias=False):
    """Calculates the correlation coefficient (the 'r' in 'r-squared') between two series.

    For time series where the targets are serially correlated and may span only a
    fraction of the natural variability, this statistic may not be appropriate, and
    Murphy (1988) explains why caution should be exercised in using it.

    Parameters
    ----------
    predictions, targets : array_like
        Time series to analyze
    bias : boolean
        Whether to use the biased (N) or unbiased (N-1) sample size for normalization.
        Note: recent versions of numpy.ma.corrcoef ignore ``bias``, so it has no effect.

    Returns
    -------
    r : float
        Correlation coefficient
    """
    from numpy.ma import corrcoef
    y = numpy.ma.masked_invalid(predictions)
    z = numpy.ma.masked_invalid(targets)
    # Passing ``bias`` positionally would set ``rowvar`` instead, so it is omitted here.
    return corrcoef(y, z)[0][1]
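# Hypothetical usage sketch (not from the original source): NaNs in either
# series are masked before the correlation is computed.
import numpy
predictions = numpy.array([1.0, 2.0, numpy.nan, 4.0])
targets = numpy.array([1.2, 1.9, 3.1, 3.8])
print(corr_coefficient(predictions, targets))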
def compute_hess_corr(eigval_col, eigvec_col, fdnm=""):
    posN = len(eigval_col)
    T0 = time()
    corr_mat_log = np.zeros((posN, posN))
    corr_mat_lin = np.zeros((posN, posN))
    for eigi in tqdm(range(posN)):
        for eigj in range(posN):
            eva_i, evc_i = eigval_col[eigi], eigvec_col[eigi]  # torch.from_numpy(eigvect_j).cuda()
            eva_j, evc_j = eigval_col[eigj], eigvec_col[eigj]  # torch.from_numpy(eigval_j).cuda()
            inpr = evc_i.T @ evc_j
            vHv_ij = np.diag((inpr * eva_j[np.newaxis, :]) @ inpr.T)
            corr_mat_log[eigi, eigj] = ma.corrcoef(
                ma.masked_invalid(np.log10(vHv_ij)),
                ma.masked_invalid(np.log10(eva_j)))[0, 1]
            corr_mat_lin[eigi, eigj] = np.corrcoef(vHv_ij, eva_j)[0, 1]
            # corr_mat_log[eigi, eigj] = corr_nan_torch(vHv_ij.log10(), eva_j.log10())
            # corr_mat_lin[eigi, eigj] = corr_nan_torch(vHv_ij, eva_j)
            # vHv_ij = np.diag(eigvec_col[eigi].T @ H_col[eigj] @ eigvec_col[eigi])
    print("%.1f sec" % (time() - T0))  # 582.2 secs for the 1000 by 1000 mat. not bad!
    np.savez(join(figdir, "Hess_%s_corr_mat.npz" % fdnm),
             corr_mat_log=corr_mat_log, corr_mat_lin=corr_mat_lin)
    return corr_mat_log, corr_mat_lin
def get_pearson(pred, climdat):
    """
    Pearson correlation of model predicted data and damage time series.

    Parameters
    ----------
    pred : GLM model
    climdat : np.array
        damage time series

    Returns
    -------
    float
        Pearson correlation coefficient
    """
    a = ma.masked_invalid(climdat)
    b = ma.masked_invalid(pred.predict())
    msk = (~a.mask & ~b.mask)
    corrcoef = ma.corrcoef(a[msk], b[msk])
    # corrcoef = stats.spearmanr(a[msk], b[msk])
    return corrcoef[0, 1]
def gen_binned_estimates(rec_dists, s1, s2, **kwargs):
    """Get binned estimates of the correlation in segregating sites."""
    _, bins = np.histogram(rec_dists, **kwargs)
    bin_idx = np.digitize(rec_dists, bins)
    bin_idx = bin_idx - 1
    # Setting up the accumulator vectors here
    rec_rate_mean = np.zeros(np.max(bin_idx))
    rec_rate_se = np.zeros(np.max(bin_idx))
    corr_s1_s2 = np.zeros(np.max(bin_idx))
    se_r = np.zeros(np.max(bin_idx))
    for i in range(np.max(bin_idx)):
        cur_idx = bin_idx == i
        n_pairs = np.sum(cur_idx)
        cur_rec_rates = rec_dists[cur_idx]
        rec_rate = np.nanmean(cur_rec_rates)
        se_rec_rate = np.nanstd(cur_rec_rates)
        # TODO : take concatenation of the masks
        corr_s1_s2_cur = ma.corrcoef(
            ma.masked_invalid(s1[cur_idx]), ma.masked_invalid(s2[cur_idx])
        )[0, 1]
        se_r_cur = np.sqrt((1.0 - corr_s1_s2_cur ** 2) / (n_pairs - 2))
        # Set the accumulators to return
        rec_rate_mean[i] = rec_rate
        rec_rate_se[i] = se_rec_rate
        corr_s1_s2[i] = corr_s1_s2_cur
        se_r[i] = se_r_cur
    # Return the different accumulators as we have them
    return (rec_rate_mean, rec_rate_se, corr_s1_s2, se_r)
def _get_corr_arr(self):
    corr_data = getattr(self.yarn_data, self.var_enum_)
    #corr_data = corr_data[prod( corr_data >= 0, axis=1, dtype=bool )]
    import numpy.ma as ma
    corr_data = ma.masked_array(corr_data, mask=self.yarn_data.mask_arr)
    # ma.corrcoef returns slightly different values than the plain numpy corrcoef
    #print ma.corrcoef( corr_data, rowvar=False, allow_masked=True, bias=False )
    return ma.corrcoef(corr_data, rowvar=False, allow_masked=True)
def r_rmse(obs_series, model_series):
    R = ma.corrcoef(ma.masked_invalid(obs_series),
                    ma.masked_invalid(model_series))
    # NOTE: the RMSE below drops NaNs from each series independently, so it
    # assumes the NaNs occur at the same positions in both series.
    x = obs_series[~np.isnan(obs_series)]
    y = model_series[~np.isnan(model_series)]
    rmse = np.sqrt(((y - x) ** 2).mean())
    format_R = float("{0:.2f}".format(R[0, 1]))
    format_rmse = float("{0:.2f}".format(rmse))
    return format_R, format_rmse
def match(dbz1, dbz2):
    """
    input:  two armor.pattern.DBZ objects
    output: just their correlation
    **** PROBLEM: need to resolve the grid problem with e.g. interpolation
    """
    size = dbz1.matrix.size
    return ma.corrcoef(dbz1.matrix.reshape(size), dbz2.matrix.reshape(size))[0, 1]
def nancorr(x, y):
    """
    r = nancorr(x, y)

    Calculate the correlation matrix, treating NaN values as missing data.
    """
    x_msk = ma.masked_invalid(x)
    y_msk = ma.masked_invalid(y)
    r = ma.corrcoef(x_msk, y_msk)
    return r
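# Hypothetical usage sketch (not from the original source): NaNs are masked so
# they do not propagate into the correlation.
import numpy as np
x = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
y = np.array([1.1, 1.9, 3.2, 3.9, 5.1])
r = nancorr(x, y)   # 2x2 masked correlation matrix
print(r[0, 1])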
def test_crossval_Melanoma():
    """ Tests the cross-val function that creates the train and test data. """
    data = ImportMelanoma().to_numpy()
    train_X, test_X = split_data(data)
    full_X = impute(train_X)
    print(
        ma.corrcoef(ma.masked_invalid(full_X.flatten()),
                    ma.masked_invalid(test_X.flatten())))
def make_scatterplot_heights(preds, lbls, preds_horavg, lbls_horavg, heights, component, time_step):
    #NOTE: ``component`` is a string indicating the name of the component being plotted.
    for k in range(len(heights) + 1):
        if k == len(heights):
            preds_height = preds_horavg[:] / (utau_ref**2.)
            lbls_height = lbls_horavg[:] / (utau_ref**2.)
        else:
            preds_height = preds[k, :, :] / (utau_ref**2.)
            lbls_height = lbls[k, :, :] / (utau_ref**2.)

        preds_height = preds_height.flatten()
        lbls_height = lbls_height.flatten()

        #Make scatterplots of Smagorinsky/CNN fluxes versus labels
        corrcoef = np.round(
            ma.corrcoef(preds_height, lbls_height)[0, 1], 3
        )  #Calculate, extract, and round off the Pearson correlation coefficient from the correlation matrix
        plt.figure()
        plt.scatter(lbls_height, preds_height, s=6, marker='o', alpha=0.2)
        if k == len(heights):
            #plt.xlim([-0.004, 0.004])
            #plt.ylim([-0.004, 0.004])
            #plt.xlim([-0.000004, 0.000004])
            #plt.ylim([-0.000004, 0.000004])
            plt.xlim([-2.0, 2.0])
            plt.ylim([-2.0, 2.0])
        else:
            plt.xlim([-2.0, 2.0])
            plt.ylim([-2.0, 2.0])
            #plt.xlim([-15.0, 15.0])
            #plt.ylim([-15.0, 15.0])
            #plt.xlim([-40.0, 40.0])
            #plt.ylim([-40.0, 40.0])
            #plt.xlim([-0.0005, 0.0005])
            #plt.ylim([-0.0005, 0.0005])
        axes = plt.gca()
        plt.plot(axes.get_xlim(), axes.get_ylim(), 'b--')
        #plt.gca().set_aspect('equal',adjustable='box')
        plt.xlabel(r'$\rm \frac{\tau_{wu}^{DNS}}{u_{\tau}^2} \,\ {[-]}$', fontsize=20)
        plt.ylabel(r'$\rm \frac{\tau_{wu}^{smag}}{u_{\tau}^2} \,\ {[-]}$', fontsize=20)
        #plt.title("ρ = " + str(corrcoef),fontsize = 20)
        plt.axhline(c='black')
        plt.axvline(c='black')
        plt.xticks(fontsize=16, rotation=90)
        plt.yticks(fontsize=16, rotation=0)
        plt.tight_layout()  # apply the layout before saving so it affects the written file
        if k == len(heights):
            plt.savefig("Scatter_Smagorinsky_tau_" + component + "_horavg.png", dpi=200)
        else:
            plt.savefig("Scatter_Smagorinsky_tau_" + component + "_" + str(heights[k]) + ".png", dpi=200)
        plt.close()
def squared_angular_distance(x: np.array, y: np.array) -> float:
    """
    Returns a modification of angular distance where the square of the correlation
    coefficient is used.

    :param x: (np.array) X vector.
    :param y: (np.array) Y vector.
    :return: (float) squared angular distance.
    """
    corr_coef = ma.corrcoef(ma.masked_invalid(x), ma.masked_invalid(y))[0][1]
    return np.sqrt(0.5 * (1 - corr_coef ** 2))
def repeatImputation(data, linear=False, numIter=20):
    """ Repeat imputation and calculate the average correlation coefficient over numIter iterations. """
    coefs = []
    for _ in range(numIter):
        train_X, test_X = split_data(data)
        full_X = impute(train_X, linear)
        corr_coef = ma.corrcoef(ma.masked_invalid(full_X.flatten()),
                                ma.masked_invalid(test_X.flatten()))
        coefs.append(corr_coef[0][1])
    print(f"average corr coef: {sum(coefs)/len(coefs)}")
    return coefs
def angular_distance(x: np.array, y: np.array) -> float:
    """
    Returns angular distance between two vectors. Angular distance is a slight modification
    of correlation which satisfies metric conditions.

    :param x: (np.array) X vector.
    :param y: (np.array) Y vector.
    :return: (float) angular distance.
    """
    corr_coef = ma.corrcoef(ma.masked_invalid(x), ma.masked_invalid(y))[0][1]
    return np.sqrt(0.5 * (1 - corr_coef))
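# Hypothetical usage sketch (not from the original source): the distance is near 0
# for strongly correlated vectors and near 1 for strongly anti-correlated ones,
# which is what makes it usable in distance-based clustering.
import numpy as np
x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([1.1, 2.0, 2.9, 4.2])   # strongly, but not perfectly, correlated
print(angular_distance(x, y))        # close to 0
print(angular_distance(x, -y))       # close to 1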
def absolute_angular_distance(x: np.array, y: np.array) -> float:
    """
    Returns a modification of angular distance where the absolute value of the correlation
    coefficient is used.

    :param x: (np.array) X vector.
    :param y: (np.array) Y vector.
    :return: (float) absolute angular distance.
    """
    corr_coef = ma.corrcoef(ma.masked_invalid(x), ma.masked_invalid(y))[0][1]
    return np.sqrt(0.5 * (1 - abs(corr_coef)))
def ccf(x, y, lags):
    x = x - x.mean()  # remove mean
    y = y - y.mean()
    if type(lags) is int:
        lags = range(lags)
    C = ma.zeros((len(lags), 1))
    for i, l in enumerate(lags):
        if l == 0:
            C[i] = 1
        else:
            C[i] = ma.corrcoef(x[:-l], y[l:])[0, 1]
    return C
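# Hypothetical usage sketch (not from the original source): y is a delayed copy
# of x, so the cross-correlation peaks at that lag.
import numpy as np
rng = np.random.default_rng(0)
x = rng.normal(size=500)
y = np.r_[np.zeros(5), x[:-5]]   # y is x delayed by 5 samples
C = ccf(x, y, lags=10)
print(C[5])                      # close to 1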
def maxcorr(x, y, **options):
    """
    (rmax, lag, ind) = maxcorr(x, y, maxlag=int(len(x)/4))

    Calculate the maximum lagged correlation between two 1D arrays.

    Inputs:
    x, y are 1D arrays

    Options:
    'maxlag' the maximum number of lagged correlations to calculate
             (default: 1/4 of array length)

    Output:
    rmax is the correlation coefficient with the maximum absolute value
    lag is the lag of the maximum correlation (positive: y lags x)
    ind is the index of the maximum correlation in the array of lags
    """
    nrows = len(x)
    maxlag = int(np.floor(nrows / 4))
    if 'maxlag' in options:
        maxlag = options['maxlag']
    # use masked arrays (mask NaNs)
    x = ma.masked_invalid(x)
    y = ma.masked_invalid(y)
    lags = np.arange(-maxlag, maxlag + 1)
    rs = np.zeros(np.shape(lags))
    for ni, lag in enumerate(lags):
        lag = lags[ni]
        if lag < 0:
            rs[ni] = ma.corrcoef(x[-lag:], y[:lag])[0, 1]
        elif lag > 0:
            rs[ni] = ma.corrcoef(x[:-lag], y[lag:])[0, 1]
        else:
            rs[ni] = ma.corrcoef(x, y)[0, 1]
    ind = ma.argmax(np.abs(rs))
    rmax = rs[ind]
    lag = lags[ind]
    return (rmax, lag, ind)
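# Hypothetical usage sketch (not from the original source): y is x delayed by
# 3 samples, so the maximum correlation should occur at a positive lag of 3
# (positive lag means y lags x, as documented above).
import numpy as np
t = np.arange(200)
x = np.sin(2 * np.pi * t / 50.0)
y = np.roll(x, 3)
rmax, lag, ind = maxcorr(x, y, maxlag=10)
print(rmax, lag)   # rmax close to 1, lag == 3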
def rm_pears_corr_obs(corr_obs_ts, corr_ts, use_log):
    """
    Pearson correlation of modeled and observed damages, applying a running mean (3yr) first.

    Parameters
    ----------
    corr_obs_ts : np.array
        observed damages
    corr_ts : np.array
        damages to be correlated
    use_log : bool
        if True, correlate in log space

    Returns
    -------
    np.ma.MaskedArray
        2x2 correlation matrix
    """
    rm_obs = runmean(np.array(corr_obs_ts), 1)
    rm_ts = runmean(np.array(corr_ts), 1)
    if use_log:
        a = ma.masked_invalid(np.log10(rm_obs).replace([-np.inf, np.inf],
                                                       [np.nan, np.nan]))
        b = ma.masked_invalid(np.log10(rm_ts))
    else:
        a = ma.masked_invalid(rm_obs)
        b = ma.masked_invalid(rm_ts)
    msk = (~a.mask & ~b.mask)
    corrcoef = ma.corrcoef(a[msk], b[msk])
    # corrcoef = stats.spearmanr(a[msk], b[msk])
    return corrcoef
def normalised_corr(dataFrame, tot_mod_dam, tot_pred_dam):
    """
    This function adjusts for vulnerability, applying a GDP fit, either in the log space
    or the linear space. All relevant columns are normalised before they are correlated.

    Parameters
    ----------
    dataFrame : DataFrame
        damage data with the 'Impact_Pred', 'Impact_2y_Flopros' and
        'natcat_flood_damages_2005_CPI' columns
    tot_mod_dam : float
        total modeled damages
    tot_pred_dam : float
        total predicted damages

    Returns
    -------
    pred_corrcoef, mod_corrcoef : np.ma.MaskedArray
        correlation matrices of the normalised predicted and modeled damages
        against the normalised NatCat damages
    """
    facE = tot_pred_dam / tot_mod_dam
    facV = tot_pred_dam / tot_pred_dam
    facNatCat = tot_pred_dam / dataFrame['natcat_flood_damages_2005_CPI'].sum()
    pred_norm = dataFrame['Impact_Pred'] * facV
    mod_norm = dataFrame['Impact_2y_Flopros'] * facE
    natCat_norm = dataFrame['natcat_flood_damages_2005_CPI'] * facNatCat

    a = ma.masked_invalid(natCat_norm.replace([-np.inf, np.inf], [np.nan, np.nan]))
    b = ma.masked_invalid(pred_norm)
    msk = (~a.mask & ~b.mask)
    pred_corrcoef = ma.corrcoef(a[msk], b[msk])

    a = ma.masked_invalid(natCat_norm.replace([-np.inf, np.inf], [np.nan, np.nan]))
    b = ma.masked_invalid(mod_norm)
    msk = (~a.mask & ~b.mask)
    mod_corrcoef = ma.corrcoef(a[msk], b[msk])

    return pred_corrcoef, mod_corrcoef
def apply_pca_pearson(client_trace, server_trace):
    """
    Applies PCA to the input data and compares the transformed data via Pearson correlation.
    """
    coeffs = []
    n_components = 3
    for feature in xrange(0, len(client_trace[0])):
        tmp_coeff = 0
        try:
            pca = decomposition.PCA(n_components)
            pca.fit(client_trace)
            client_pca = pca.transform(client_trace)
            pca.fit(server_trace)
            server_pca = pca.transform(server_trace)
        except Exception as err:
            print 'Problem applying PCA: ', err
        try:
            for i in xrange(0, n_components):
                shrinked_client = client_pca[0:1000]
                shrinked_server = server_pca[0:1000]
                shrinked_client = [row[i] for row in shrinked_client]
                shrinked_server = [row[i] for row in shrinked_server]
                limitation = min(len(shrinked_client), len(shrinked_server))
                shrinked_client = shrinked_client[0:limitation]
                shrinked_server = shrinked_server[0:limitation]
                cor = corrcoef(transpose(shrinked_client), transpose(shrinked_server))
                correlation_coefficient = abs(cor.data[1][0])
                if correlation_coefficient > tmp_coeff:
                    tmp_coeff = correlation_coefficient
        except Exception as err:
            print 'Error applying PCA-Pearson: ', err
        coeffs.append(tmp_coeff)
    return coeffs
def test_corrcoef(self):
    r = ma.masked_equal(np.load("data/ml-1m/rating.npy"), 0)
    # sim = ma.corrcoef(r[0], r[2412])
    # print(sim)
    # print(np.corrcoef(r[0].filled(0), r[2412].filled(0)))
    sim2 = ma.corrcoef(ma.vstack([r[0], r[2412]]))
    print(sim2)
    print(ma.dot(r[0], r[2412]) / math.sqrt(ma.dot(r[0], r[0])) / math.sqrt(ma.dot(r[2412], r[2412])))
    r0_m = r[0] - ma.mean(r[0])
    r1_m = r[2412] - ma.mean(r[2412])
    print(ma.dot(r0_m, r1_m) / math.sqrt(ma.dot(r0_m, r0_m)) / math.sqrt(ma.dot(r1_m, r1_m)))
def predict_corr(self, *views: Tuple[np.ndarray, ...], **kwargs) -> np.ndarray:
    """
    Predicts the correlation for the given data using the fitted model.

    :param views: numpy arrays with the same number of rows (samples) separated by commas
    :param kwargs: any additional keyword arguments required by the given model
    :return: all_corrs: an array of the pairwise correlations (k, k, self.latent_dims)
        where k is the number of views
    :rtype: np.ndarray
    """
    # Takes the views and predicts their out-of-sample correlation using the trained model
    transformed_views = self.transform(*views, **kwargs)
    all_corrs = []
    for x, y in itertools.product(transformed_views, repeat=2):
        all_corrs.append(np.diag(ma.corrcoef(x.T, y.T)[:self.latent_dims, self.latent_dims:]))
    all_corrs = np.array(all_corrs).reshape((len(views), len(views), self.latent_dims))
    return all_corrs
def masked_corrcoef2d(arr1, arr2):
    """
    Correlation coefficient of two 2-dimensional masked arrays.

    Parameters
    ----------
    arr1 : np.array
        2D array.
    arr2 : np.array
        2D array.

    See also
    --------
    numpy.corrcoef : NumPy corrcoef function.
    numpy.ma : NumPy mask module.

    Returns
    -------
    corr : np.array
        correlation coefficient from np.corrcoef.

    Example
    --------
    >>> import numpy.ma as ma
    >>> a = np.reshape(np.arange(10), (2,5))
    >>> v = np.reshape(np.arange(10), (2,5))
    >>> mask = np.zeros((2, 5), dtype=bool)
    >>> mask[1:, 3:] = True
    >>> v = ma.masked_array(v, mask=mask)
    >>> print(v)
    [[0 1 2 3 4]
     [5 6 7 -- --]]
    >>> masked_corrcoef2d(a, v)
    masked_array(data =
     [[1.0 1.0]
     [1.0 1.0]],
                 mask =
     [[False False]
     [False False]],
           fill_value = 1e+20)
    <BLANKLINE>
    """
    import numpy.ma as ma
    a_ = np.reshape(arr1, (1, arr1.size))
    v_ = np.reshape(arr2, (1, arr2.size))
    corr = ma.corrcoef(a_, v_)
    return corr
def apply_pearson(client_trace, server_trace):
    coeffs = []
    for feature in xrange(0, len(client_trace[0])):
        feature_client = [row[feature] for row in client_trace]
        feature_server = [row[feature] for row in server_trace]
        limitation = min(len(feature_client), len(feature_server))
        feature_client = feature_client[0:limitation]
        feature_server = feature_server[0:limitation]
        try:
            cor = corrcoef(feature_client, feature_server)
            correlation_coefficient = abs(cor.data[1][0])
            coeffs.append(correlation_coefficient)
        except Exception as err:
            print 'Error applying Pearson: ', err
            coeffs.append(0)
    return coeffs
def plotMinFFD(df):
    from statsmodels.tsa.stattools import adfuller
    import numpy.ma as ma
    out = pd.DataFrame(
        columns=['adfStat', 'pVal', 'lags', 'nObs', '95% conf', 'corr'])
    for d in np.linspace(0, 1, 21):
        df1 = np.log(df[['Close']]).resample('1D').last()  # resample to daily observations
        df2 = fracDiff(df1, d, thres=.01)
        corr = ma.corrcoef(ma.masked_invalid(df1.loc[df2.index, 'Close']),
                           ma.masked_invalid(df2['Close']))[0, 1]
        df2 = adfuller(df2['Close'], maxlag=1, regression='c', autolag=None)
        out.loc[d] = list(df2[:4]) + [df2[4]['5%']] + [corr]  # append the critical values
    out[['adfStat', 'corr']].plot(secondary_y='adfStat')
    plt.axhline(out['95% conf'].mean(), linewidth=1, color='r', linestyle='dotted')
    plt.show()
    return out
def acf(x, lags=500, exclude=None):
    if exclude is None:
        exclude = np.zeros(x.shape)
    exclude = np.cumsum(exclude.astype(int))  # from stackexchange
    x = x - x.mean()  # remove mean
    if type(lags) is int:
        lags = range(lags)
    C = ma.zeros((len(lags),))
    for i, l in enumerate(lags):
        if l == 0:
            C[i] = 1
        else:
            x0 = x[:-l].copy()
            x1 = x[l:].copy()
            reject = (exclude[l:] - exclude[:-l]) > 0
            x0[reject] = ma.masked
            x1[reject] = ma.masked
            C[i] = ma.corrcoef(x0, x1)[0, 1]
    return C
def KGEglobal(s, o):
    warnings.filterwarnings("ignore", message="divide by zero encountered")
    warnings.filterwarnings("ignore", message="invalid value encountered")
    warnings.filterwarnings("ignore", message="Mean of empty slice")
    warnings.filterwarnings("ignore", message="Degrees of freedom")

    B = np.nanmean(s, axis=0) / np.nanmean(o, axis=0)
    pbias = np.nansum((s - o), axis=0) / np.nansum(o, axis=0)
    y = (np.nanstd(s, axis=0) / np.nanmean(s, axis=0)) / (
        np.nanstd(o, axis=0) / np.nanmean(o, axis=0))
    NS = 1 - np.nansum((s - o)**2, axis=0) / np.nansum(
        (o - np.nanmean(o, axis=0))**2, axis=0)

    r = np.empty(s.shape[1])
    for i in range(s.shape[1]):
        s1 = ma.masked_invalid(s[:, i])
        o1 = ma.masked_invalid(o[:, i])
        msk = (~o1.mask & ~s1.mask)
        r[i] = ma.corrcoef(o1[msk], s1[msk]).data[0, 1]

    KGE = 1 - np.sqrt((r - 1)**2 + (B - 1)**2 + (y - 1)**2)
    return KGE, NS, r, pbias
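# Hypothetical usage sketch (not from the original source): s and o are
# (time, station) arrays of simulated and observed values; each station gets
# its own KGE, NSE, correlation and percent bias.
import numpy as np
rng = np.random.default_rng(1)
o = rng.normal(10.0, 2.0, size=(100, 3))
s = o + rng.normal(0.0, 0.5, size=(100, 3))
KGE, NS, r, pbias = KGEglobal(s, o)
print(KGE.shape)   # (3,)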
# Calculate BM x/y ratio.
bm_x = np.nanstd(x_list, axis=0)
bm_y = np.nanstd(y_list, axis=0)
bm_avr = bm_x / bm_y

# %%
column_fail_xy_ratio = [
    column for (column, bm) in enumerate(bm_avr)
    if bm < xy_ratio_min or bm > xy_ratio_max
]
corrcoefs = [
    abs(
        ma.corrcoef(ma.masked_invalid(x_list[:, aoi]),
                    ma.masked_invalid(y_list[:, aoi]))[0, 1])
    for aoi in range(valid_num_aoi)
]
column_fail_corrcoef = [
    column for (column, corrcoef) in enumerate(corrcoefs)
    if corrcoef > try_corrcoef
]
column_delete = sorted(list(set(column_fail_xy_ratio + column_fail_corrcoef)))
x_list = np.delete(x_list, column_delete, axis=1)
y_list = np.delete(y_list, column_delete, axis=1)
aoi_ids = np.delete(aoi_ids, column_delete)
valid_num_aoi = len(aoi_ids)
               y_array[j, :, :], delimiter=",")
else:
    np.savetxt("saved_data/x_" + str(t) + "_degree_warming_cmip5.csv",
               x_array[j, :, :], delimiter=",")
    np.savetxt("saved_data/y_" + str(t) + "_degree_warming_cmip5.csv",
               y_array[j, :, :], delimiter=",")

# saving the r coefficient for x_array and y_array at each temperature change
x_array_flatten = x_array[j, :, :]
x_array_flatten = x_array_flatten.flatten()
y_array_flatten = y_array[j, :, :]
y_array_flatten = y_array_flatten.flatten()

r_coeffient = ma.corrcoef(ma.masked_invalid(x_array_flatten),
                          ma.masked_invalid(y_array_flatten))
print('CMIP5 r-coefficient (all rcps)', t, r_coeffient)

if temperature_change_options[j] == 0.5:
    np.savetxt("saved_data/cmip5_xy_rcoefficient_05_degree_warming.csv",
               r_coeffient, delimiter=",")
else:
    np.savetxt("saved_data/cmip5_xy_rcoefficient_" + str(t) + "_degree_warming.csv",
               r_coeffient, delimiter=",")

# saving the observational derived constrained values
if temperature_change_options[j] == 0.5:
    np.savetxt("saved_data/obs_constraint_05_degree_warming_cmip5.csv",
               obs_array[j, :],
def Taylor_diag(series, names):
    """ Taylor Diagram : obs is reference data sample in a full diagram (0 --> npi)
    --------------------------------------------------------------------------
    Input: series - dict with all time series (lists) to analyze
    series[0] - is the observation, the reference by default.
    """
    from matplotlib.projections import PolarAxes
    corr, std = {}, {}
    for i in series.keys():
        corr[i] = ma.corrcoef(series[0], series[i])[1, 0]
        std[i] = ma.std(series[i]) / ma.std(series[0])
    ref = 1  # ma.std(series[0])
    #print corr

    rlocs = np.concatenate((np.arange(0, -10, -0.25), [-0.95, -0.99],
                            np.arange(0, 10, 0.25), [0.95, 0.99]))
    str_rlocs = np.concatenate((np.arange(0, 10, 0.25), [0.95, 0.99],
                                np.arange(0, 10, 0.25), [0.95, 0.99]))
    tlocs = np.arccos(rlocs)  # Conversion to polar angles
    gl1 = GF.FixedLocator(tlocs)  # Positions
    tf1 = GF.DictFormatter(dict(zip(tlocs, map(str, rlocs))))

    str_locs2 = np.arange(-10, 11, 0.5)
    tlocs2 = np.arange(-10, 11, 0.5)  # Conversion to polar angles
    g22 = GF.FixedLocator(tlocs2)
    tf2 = GF.DictFormatter(dict(zip(tlocs2, map(str, str_locs2))))

    tr = PolarAxes.PolarTransform()
    smin = 0
    smax = 2.5
    ghelper = FA.GridHelperCurveLinear(tr,
                                       extremes=(0, np.pi,  # 1st quadrant
                                                 smin, smax),
                                       grid_locator1=gl1,
                                       #grid_locator2=g11,
                                       tick_formatter1=tf1,
                                       tick_formatter2=tf2,
                                       )
    fig = plt.figure(figsize=(10, 5), dpi=100)
    ax = FA.FloatingSubplot(fig, 111, grid_helper=ghelper)
    fig.add_subplot(ax)

    ax.axis["top"].set_axis_direction("bottom")
    ax.axis["top"].toggle(ticklabels=True, label=True)
    ax.axis["top"].major_ticklabels.set_axis_direction("top")
    ax.axis["top"].label.set_axis_direction("top")
    ax.axis["top"].label.set_text("Correlation Coefficient")

    ax.axis["left"].set_axis_direction("bottom")
    ax.axis["left"].label.set_text("Standard Deviation")

    ax.axis["right"].set_axis_direction("top")
    ax.axis["right"].toggle(ticklabels=True, label=True)
    ax.axis["right"].set_visible(True)
    ax.axis["right"].major_ticklabels.set_axis_direction("bottom")
    #ax.axis["right"].label.set_text("Standard Deviation")

    ax.axis["bottom"].set_visible(False)
    ax.grid(True)

    ax = ax.get_aux_axes(tr)

    t = np.linspace(0, np.pi)
    r = np.zeros_like(t) + ref
    ax.plot(t, r, 'k--', label='_')

    rs, ts = np.meshgrid(np.linspace(smin, smax), np.linspace(0, np.pi))
    rms = np.sqrt(ref**2 + rs**2 - 2 * ref * rs * np.cos(ts))
    CS = ax.contour(ts, rs, rms, cmap=cm.bone)
    plt.clabel(CS, inline=1, fontsize=10)

    ax.plot(np.arccos(0.9999), ref, 'k', marker='*', ls='', ms=10)
    aux = range(1, len(corr))
    #del aux[ref]

    colors = plt.matplotlib.cm.jet(np.linspace(0, 1, len(corr)))
    for i in aux:
        ax.plot(np.arccos(corr[i]), std[i], c=colors[i], alpha=0.7,
                marker='o', label="%s" % names[i])
        ax.text(np.arccos(corr[i]), std[i], "%s" % i)
    legend(bbox_to_anchor=(1.5, 1), prop=dict(size='large'), loc='best')
    plt.savefig('example.png', dpi=500)
    return
def process(t, matrix_name, normalization, order, iterations, exposant, gaussian_number, convolution_sigma):
    s = np.copy(t)
    mat = s
    if matrix_name != "raw":
        print "Normalizing with "+str(normalization)+" norm..."
        if normalization == "fragment-wise":
            floatorder = np.float64(order)
            s_norm_x = np.linalg.norm(s, ord=floatorder, axis=0)
            s_norm_y = np.linalg.norm(s, ord=floatorder, axis=1)
            s_norm = np.tensordot(s_norm_x, s_norm_y, axes=0)
            s[s_norm != 0] = s[s_norm != 0]/s_norm[s_norm != 0]
            print "Normalized "+str(normalization)+" with order "+str(order)
        elif normalization == "matrix-wise":
            floatorder = np.float64(order)
            s_norm = np.linalg.norm(s, ord=floatorder)
            s = s/s_norm
            print "Normalized "+str(normalization)+" with order "+str(order)
        elif normalization == "SCN":
            for iteration in range(1, iterations):
                sumrow = s.sum(axis=1)[:, None]
                sumcols = s.sum(axis=0)[None, :]
                s[sumrow != 0] = s[sumrow != 0]/sumrow[sumrow != 0]
                s[sumcols != 0] = s[sumcols != 0]/sumcols[sumcols != 0]
                print "Normalized "+str(iteration+1)+" time"+str("" if iteration <= 1 else "s")
            s = (s+s.T)/2
        elif normalization == "mirnylib":
            s_mirny = ntls.iterativeCorrection(s, iterations)[0]
            s = s_mirny
            print "Normalized "+str(iterations)+" time"+str("" if iterations <= 1 else "s")
        elif normalization == "sparsity":
            M = s.sum()
            sums = s.sum(axis=0)
            # wrap in an array so the element-wise division by M and boolean indexing below work
            C = np.array([[sums[i]*sums[j] for i in range(len(sums))] for j in range(len(sums))])/M
            s_coverage = s
            s_coverage[C != 0] /= C[C != 0]
            s = s_coverage
            print "Normalized for "+str(normalization)
        else:
            print "Error in normalization, using matrix-wise by default"
            s_norm = np.linalg.norm(s)
            s /= s_norm
        #Apply log or power
        try:
            s_exp = s**exposant
            s = s_exp
            print "Applied "+str(exposant)+" power to matrix"
        except (TypeError, ValueError):  # a string exponent raises TypeError on arrays
            if exposant in ["log10", "log", "ln10"]:
                s = log10(s.astype(float))
                print "Applied base-10 logarithm to matrix"
            elif exposant in ["ln", "logarithm", "logarithme"]:
                s = log(s.astype(float))
                print "Applied natural logarithm to matrix"
            elif exposant in ["ln2", "log2"]:
                s = log2(s.astype(float))
                print "Applied base-2 logarithm to matrix"
            else:
                print "Warning, no valid normalization function encountered, ignoring"
        if matrix_name != "normalized":
            if "correlation" in matrix_name:
                s_corr = corrcoef(s)
                s_corr[s_corr < 0] = 0
                s = s_corr
                print "Applied correlation function"
            if matrix_name != "correlation":
                if not "convolution" in matrix_name:
                    print "Error in matrix mode, using raw by default"
                    s = mat
                else:
                    print "Convoluting..."
                    for i in range(0, gaussian_number):
                        s_gauss = ndimage.filters.gaussian_filter(s, convolution_sigma)
                        s = s_gauss
                        print "Convoluted "+str(i+1)+" time"+str("" if i+1 <= 1 else "s")
    return s
def corr(a, b):
    phi0 = a.matrix.flatten()
    phi1 = b.matrix.flatten()
    return ma.corrcoef(phi0, phi1)
# Could also do each year's pattern corr rather than cumulative mean DONE
# OR, use the ens mean pattern as the pattern to compare against -- not useful I think,
# for what we want to know: is the pattern of response for each ensemble member random
# or dependent on the boundary condition.
# @@ Also, would be nice to have multiple variables in one plot and/or multiple simulations
if pattcorryr:
    # yearly anomaly pattern corr w/ the time mean pattern
    tmp = fldpseazm[yr, lat > corrlim, ...] - fldcseazmtm[lat > corrlim, ...]
else:
    # time-integrated anomaly pattern corr w/ the end anomaly pattern
    tmp = np.mean(fldpseazm[0:yr, lat > corrlim, ...], axis=0) - fldcseazmtm[lat > corrlim, ...]

tmpmean = fldpseazmtm[lat > corrlim, ...] - fldcseazmtm[lat > corrlim, ...]  # the end pattern
tmpcorr = ma.corrcoef(tmp.flatten() * weights.flatten(),
                      tmpmean.flatten() * weights.flatten())
plotd[yr] = tmpcorr[0, 1]
testd[yr] = cutl.pattcorr(
    tmp.flatten() * weights.flatten(),
    tmpmean.flatten() * weights.flatten()
)  # @@ same result as built-in method

""" from canam4sims_analens.py. modify for here
ensmem = fldpdict[sim][moidx,lat>corrlim,...] - fldcdict[sim][moidx,lat>corrlim,...]
obsbc = fldp2[moidx,lat>corrlim,...] - fldc2[moidx,lat>corrlim,...]

# weight the fields by area
areas = cutl.calc_cellareas(lat,lon)
areas = areas[lat>corrlim,:]
areas = ma.masked_where(lmask[lat>corrlim,:]==-1,areas)
weights = areas / np.sum(np.sum(areas,axis=1),axis=0)
def metric(self, sim, obs, time, obsbase=None):
    return corrcoef(sim, obs)[0, 1]


class VarRatio(Metrics):