def end_epoch(self):
    self.all_gts['course'] = np.concatenate(self.all_gts['course'])
    self.all_preds['course'] = np.concatenate(self.all_preds['course'])
    self.all_gts['accel'] = np.concatenate(self.all_gts['accel'])
    self.all_preds['accel'] = np.concatenate(self.all_preds['accel'])
    course_correl = dcor.distance_correlation(self.all_gts['course'],
                                              self.all_preds['course'])
    accel_correl = dcor.distance_correlation(self.all_gts['accel'],
                                             self.all_preds['accel'])
    Logger().log_value('%s_epoch.course_correl' % self.mode, course_correl,
                       should_print=True)
    Logger().log_value('%s_epoch.accel_correl' % self.mode, accel_correl,
                       should_print=True)
    # Reset the accumulators for the next epoch.
    self.all_gts = {'course': [], 'accel': []}
    self.all_preds = {'course': [], 'accel': []}
    return None
def calculate_if(X, thresh=0.8):
    """Greedy independence filter: drop any variable whose distance
    correlation with an already-kept variable is at least `thresh`."""
    variables = sorted(X.columns)
    variables_keep = []
    variables_drop = []
    variables_remaining = variables.copy()
    for var in variables:
        X_remaining = X[variables_remaining]
        if var in variables_drop:
            continue
        print(f'target variable: {var}')
        idx = X_remaining.columns.get_loc(var)
        x_i = X_remaining.iloc[:, idx].values
        k_vars = X_remaining.shape[1]
        mask = np.arange(k_vars) != idx
        x_noti = X_remaining.iloc[:, mask]
        ds = [dcor.distance_correlation(x_i, x_noti_i)
              for x_noti_i in x_noti.T.values]
        remaining_variables = x_noti.columns
        for d, remaining_var in zip(ds, remaining_variables):
            if d >= thresh:
                variables_drop.append(remaining_var)
                variables_remaining.remove(remaining_var)
                print(f'dropping {remaining_var} with dcor {np.round(d, 2)}')
        variables_remaining.remove(var)
        variables_keep.append(var)
    X = X[variables_keep]
    print(f'dropped {len(variables_drop)} variables, kept {X.shape[1]}')
    return X, variables_drop
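# A minimal usage sketch for calculate_if above, on hypothetical data; the
# column names and the threshold are illustrative, not from the original.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
a = rng.normal(size=500)
df = pd.DataFrame({
    'a': a,
    'b': a + 0.01 * rng.normal(size=500),  # near-duplicate of 'a'
    'c': rng.normal(size=500),             # independent column
})
X_kept, dropped = calculate_if(df, thresh=0.8)
print(X_kept.columns.tolist(), dropped)    # expect 'b' to be dropped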
def distance_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    from scipy.linalg import expm

    tickers = df.columns.tolist()
    df_dcor = pd.DataFrame(index=tickers, columns=tickers)
    k = 0
    for i in tickers:
        v_i = df.loc[:, i].values
        # Only compute the upper triangle; dcor is symmetric.
        for j in tickers[k:]:
            v_j = df.loc[:, j].values
            dcor_val = dcor.distance_correlation(v_i, v_j)
            df_dcor.at[i, j] = dcor_val
            df_dcor.at[j, i] = dcor_val
        k += 1
    # Matrix exponential of the dcor matrix (an alternative would be
    # matrix_power(df_dcor.to_numpy(), 2)).
    dcor_matrix = expm(df_dcor.to_numpy(dtype=float))
    df_expdcor = pd.DataFrame(dcor_matrix,
                              index=df_dcor.index,
                              columns=df_dcor.columns)
    return df_expdcor
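# A usage sketch for distance_correlation_matrix above, on synthetic price
# series; the ticker names and data are illustrative assumptions.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
prices = pd.DataFrame(rng.normal(size=(250, 3)).cumsum(axis=0),
                      columns=['AAA', 'BBB', 'CCC'])
returns = prices.diff().dropna()
print(distance_correlation_matrix(returns).round(3))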
def get_percentiles_by_bins(ss_column, corr_column, invalid_lines, n_bins=500):
    bin_vals = [[] for _ in range(n_bins)]
    for i in range(len(ss_column)):
        if i not in invalid_lines:
            ss_val = ss_column[i]
            corr_val = corr_column[i]
            corr_bin = int(round(corr_val * n_bins, 0))
            if corr_bin >= n_bins:
                corr_bin = n_bins - 1
            bin_vals[corr_bin].append(ss_val)
    x = []
    medians = []
    top_quantile = []
    bottom_quantile = []
    for i in range(n_bins):
        if len(bin_vals[i]) > 0:
            x1 = i / n_bins
            x2 = (i + 1) / n_bins
            x.append((x1, x2))
            bin_vals[i].sort()
            top_quantile.append(np.percentile(bin_vals[i], 75))
            medians.append(np.percentile(bin_vals[i], 50))
            bottom_quantile.append(np.percentile(bin_vals[i], 25))
    # Distance correlation between the bin positions and the bin medians.
    corr = dcor.distance_correlation(np.array([a for a, b in x]),
                                     np.array(medians))
    return x, top_quantile, medians, bottom_quantile, corr
def _get_dcorr(self, array_resid):
    """Return the distance correlation coefficient.

    The variables are transformed to uniform marginals using the
    empirical cumulative distribution function beforehand. Here the null
    distribution is not analytically available, but can be precomputed
    with the function generate_and_save_nulldists(...), which saves a
    \\*.npz file containing the null distribution for different sample
    sizes. This file can then be supplied as null_dist_filename.

    Parameters
    ----------
    array_resid : array-like
        Data array; must be of shape (2, T).

    Returns
    -------
    val : float
        Distance correlation coefficient.
    """
    # Remove ties before applying the transformation to uniform marginals.
    # array_resid = self._remove_ties(array_resid, verbosity=4)
    x_vals, y_vals = self._trafo2uniform(array_resid)
    val = dcor.distance_correlation(x_vals, y_vals, method='AVL')
    return val
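# A standalone sketch of the same idea as _get_dcorr above: map each series
# to uniform marginals via its empirical CDF (ranks), then compute the
# distance correlation with the fast 'AVL' method. The helper name
# trafo2uniform is an assumption here, not the class method used above.
import numpy as np
import dcor
from scipy.stats import rankdata

def trafo2uniform(x):
    # Empirical-CDF transform: ranks scaled into (0, 1].
    return rankdata(x) / len(x)

rng = np.random.default_rng(2)
x = rng.normal(size=1000)
y = x ** 2 + 0.1 * rng.normal(size=1000)
val = dcor.distance_correlation(trafo2uniform(x), trafo2uniform(y),
                                method='AVL')
print(round(val, 3))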
def corr_distance(sd_log):
    # Long runtime.
    import dcor
    from scipy.spatial.distance import pdist, squareform

    data = sd_log.data
    feat_names = sd_log.columns.tolist()
    df_dcor = pd.DataFrame(index=feat_names, columns=feat_names)

    def compute_matrix(i):
        # Helper for double-centered distance matrices (not called below).
        v1 = data.loc[:, i].to_numpy()
        v1_dist = squareform(pdist(v1[:, np.newaxis]))
        return (i, dcor.double_centered(v1_dist))

    k = 0
    for feat_i in feat_names:
        v1 = data.loc[:, feat_i].to_numpy()
        for feat_j in feat_names[k:]:
            v2 = data.loc[:, feat_j].to_numpy()
            rez = dcor.distance_correlation(v1, v2)
            df_dcor.at[feat_i, feat_j] = rez
            df_dcor.at[feat_j, feat_i] = rez
        k += 1
    return df_dcor
def df_distance_correlation(df, stocks):
    # Initialize an empty DataFrame.
    df_dcor = pd.DataFrame(index=stocks, columns=stocks)
    # Initialize a counter at zero.
    k = 0
    # Iterate over the time series of each stock.
    for i in stocks:
        # Store the ith time series as a vector.
        v_i = df.loc[:, i].values
        # Iterate over the time series of each stock, subject to the counter k.
        for j in stocks[k:]:
            # Store the jth time series as a vector.
            v_j = df.loc[:, j].values
            # Compute the dcor coefficient between the ith and jth vectors.
            dcor_val = dcor.distance_correlation(v_i, v_j)
            # Write the dcor value into the ij entry of the DataFrame.
            df_dcor.at[i, j] = dcor_val
            # Write the dcor value into the ji entry of the DataFrame.
            df_dcor.at[j, i] = dcor_val
        # Increment the counter by 1.
        k += 1
    # Return a DataFrame of dcor values for every pair of stocks.
    return df_dcor
def _compute_correlation(self, metrics_vals_1: pd.Series,
                         metrics_vals_2: pd.Series, lag: int):
    return dcor.distance_correlation(
        metrics_vals_1.astype(float),
        metrics_vals_2.shift(lag).fillna(0).astype(float),
        exponent=self.exponent)
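# A standalone sketch of the lagged comparison _compute_correlation performs
# above: shift one series by `lag`, zero-fill the gap, and take the distance
# correlation. exponent=1 is an assumption standing in for self.exponent.
import numpy as np
import pandas as pd
import dcor

rng = np.random.default_rng(3)
s1 = pd.Series(rng.normal(size=300))
s2 = (s1.shift(-5) + 0.1 * rng.normal(size=300)).fillna(0)  # s2 leads s1 by 5
for lag in (0, 5):
    val = dcor.distance_correlation(
        s1.astype(float),
        s2.shift(lag).fillna(0).astype(float),
        exponent=1)
    print(lag, round(val, 3))  # the lag-5 alignment should score higher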
def calc_filtered_dist_corr(X: np.ndarray, Y: np.ndarray) -> Tuple[float, float, float]:
    # Bootstrap a 95% confidence interval for the distance correlation.
    N, n, frac = 1000, Y.shape[0], 1.0
    indices_list, dist_corr_list = [], []
    for idx in range(N):
        indices = np.random.choice(n, size=int(frac * n), replace=True)
        indices_list.append(indices)
    for indices in indices_list:
        dist_corr = dcor.distance_correlation(X[indices], Y[indices])
        dist_corr_list.append(dist_corr)
    low_dist_corr = np.percentile(dist_corr_list, 2.5)
    up_dist_corr = np.percentile(dist_corr_list, 97.5)
    # Point estimate on the full sample.
    dist_corr = dcor.distance_correlation(X, Y)
    return dist_corr, low_dist_corr, up_dist_corr
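# Usage sketch for calc_filtered_dist_corr above on synthetic data; the
# point estimate should fall inside the bootstrap interval.
import numpy as np

rng = np.random.default_rng(4)
X = rng.normal(size=500)
Y = np.sin(X) + 0.2 * rng.normal(size=500)
est, lo, hi = calc_filtered_dist_corr(X, Y)
print(f'dcor={est:.3f}, 95% CI=({lo:.3f}, {hi:.3f})')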
def dcorr(X, Y):
    """
    Computes the distance correlation (dCorr) between word embedding
    matrices X and Y.

    :param X: word embedding matrix X with shape (k x D)
    :param Y: word embedding matrix Y with shape (l x D)
    :return: distance correlation between X and Y
    """
    return dcor.distance_correlation(X.T, Y.T)
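# Sketch of calling dcorr above: after transposing, the D embedding
# dimensions act as observations and the k (or l) words as features, so both
# matrices must share the same D. The shapes below are illustrative.
import numpy as np

rng = np.random.default_rng(5)
D = 100
X = rng.normal(size=(20, D))                  # 20 words, D-dim embeddings
Y = X[:15] + 0.1 * rng.normal(size=(15, D))   # 15 related words
print(round(dcorr(X, Y), 3))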
def aidc(x, y):
    cov_y = np.cov(y)
    cov_x = np.cov(x.T)
    if cov_x.shape == ():
        # x is univariate: np.cov returns a 0-d array.
        inv_cov_x = 1.0 / cov_x
        x_trans = np.dot(x, np.sqrt(inv_cov_x))
    else:
        inv_cov_x = np.linalg.inv(cov_x)
        x_trans = np.dot(x, scipy.linalg.sqrtm(inv_cov_x))
    inv_cov_y = 1 / cov_y
    y_trans = np.dot(y, np.sqrt(inv_cov_y))
    return dcor.distance_correlation(x_trans, y_trans)
def AIDC(X, Y):
    cov_y = numpy.cov(Y)
    cov_x = numpy.cov(X.T)
    if cov_x.shape == ():
        # X is univariate: numpy.cov returns a 0-d array.
        inv_cov_x = 1.0 / cov_x
        X_trans = numpy.dot(X, numpy.sqrt(inv_cov_x))
    else:
        inv_cov_x = numpy.linalg.inv(cov_x)
        X_trans = numpy.dot(X, scipy.linalg.sqrtm(inv_cov_x))
    inv_cov_y = 1 / cov_y
    Y_trans = numpy.dot(Y, numpy.sqrt(inv_cov_y))
    return dcor.distance_correlation(Y_trans, X_trans)
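# The whitening done by aidc/AIDC above approximates the affinely invariant
# distance correlation that dcor ships as dcor.distance_correlation_af_inv
# (it is also used in the CF and script snippets below). A quick sketch of
# the built-in on random data:
import numpy
import dcor

rng = numpy.random.default_rng(6)
X = rng.normal(size=(200, 3))
Y = X @ rng.normal(size=(3, 2)) + 0.1 * rng.normal(size=(200, 2))
print(round(dcor.distance_correlation_af_inv(Y, X), 3))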
def corr_distance2(sd_log):
    # Long runtime.
    import dcor

    data = sd_log.data
    feat_names = sd_log.columns.tolist()
    df_dcor = pd.DataFrame(index=feat_names, columns=feat_names)
    k = 0
    for feat_i in feat_names:
        v1 = data.loc[:, feat_i].to_numpy()
        for feat_j in feat_names[k:]:
            v2 = data.loc[:, feat_j].to_numpy()
            rez = dcor.distance_correlation(v1, v2)
            df_dcor.at[feat_i, feat_j] = float(rez)
            df_dcor.at[feat_j, feat_i] = float(rez)
        k += 1
    # Plot as a heatmap.
    fig, ax = plt.subplots(figsize=(12, 9))
    sns.heatmap(df_dcor.astype(float),
                cmap=sns.diverging_palette(220, 10, as_cmap=True),
                square=True, cbar_kws={'shrink': .9}, ax=ax, annot=True,
                linewidths=0.1, vmax=1.0, linecolor='white',
                annot_kws={'fontsize': 12})
    plt.title("Distance Correlation Among Features")
    plt.show()
    return df_dcor
def CF(x, y, team, cf_name):
    """
    Available characteristic functions:
        dcor: distance correlation between y and x
        r2:   coefficient of determination
        aidc: affinely invariant distance correlation
        hsic: Hilbert-Schmidt independence criterion
    """
    if len(team) == 0:
        return 0.0
    x = x[:, team]
    if cf_name.lower() == "dcor":
        return dcor.distance_correlation(y, x)
    elif cf_name.lower() == "r2":
        det_C_xy = numpy.linalg.det(numpy.corrcoef(x.T, y))
        if len(team) == 1:
            det_C_x = 1
        else:
            det_C_x = numpy.linalg.det(numpy.corrcoef(x.T))
        return 1 - det_C_xy / det_C_x
    elif cf_name.lower() == "aidc":
        return dcor.distance_correlation_af_inv(y, x)
        # return AIDC(x, y)
    elif cf_name.lower() == "hsic":
        return dHSIC(x, y)
    else:
        raise NameError(
            "I don't know the characteristic function {0}".format(cf_name))
def dist_corr():
    df = dtr.detrend()
    df.dropna(inplace=True)
    # Store the column names as a list.
    col_names = df.columns.tolist()
    df_dcor = pd.DataFrame(index=col_names, columns=col_names)
    k = 0
    for i in col_names:
        v1 = df.loc[:, i].values
        for j in col_names[k:]:
            v2 = df.loc[:, j].values
            rez = dcor.distance_correlation(v1, v2)
            df_dcor.at[i, j] = rez
            df_dcor.at[j, i] = rez
        k += 1
    return df_dcor
def colwise_partial_distcorr(df, col1: str, partial: str):
    import dcor

    pdc_list = []
    dc_list = []
    for col2 in df.columns:
        dc = dcor.distance_correlation(x=df[col1], y=df[col2])
        dc_list.append(dc)
        if partial is not None:
            pdc = dcor.partial_distance_correlation(x=df[col1], y=df[col2],
                                                    z=df[partial])
            pdc_list.append(pdc)
    result_df = pd.DataFrame()
    result_df['distance_corr'] = dc_list
    # Only attach the partial column when it was actually computed;
    # otherwise the empty list would not match the frame length.
    if partial is not None:
        result_df['partial_distance_corr'] = pdc_list
    result_df['col1'] = col1
    result_df['col2'] = df.columns
    result_df['partial'] = partial
    return result_df
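# Usage sketch for colwise_partial_distcorr above: controlling for a common
# driver 'z' should shrink the partial distance correlation between 'a' and
# 'b' relative to the plain one. Column names here are illustrative.
import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
z = rng.normal(size=400)
df = pd.DataFrame({
    'a': z + 0.1 * rng.normal(size=400),
    'b': z + 0.1 * rng.normal(size=400),
    'z': z,
})
print(colwise_partial_distcorr(df, col1='a', partial='z'))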
def test_model(result_model, attrs, test_y, test_x_vars):
    '''Tests a trained model and randomly selects test points to plot.'''
    result_y = []
    score = 0.0
    rmse = 0.0
    if "poly" in attrs:
        poly_func = attrs["poly"]
        x_ = poly_func.fit_transform(test_x_vars)
        result_y = result_model.predict(x_)
        score = result_model.score(x_, test_y)
        rmse = np.sqrt(mean_squared_error(test_y, result_y))
    else:
        result_y = result_model.predict(test_x_vars)
        score = result_model.score(test_x_vars, test_y)
        rmse = np.sqrt(mean_squared_error(test_y, result_y))
    correlation = dcor.distance_correlation(np.array(test_y),
                                            np.array(result_y))
    frequencies, bins_x, bins_y = np.histogram2d(
        result_y, test_y,
        bins=[np.linspace(0.0, 1.0, 100), np.linspace(0.0, 1.0, 100)])
    return score, rmse, correlation, frequencies
def compute_correlation_strength():
    """
    Computes correlation strengths between pairs of attributes.
    Works on the currently loaded dataset.
    GET parameters:
        - "ids": List of embedding IDs to consider, as ids=1,2,3,...
          If "ids" is not specified, all embeddings are taken into account.
    :return:
    """
    df: pd.DataFrame = app.config["EMBEDDING_METADATA"]["original"].drop(
        ["num_records"], axis=1)
    ids: list = request.args.get("ids")
    ids = list(map(int, ids.split(","))) if ids is not None else None
    if ids is not None:
        df = df.iloc[ids]
    # todo (remove, generate data cleanly) Hack: Rename
    # target_domain_performance and n_components here.
    return df.rename(columns={
        "target_domain_performance": "rdp",
        "separability_metric": "separability"
    }).corr(method=lambda x, y: dcor.distance_correlation(x, y)).to_json(
        orient='index')
# --- Data
# X = numpy.array([numpy.linspace(-1, 1, N) for _ in range(D)]).T
X = numpy.array([numpy.random.uniform(-1, 1, N) for _ in range(D)]).T
TWO_D = 2 * numpy.array(range(D))
Y = numpy.matmul(numpy.multiply(X, X), TWO_D)
# ---

# --- Transform data
M = numpy.array([numpy.random.uniform(-10, 10, D) for _ in range(D)])
# Noise matrix (kept distinct from the sample count N).
NOISE = numpy.array([numpy.random.uniform(-10, 10, N) for _ in range(D)]).T
X_TRANS1 = numpy.matmul(X, M)
X_TRANS2 = numpy.matmul(X, M) + NOISE
# ---

print("Distance correlation:")
print(dcor.distance_correlation(Y, X))
print("Unbiased dcor:")
print(numpy.sqrt(dcor.u_distance_correlation_sqr(Y, X)))
print("AIDC original X:")
print(AIDC(X, Y))
print("AIDC built-in X:")
print(dcor.distance_correlation_af_inv(Y, X))
print("AIDC X = M*X:")
print(AIDC(X_TRANS1, Y))
def cal_dCor(x, y):
    # https://github.com/vnmabus/dcor
    return dcor.distance_correlation(x, y, method='AVL')
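# Quick sanity sketch for cal_dCor above: the fast 'AVL' method applies to
# univariate data and should agree with the library default up to floating
# point error. The data below is illustrative.
import numpy as np
import dcor

rng = np.random.default_rng(8)
x = rng.normal(size=2000)
y = np.abs(x) + 0.1 * rng.normal(size=2000)
fast = dcor.distance_correlation(x, y, method='AVL')
default = dcor.distance_correlation(x, y)
print(np.isclose(fast, default))  # expect True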
def _test_umap(xdata, random_state, **kwargs):
    transformer = UMAP(random_state=random_state, **kwargs)
    x = transformer.fit_transform(xdata)
    return x, random_state, distance_correlation(xdata, x)
# Summarize.
# print('MAE: %.3f' % results.best_score_)
print('\n' + 'For: ' + tickers[i18] + ', Lasso Config: %s' % results.best_params_)
scores = np.absolute(scores)
print('For: ' + tickers[i18] + ', Lasso Mean(STD) MAE: %.3f (%.3f)'
      % (np.mean(scores), np.std(scores)))

for i19, column in enumerate(lei_cei_lag_predictor_data):
    temp_list = lei_cei_lag_predictor_data[tickers[i19]].to_list()
    temp_df = pd.DataFrame({tickers[i19]: temp_list})
    temp_df = temp_df.set_index(lei_cei_lag_data_returns_df.index)
    temp_dcor_df = pd.concat([lei_cei_lag_data_returns_df.dropna(),
                              temp_df.dropna()], axis=1)
    temp_X = temp_dcor_df.iloc[:, np.arange(0, temp_dcor_df.shape[1] - 1, 1).tolist()]
    temp_Y = temp_dcor_df.iloc[:, -1]
    print("For security: " + tickers[i19] + ", distance correlation is: "
          + str(round(dcor.distance_correlation(temp_X, temp_Y), 2)))

for i20, column in enumerate(lei_cei_lag_predictor_data):
    temp_list = lei_cei_lag_predictor_data[tickers[i20]].to_list()
    temp_df = pd.DataFrame({tickers[i20]: temp_list})
    temp_df = temp_df.set_index(lei_cei_lag_data_returns_df.index)
    temp_dcov_df = pd.concat([lei_cei_lag_data_returns_df.dropna(),
                              temp_df.dropna()], axis=1)
    temp_X = temp_dcov_df.iloc[:, np.arange(0, temp_dcov_df.shape[1] - 1, 1).tolist()]
    temp_Y = temp_dcov_df.iloc[:, -1]
    print("For security: " + tickers[i20] + ", distance covariance is: "
          + str(round(dcor.distance_covariance(temp_X, temp_Y), 2)))

# K-means clustering of the 11 ETF returns.
kmeans = KMeans(n_clusters=3).fit(lei_cei_lag_predictor_data)
centroids = kmeans.cluster_centers_
print(centroids)
from scipy.stats import pearsonr, spearmanr, kendalltau

for feat in bio_cols:
    print(feat)
    print("Pearson's = {}".format(
        pearsonr(biochemistry_data[feat], questionnaire_data["HYPERTENSION"])[0]))
    print("Spearman's = {}".format(
        spearmanr(biochemistry_data[feat], questionnaire_data["HYPERTENSION"])[0]))
    print("Kendall's Tau = {}\n".format(
        kendalltau(biochemistry_data[feat], questionnaire_data["HYPERTENSION"])[0]))
    print("Distance Correlation = {}".format(
        dcor.distance_correlation(biochemistry_data[feat],
                                  questionnaire_data["HYPERTENSION"])))
    print("Energy Distance = {}\n".format(
        dcor.energy_distance(biochemistry_data[feat],
                             questionnaire_data["HYPERTENSION"])))

# Plot the correlation matrix using Pearson's correlation measure.
if False:
    import seaborn as sns
    cols = list(biochemistry_data.columns)
    corr_matrix = np.corrcoef(biochemistry_data[cols].values.T)
    print(corr_matrix)
    plt.figure(1, figsize=(12, 18))
    sns.set(font_scale=1.0)
    heat_map = sns.heatmap(corr_matrix, cbar=False, annot=True,
def CV(X, y, d, m, method="ft", nolamb=50, nofold=10, NoB=5, NoC=20, NoW=2, spX=False, standard=False): """ Estimate B using the best lambda with cross-validation Args: X: covariates y: outcome d: structural dimension m: number of transfroms method: "ft" or "sir" nolamb: the number of lambda nofold: the number of fold NoB: number of iterate over B within ADMM NoC: number of iterate over C NoW: number of updating weights spX: sparse X or not standard: standardize X or not Returns: B: estimate covxx: covariance matrix of X lambcv: best lambda minimum loss """ ## par. #method = 'sir' # or 'ft' #nofold = 10 #nolamb = 50 #spX = False #standard = False #NoB = 5 #NoC = 20 #NoW = 2 ## generate lambda candidate lambmax = 1 #np.max(sdr0.M)/10 lambmin = lambmax / 1000 if method == 'sir' else lambmax / 10 lambseq = np.exp(np.linspace(np.log(lambmin), np.log(lambmax), num=nolamb)) kf = KFold(n_splits=nofold) cvloss = np.zeros((nofold, nolamb)) k = 0 for train_index, test_index in kf.split(X): print('Fold-', k) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] #print("TRAIN:", (X_train.shape), "TEST:", (X_test.shape)) for i in range(nolamb): Btrain = estimate( X_train, y_train, d, m, lambseq[i], method, NoB, NoC, NoW, spX, standard )[0] # estimate(X, y, d, m, lamb, method = "ft", NoB = 5, NoC = 20, NoW=2, spX=False, standard=False) if np.linalg.cond(Btrain) < 1 / sys.float_info.epsilon: sigs = np.cov(X_train.T) Btrain = Btrain @ la.inv(la.sqrtm(Btrain.T @ sigs @ Btrain)) cvloss[k, i] = 1 - dcor.distance_correlation( X_test @ Btrain, y_test) / d else: cvloss[k, i] = 100 k = k + 1 l_mean = np.mean(cvloss, axis=0) lambcv = lambseq[np.argmin(l_mean)] B, covxx, err = estimate(X, y, d, m, lambcv, method, NoB, NoC, NoW, spX, standard) return B, covxx, lambcv, np.argmin(l_mean)
# Signals: y_Dior_ny, y_Prada_london, y_Prada_milan, y_Prada_paris, y_Prada_ny
brand_city = ['Chanel_Milan', 'Chanel_London', 'Chanel_Paris', 'Chanel_NY',
              'Dior_Milan', 'Dior_London', 'Dior_Paris', 'Dior_NY']
dcor_df = pd.DataFrame(data=0, columns=brand_city, index=brand_city)
spearman_df = pd.DataFrame(data=0, columns=brand_city, index=brand_city)
brand_city_signal = [y_Chanel_milan, y_Chanel_london, y_Chanel_paris,
                     y_Chanel_ny, y_Dior_milan, y_Dior_london, y_Dior_paris,
                     y_Dior_ny]
for i in range(len(brand_city)):
    for j in range(len(brand_city)):
        dcor_df.iloc[i, j] = dcor.distance_correlation(
            np.array(brand_city_signal[i]), np.array(brand_city_signal[j]))
        rho, p_val = spearmanr(np.array(brand_city_signal[i]),
                               np.array(brand_city_signal[j]))
        spearman_df.iloc[i, j] = np.abs(rho)

dcor_df.to_csv('dcor_brand_city.csv')
spearman_df.to_csv('spearman_brand_city_abs.csv')

# Visualize the correlation matrix using a heatmap.
sns.heatmap(spearman_df, annot=True, cmap='Reds')
# Display the heatmap.
plt.show()
def ascores(X, y):
    '''
    Parameters
    ----------
    X : numeric dataframe to compute association measures with y
    y : series containing target values

    Returns
    -------
    Dataframe with the following association scores:
        pearson:  Pearson correlation
        kendall:  Kendall correlation
        spearman: Spearman correlation
        mic:      maximal information coefficient
        dcor:     distance correlation

    Example
    -------
    import exploretransform as et
    df, X, y = et.loadboston()
    X = X.select_dtypes('number')
    et.ascores(X, y)

              pearson   kendall  spearman       mic      dcor
    lon      0.322947  0.278908  0.420940  0.379753  0.435849
    lat      0.006826  0.013724  0.021420  0.234796  0.167030
    crim     0.389582  0.406992  0.562982  0.375832  0.528595
    zn       0.360386  0.340738  0.438768  0.290145  0.404253
    indus    0.484754  0.420263  0.580004  0.414140  0.543948
    nox      0.429300  0.398342  0.565899  0.442515  0.523653
    rm       0.696304  0.485182  0.635092  0.461610  0.711034
    age      0.377999  0.391067  0.551747  0.414676  0.480248
    dis      0.249315  0.313745  0.446392  0.316136  0.382746
    tax      0.471979  0.418005  0.566999  0.336899  0.518158
    ptratio  0.505655  0.397146  0.554168  0.371628  0.520320
    b        0.334861  0.126766  0.186011  0.272469  0.385468
    lstat    0.740836  0.671445  0.857447  0.615427  0.781028
    '''
    # Convert any ints to float for the dcor calculation.
    if len(X.select_dtypes(int).columns) > 0:
        for col in X.select_dtypes(int).columns:
            X.loc[:, col] = X[col].astype('float')

    r = pd.DataFrame()
    mine = MINE(alpha=0.6, c=15)
    for col in X.columns:
        mine.compute_score(X[col], y)
        r.loc[col, 'pearson'] = abs(stats.pearsonr(X[col], y)[0])
        r.loc[col, 'kendall'] = abs(stats.kendalltau(X[col], y)[0])
        r.loc[col, 'spearman'] = abs(stats.spearmanr(X[col], y)[0])
        r.loc[col, 'mic'] = mine.mic()
        r.loc[col, 'dcor'] = distance_correlation(X[col], y)
    return r
def main(gpath):
    rma_df = pd.read_csv("../data-sources/mistry2017/rma")
    rma_df['raw_gene'] = rma_df['Unnamed: 0']
    print(np.sum(rma_df['raw_gene'].str.endswith('_at')))
    print(list(rma_df['raw_gene']))

    annot_df = pd.read_csv(
        "../data-sources/mistry2017/DIPtoAffy_with_additionalAnnotations.tsv",
        sep='\t')
    print(annot_df.columns)
    print(annot_df.head())

    pid_unitprot = defaultdict(set)
    for r in annot_df.itertuples():
        input_parts = r[-1].split(';')
        output_parts = r[-3].split(';')
        for input_part in input_parts:
            for output_part in output_parts:
                if output_part != '' and input_part != '':
                    pid_unitprot[input_part].add(
                        res.get_unified_name(output_part))

    idmap = {}
    for k, vals in pid_unitprot.items():
        if len(vals) > 1:
            print("%s -> %s" % (k, vals))
        elif len(vals) == 1:
            idmap[k] = list(vals)[0]
    print(idmap)
    print(len(idmap))

    ix = rma_df['raw_gene'].isin(idmap)
    rma_df = rma_df[ix].copy()
    rma_df['gene'] = [idmap[g] for g in rma_df['raw_gene']]
    expr_cols = [c for c in rma_df.columns if c.startswith('BT')]

    G = nx.read_gpickle(gpath)
    nodes = sorted(G.nodes())
    node_ix = dict(zip(nodes, range(len(nodes))))

    data = np.array(rma_df[expr_cols])
    F = np.zeros((len(nodes), len(nodes)))
    for i in range(rma_df.shape[0]):
        node_i = rma_df.iloc[i]['gene']
        if node_i not in node_ix:
            continue
        for j in range(i + 1, rma_df.shape[0]):
            node_j = rma_df.iloc[j]['gene']
            if node_j in node_ix:
                node_i_idx = node_ix[node_i]
                node_j_idx = node_ix[node_j]
                F[node_i_idx, node_j_idx] = dcor.distance_correlation(
                    data[i, :], data[j, :])
                F[node_j_idx, node_i_idx] = F[node_i_idx, node_j_idx]
        print("Finished %d" % i)

    output_path = "../generated-data/pairwise_features/%s_rma_dcor" % (
        os.path.basename(gpath))
    np.save(output_path, F)
def dc(reads1, reads2):
    return dcor.distance_correlation(reads1, reads2)
def ts_corr_network(data, corr_param='pcor', prune=0.35):
    if corr_param == "dcor":
        col_names = data.columns.tolist()
        data_dcor = pd.DataFrame(index=col_names, columns=col_names)
        k = 0
        for i in col_names:
            v_i = data.loc[:, i].values
            for j in col_names[k:]:
                v_j = data.loc[:, j].values
                dcor_val = dcor.distance_correlation(v_i, v_j)
                data_dcor.at[i, j] = dcor_val
                data_dcor.at[j, i] = dcor_val
            k += 1
        # Convert the DataFrame to a matrix (needed to generate the graph
        # with the networkx package).
        dcor_matrix = data_dcor.values.astype("float")
        sim_matrix = 1 - dcor_matrix
        nodes = data_dcor.index.values
        # Transform the similarity matrix into a weighted graph.
        G = nx.from_numpy_matrix(sim_matrix)
        # Relabel the nodes with the stock names.
        G = nx.relabel_nodes(G, lambda x: nodes[x])
        # Copy the correlation network.
        H = G.copy()
        # Remove self-edges from H (required for graph-theoretic analyses).
        for (u, v) in G.edges:
            if u == v:
                H.remove_edge(u, v)
        if prune is not None:
            # Remove weakly correlated edges from H.
            for (u, v, wt) in G.edges.data("weight"):
                if wt >= 1 - prune:
                    H.remove_edge(u, v)
        return H
    if corr_param == "pcor":
        pcor_matrix = data.iloc[:, 1:].corr()
        nodes = pcor_matrix.index.values
        pcor_matrix = np.asmatrix(pcor_matrix)
        sim_matrix = 1 - abs(pcor_matrix)
        G = nx.from_numpy_matrix(sim_matrix)
        G = nx.relabel_nodes(G, lambda x: nodes[x])
        H = G.copy()
        for (u, v) in G.edges:
            if u == v:
                H.remove_edge(u, v)
        if prune is not None:
            for (u, v, wt) in G.edges.data("weight"):
                if wt >= 1 - prune:
                    H.remove_edge(u, v)
        return H
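# Usage sketch for ts_corr_network above on synthetic return series; note it
# assumes a networkx version that still provides from_numpy_matrix (newer
# releases renamed it to from_numpy_array). Column names are illustrative.
import numpy as np
import pandas as pd
import networkx as nx  # used inside ts_corr_network

rng = np.random.default_rng(9)
base = rng.normal(size=300)
data = pd.DataFrame({
    's1': base + 0.2 * rng.normal(size=300),
    's2': base + 0.2 * rng.normal(size=300),
    's3': rng.normal(size=300),
})
H = ts_corr_network(data, corr_param='dcor', prune=0.35)
print(H.nodes(), H.number_of_edges())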
    csv_path, dropping_col, seeds[i], Normalize_output)
print('-----------------------------------Feature Selection-----------------------------------')
if with_fs_sel:
    f_sel, sp_df = spearsman_FS(Input_TR, Output_TR, threshold=threshold_clip,
                                rank_num=rank_clip, FSmode="rank")
else:
    f_sel = Input_TR.columns.values.tolist()
run_info[str(i)]['Selected Features'] = f_sel

dcor_index_TR = dcor.distance_correlation(Input_TR.loc[:, f_sel], Output_TR)
dcor_index_TE = dcor.distance_correlation(Input_TE.loc[:, f_sel], Output_TE)
run_info[str(i)]['dCorr_score_TR'] = dcor_index_TR
run_info[str(i)]['dCorr_score_TE'] = dcor_index_TE
print('dcor_index_TR: ', dcor_index_TR)
print('dcor_index_TE: ', dcor_index_TE)

scaled_Input_TR, scaler_TR_df = scale_data(Input_TR)
scaled_Input_TE, scaler_TE_df = scale_data(Input_TE)

# Prepare the TR and TE splits.
X_TR = scaled_Input_TR[f_sel].values
Y_TR = Output_TR.values
X_TE = scaled_Input_TE[f_sel].values
Y_TE = Output_TE.values