def test_manova_no_formula_no_hypothesis():
    """MANOVA built from arrays runs ``mv_test`` with its default hypotheses.

    The formula interface is bypassed: the design matrix is assembled by
    hand from the dummy-coded ``Loc`` column plus a constant.  Only the type
    of the returned object is checked.
    """
    loc_dummies = pd.get_dummies(X[['Loc']], drop_first=True)
    design = add_constant(loc_dummies)
    responses = X[['Basal', 'Occ', 'Max']]
    outcome = MANOVA(responses, design).mv_test()
    assert isinstance(outcome, MultivariateTestResults)
def test_manova_no_formula():
    """Array-interface MANOVA with explicit contrasts matches pinned values.

    The design matrix is built by hand (constant + dummy-coded ``Loc``) and
    the hypotheses are passed explicitly.  Every statistic of the ``Loc``
    term is compared against a fixed reference value.
    """
    design = add_constant(pd.get_dummies(X[['Loc']], drop_first=True))
    responses = X[['Basal', 'Occ', 'Max']]
    model = MANOVA(responses, design)

    # Contrast matrices with three columns each; the 'Intercept' contrast
    # selects column 0 and the 'Loc' contrast selects columns 1 and 2.
    intercept = np.array([[1.0, 0.0, 0.0]])
    loc = np.array([[0.0, 1.0, 0.0],
                    [0.0, 0.0, 1.0]])
    result = model.mv_test([('Intercept', intercept), ('Loc', loc)])
    stat_table = result['Loc']['stat']

    stat_names = ("Wilks' lambda", "Pillai's trace",
                  "Hotelling-Lawley trace", "Roy's greatest root")
    # (column, [(expected, decimal places), ...]) -- inner list follows the
    # order of ``stat_names``.
    expected = [
        ('Value', [(0.60143661, 8), (0.44702843, 8),
                   (0.58210348, 8), (0.35530890, 8)]),
        ('F Value', [(0.77, 2), (0.86, 2), (0.75, 2), (1.07, 2)]),
        ('Num DF', [(6, 3), (6, 3), (6, 3), (3, 3)]),
        ('Den DF', [(16, 3), (18, 3), (9.0909, 4), (9, 3)]),
        ('Pr > F', [(0.6032, 4), (0.5397, 4), (0.6272, 4), (0.4109, 4)]),
    ]
    for column, checks in expected:
        for stat_name, (value, places) in zip(stat_names, checks):
            assert_almost_equal(stat_table.loc[stat_name, column], value,
                                decimal=places)
def runFeatureReduce():
    """Run MANOVA, OLS and per-feature case/control tests; dump the results.

    Everything printed while the analysis runs is redirected to
    ``./best/manova.txt``.  Per-feature p-values (one-way ANOVA and
    two-sample t-test) and the per-group feature means are additionally
    written as header-less CSV files in the working directory.

    ``sys.stdout`` is always restored and the report file always closed,
    even when one of the analysis steps raises.
    """
    orig_stdout = sys.stdout
    with open('./best/manova.txt', 'w') as f:
        sys.stdout = f
        try:
            print("Loading dataset...")
            X, y = loadDataset()
            maov = MANOVA(X, y)
            print(len(X))
            print(len(X[0]))
            print(len(y))
            print(maov.mv_test())
            est2 = sm.OLS(y, X).fit()
            print(est2.summary())

            # Split the rows by label: 0 -> control, anything else -> case.
            features = np.asarray(X)
            is_control = np.asarray(y).ravel() == 0
            controls = features[is_control]
            cases = features[~is_control]

            _, pval = stats.f_oneway(controls, cases)
            print("p-value ANOVA", pval)
            pd.DataFrame(pval).to_csv("./pANOVA.csv", header=None, index=None)

            _, pval = stats.ttest_ind(controls, cases)
            print("p-value Two sampled T-test", pval)
            pd.DataFrame(pval).to_csv("./pttestInd.csv", header=None,
                                      index=None)

            meanControls = np.mean(controls, axis=0)
            print(meanControls)
            pd.DataFrame(meanControls).to_csv("./meanControls.csv",
                                              header=None, index=None)

            meanCases = np.mean(cases, axis=0)
            print(meanCases)
            pd.DataFrame(meanCases).to_csv("./meanCases.csv", header=None,
                                           index=None)
        finally:
            # Undo the redirection before the file is closed by ``with``.
            sys.stdout = orig_stdout
    return
class MANOVAAnalyzer:
    """Multivariate ANOVA analyzer class."""

    def __init__(self, independent_variables, dependent_variables):
        """Build the MANOVA model.

        :param independent_variables: passed to ``MANOVA`` as ``exog``.
        :param dependent_variables: passed to ``MANOVA`` as ``endog``.
        """
        self.model = MANOVA(dependent_variables, independent_variables)
        # statsmodels' MANOVA needs no separate fitting step and its
        # ``fit()`` raises NotImplementedError; tolerate that so the
        # constructor does not fail, while still calling ``fit()`` on any
        # implementation that does provide one.
        try:
            self.model.fit()
        except NotImplementedError:
            pass

    def analyze(self):
        """Run the multivariate tests and wrap them in a ``MANOVAAnalysis``."""
        # self.model.mv_test() is of type MultivariateTestResults
        return MANOVAAnalysis(self.model.mv_test())
def get_best_thresh_for_layer(min_p1, min_p2, x_org, thresh_abs_mag, method=1,
                              constrain=0, cur_layer=0, supervised=False,
                              truth=[], res=40, plot=False):
    """Score a grid of consensus thresholds and let ``naive_max_thresh`` pick.

    For every candidate threshold the two partitions ``min_p1``/``min_p2``
    are merged with ``BGC.basic_consensus_two`` and the resulting labelling
    of ``x_org`` is scored.  Each score row holds, in order: silhouette,
    Calinski-Harabasz, Davies-Bouldin, number of clusters, the MANOVA
    Hotelling-Lawley F value and, when ``supervised`` is true, the micro F1
    against ``truth``.  A labelling with a single cluster gets the fixed row
    ``[0, 0, 1, 1, 0]`` (plus ``0`` when supervised).

    ``truth`` is only read, never mutated.  ``constrain`` is forwarded to
    ``naive_max_thresh`` only when it is non-zero.

    Returns whatever ``naive_max_thresh`` returns.
    """
    step = 1 / res
    edge = (1 - thresh_abs_mag) / 2
    # Candidate thresholds: multiples of ``step`` lying in [edge, 1 - edge].
    candidates = [step * k for k in range(1, res)]
    thresholds_values = np.array(
        [t for t in candidates if edge <= t <= 1 - edge], dtype=float
    ).round(decimals=3)

    partition_measures = []
    for thresh in thresholds_values:
        consensus, _ = BGC.basic_consensus_two(min_p1, min_p2,
                                               BGC.jaccard_distance, thresh)
        labels = BGC.output_to_array(consensus, x_org.shape[0])

        # MANOVA raises ValueError for labelings it cannot handle; score 0.
        try:
            mv_results = MANOVA(endog=x_org, exog=labels).mv_test().results
            hotelling_f = mv_results['x0']['stat']['F Value'][
                'Hotelling-Lawley trace']
        except ValueError:
            hotelling_f = 0

        n_clusters = len(set(labels))
        if n_clusters > 1:
            measure = [
                silhouette_score(x_org, labels),
                calinski_harabasz_score(x_org, labels),
                davies_bouldin_score(x_org, labels),
                n_clusters,
                hotelling_f,
            ]
            if supervised:
                measure.append(f1_score(truth, labels, average='micro'))
        else:
            measure = [0, 0, 1, 1, 0]
            if supervised:
                measure.append(0)
        partition_measures.append(measure)

    # Leave ``constrain`` at the callee's default unless explicitly set.
    extra = {'constrain': constrain} if constrain != 0 else {}
    return naive_max_thresh(thresholds_values, partition_measures, step,
                            method=method, cur_layer=cur_layer, plot=plot,
                            supervised=supervised, **extra)
def compute_manova_cvg(topdir: str, m: int):
    """Assemble the experiment table and run a MANOVA on ``K`` and ``N``.

    One case table is loaded per (neighbors, tolerance) combination from
    ``<topdir>/nn_<tol>_<n>`` via ``ac.compute_stored_runs`` and tagged with
    the numeric ``TOL`` and ``NNN`` values of that case.  The tables are
    concatenated and the model ``K + N ~ TOL + NNN + NNN:TOL`` is tested.
    The model and its test results are printed.

    :param topdir: directory containing the ``nn_<tol>_<n>`` case folders.
    :param m: forwarded unchanged to ``ac.compute_stored_runs``.
    :return: the (untested) ``MANOVA`` model object.
    """
    # Assemble a large experiment table with all data
    neighbors = ["5", "10", "15", "20"]
    tolerances = ['0.0', '0.2', '0.4', '0.6', '0.8', '1.0']
    dfs = []
    for n in neighbors:
        for tol in tolerances:
            casedir = f"{topdir}/nn_{tol}_{n}"
            casetable = ac.compute_stored_runs(casedir, m, None)
            # Scalar assignment broadcasts to every row, so the table is
            # no longer required to contain exactly five runs.
            casetable['TOL'] = float(tol)
            casetable['NNN'] = float(n)
            dfs.append(casetable)
    df = pd.concat(dfs).reset_index(drop=True)

    # Perform a regression with the data
    mod = MANOVA.from_formula('K + N ~ TOL + NNN + NNN:TOL', data=df)
    print(mod)
    result = mod.mv_test()
    print(result)
    return mod
def run_manova(self):
    """Test ``cpt1``, ``dept1`` and ``jelt1`` against ``a01`` and ``a08``.

    Fits the MANOVA on ``self.data.feature_df`` and prints the type of the
    test-result object followed by its summary.  Nothing is returned.
    """
    # https://stackoverflow.com/questions/51553355/how-to-get-pvalue-from-statsmodels-manova
    model = MANOVA.from_formula(
        'cpt1 + dept1 + jelt1 ~ C(a01) + C(a08) + C(a01) * C(a08)',
        self.data.feature_df,
    )
    tested = model.mv_test()
    print(type(tested))
    print(tested.summary())
def fit_linear_reg2(self, X, y):
    """Run a MANOVA of ``X`` on ``y`` and return its first statistic row.

    :return: ``(Wilks_lambda, F_value, p_value, table)`` where the three
        scalars are columns 0, 3 and 4 of the first row of ``table``, the
        ``stat`` frame of the ``y.values`` term.
    """
    dp = pd.concat([X, y], axis=1)
    # The formula names the local variables ``X`` and ``y`` directly, so
    # they must stay visible under exactly these names in this frame.
    fitted = MANOVA.from_formula('X.values~ y.values', data=dp)
    stat_table = fitted.mv_test().results['y.values']['stat']
    first_row = stat_table.iloc[0]
    return first_row.iloc[0], first_row.iloc[3], first_row.iloc[4], stat_table
def global_threshold_consensus(partition_list_in, x_org, constrain=0,
                               thresh_abs_mag=1, method=1, res=40, plot=False,
                               supervised=False, truth=[]):
    """Pick one global consensus threshold and return the consensus for it.

    Each candidate threshold is applied with ``BGC.basic_consensus`` and the
    resulting labelling of ``x_org`` is scored.  A score row holds, in
    order: silhouette, Calinski-Harabasz, Davies-Bouldin, number of
    clusters, the MANOVA Hotelling-Lawley F value and, when ``supervised``
    is true, the micro F1 against ``truth``.  Single-cluster labelings get
    the fixed row ``[0, 0, 1, 1, 0]`` (plus ``0`` when supervised).
    ``naive_max_thresh`` chooses the best threshold from these rows.

    ``truth`` is only read, never mutated.

    Returns the result of ``BGC.basic_consensus`` at the chosen threshold.
    """
    step = 1 / res
    edge = (1 - thresh_abs_mag) / 2
    # Candidate thresholds: multiples of ``step`` lying in [edge, 1 - edge].
    candidates = [step * k for k in range(1, res)]
    thresholds_values = np.array(
        [t for t in candidates if edge <= t <= 1 - edge], dtype=float
    ).round(decimals=3)

    print("Beginning Analysis of ideal global threshold value...")
    partition_measures = []
    for thresh in thresholds_values:
        consensus, _ = BGC.basic_consensus(partition_list_in, thresh)
        labels = BGC.output_to_array(consensus, x_org.shape[0])

        # MANOVA raises ValueError for labelings it cannot handle; score 0.
        try:
            mv_results = MANOVA(endog=x_org, exog=labels).mv_test().results
            hotelling_f = mv_results['x0']['stat']['F Value'][
                'Hotelling-Lawley trace']
        except ValueError:
            hotelling_f = 0

        n_clusters = len(set(labels))
        if n_clusters > 1:
            measure = [
                silhouette_score(x_org, labels),
                calinski_harabasz_score(x_org, labels),
                davies_bouldin_score(x_org, labels),
                n_clusters,
                hotelling_f,
            ]
            if supervised:
                measure.append(f1_score(truth, labels, average='micro'))
        else:
            measure = [0, 0, 1, 1, 0]
            if supervised:
                measure.append(0)
        partition_measures.append(measure)

    best = naive_max_thresh(thresholds_values, partition_measures, step,
                            method=method, constrain=constrain, plot=plot,
                            supervised=supervised)
    out = BGC.basic_consensus(partition_list_in, best)
    final_labels = BGC.output_to_array(out[0], x_org.shape[0])
    print("Threshold:", best, ": Number of Clusters", (len(set(final_labels))))
    print("Consensus Achieved")
    return out
def fit_linear_reg(self, X, y):
    """Run a MANOVA of ``X`` (plus a column of ones) on ``y``.

    A ``constant`` column filled with 1.0 is appended to ``X`` before the
    model is built.

    :return: ``(Wilks_lambda, F_value, p_value, table)`` where the three
        scalars are columns 0, 3 and 4 of the first row of ``table``, the
        ``stat`` frame of the ``y.values`` term.
    """
    ones = pd.DataFrame({'constant': np.ones(X.shape[0])})
    # Rebind ``X``: the formula below names the locals ``X`` and ``y``
    # directly and must see the augmented frame.
    X = pd.concat([X, ones], axis=1)
    dp = pd.concat([X, y], axis=1)
    fitted = MANOVA.from_formula('X.values~ y.values', data=dp)
    stat_table = fitted.mv_test().results['y.values']['stat']
    first_row = stat_table.iloc[0]
    return first_row.iloc[0], first_row.iloc[3], first_row.iloc[4], stat_table
def mvsExp(exps):
    """Print a MANOVA and three per-response OLS summaries for ``exps``.

    The MANOVA tests ``rise_times``, ``errors`` and ``energy`` jointly
    against ``ce``; each response is then regressed separately on
    ``cr + ce + cs + cg``.
    """
    # MANOVA
    joint = MANOVA.from_formula('rise_times + errors + energy ~ ce',
                                data=exps)
    print(joint.mv_test())

    # Multiple linear regression, one model per response
    for response in ('rise_times', 'errors', 'energy'):
        fitted = ols(formula=response + ' ~ cr + ce + cs + cg',
                     data=exps).fit()
        print(fitted.summary())
def test_manova_test_input_validation():
    """``mv_test`` accepts conforming hypothesis matrices, rejects bad shapes.

    Each hypothesis is a ``(name, second, third)`` tuple.  A three-column
    second matrix is accepted and a two-column one raises ``ValueError``;
    a three-row third matrix is accepted and a two-row one raises
    ``ValueError``.
    """
    mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)

    # Second element (contrast matrix L) with three columns: accepted.
    hypothesis = [('test', np.array([[1, 1, 1]]), None)]
    mod.mv_test(hypothesis)

    # Contrast matrix with only two columns: rejected.
    hypothesis = [('test', np.array([[1, 1]]), None)]
    assert_raises(ValueError, mod.mv_test, hypothesis)
    # Disabled stricter check on the error message:
    # assert_raises_regex(ValueError,
    #                     ('Contrast matrix L should have the same number of '
    #                      'columns as exog! 2 != 3'),
    #                     mod.mv_test, hypothesis)

    # Third element with three rows: accepted.
    hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1], [1]]))]
    mod.mv_test(hypothesis)

    # Third element with only two rows: rejected.
    hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1]]))]
    assert_raises(ValueError, mod.mv_test, hypothesis)
def test_manova_sas_example():
    """Formula-interface MANOVA reproduces the SAS reference output.

    Every statistic of the ``Loc`` term is compared against the values
    quoted from the SAS documentation referenced below.
    """
    # Results should be the same as figure 4.5 of
    # https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/
    # viewer.htm#statug_introreg_sect012.htm
    model = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)
    stat_table = model.mv_test()['Loc']['stat']

    stat_names = ("Wilks' lambda", "Pillai's trace",
                  "Hotelling-Lawley trace", "Roy's greatest root")
    # (column, [(expected, decimal places), ...]) -- inner list follows the
    # order of ``stat_names``.
    expected = [
        ('Value', [(0.60143661, 8), (0.44702843, 8),
                   (0.58210348, 8), (0.35530890, 8)]),
        ('F Value', [(0.77, 2), (0.86, 2), (0.75, 2), (1.07, 2)]),
        ('Num DF', [(6, 3), (6, 3), (6, 3), (3, 3)]),
        ('Den DF', [(16, 3), (18, 3), (9.0909, 4), (9, 3)]),
        ('Pr > F', [(0.6032, 4), (0.5397, 4), (0.6272, 4), (0.4109, 4)]),
    ]
    for column, checks in expected:
        for stat_name, (value, places) in zip(stat_names, checks):
            assert_almost_equal(stat_table.loc[stat_name, column], value,
                                decimal=places)
def manova(datacol, label, variable_cols):
    """
    Run a MANOVA with dataset membership as the factor, e.g. to check
    for batch effects: is a significant share of the variance explained by
    which dataset a sample came from?

    For more documentation see:
    https://www.statsmodels.org/stable/generated/statsmodels.multivariate.manova.MANOVA.html

    :param datacol: A DataCollection object storing the datasets
    :param label: Name of the label column to create; it is the factor of
        the MANOVA
    :param variable_cols: Subset of features used as the MANOVA variables
    :return: The summary of the multivariate test results
        (``mv_test().summary()``).
    """
    # One combined frame, with the originating dataset recorded in `label`.
    combined = datacol.combine_dfs(label, variable_cols)
    # Formula with the label on the right-hand side.
    model_formula = construct_formula(label, variable_cols, label_side="r")
    fitted = MANOVA.from_formula(model_formula, combined)
    return fitted.mv_test().summary()
def Hypo5():
    """Print several group-comparison tests on comment counts per group.

    Data come from ``Luxury_vs_NonLuxury(False)``.  Printed, in order: a
    one-way ANOVA between the two group labels, a Kruskal-Wallis test, a
    MANOVA, an OLS summary and its type-2 ANOVA table.

    :return: the two-column frame (``Groups``, ``NbComments``) that was used.
    """
    Groups, NbComments = Luxury_vs_NonLuxury(False)
    df = pd.DataFrame({'Groups': Groups, 'NbComments': NbComments})

    comment_counts = df['NbComments']
    group_labels = df['Groups']
    first_group = comment_counts[group_labels == 'Luxary']
    second_group = comment_counts[group_labels == 'NonLuxuary']
    print(stats.f_oneway(first_group, second_group))

    # Disabled alternatives kept from the original:
    # df['Groups'].replace({1: 'Luxary', 2: 'NonLuxuary'}, inplace= True)
    print(stats.kruskal(Groups, NbComments))
    # print(stats.kruskal(df['Groups'].tolist(),df['NbComments'].tolist()))

    multivariate = MANOVA.from_formula('Groups ~ C(NbComments)', data=df)
    print(multivariate.mv_test())

    fitted = ols('NbComments ~ Groups', data=df).fit()
    print(fitted.summary())
    print(sm.stats.anova_lm(fitted, typ=2))
    return df
def test_statistic(self, matrix_X, matrix_Y):
    """
    Computes the Manova test statistic between two datasets.

    - delegates to statsmodels.multivariate.manova's implementation

    :param matrix_X: a [n*p] data matrix, a matrix with n samples in p dimensions, where p >= 2
    :type matrix_X: 2D numpy.array

    :param matrix_Y: a [n*q] data matrix, a matrix with n samples in q dimensions
    :type matrix_Y: 2D numpy.array

    :return: returns a list of two items, that contains:

        - :test_statistic: the manova test statistic
        - :test_statistic_metadata: (optional) a ``dict`` of metadata that the independence tests computes in the process (always empty here)
    :rtype: float, dict

    **Example:**

    >>> import numpy as np
    >>> from mgcpy.independence_tests.manova import Manova

    >>> X = np.array([0.07487683, -0.18073412, 0.37266440, 0.06074847, 0.76899045,
    ...               0.51862516, -0.13480764, -0.54368083, -0.73812644, 0.54910974]).reshape(-1, 2)
    >>> Y = np.array([-1.31741173, -0.41634224, 2.24021815, 0.88317196, 2.00149312]).reshape(-1, 1)
    >>> manova = Manova()
    >>> manova_stat = manova.test_statistic(X, Y)
    """
    same_sample_count = matrix_X.shape[0] == matrix_Y.shape[0]
    assert same_sample_count, "Matrices X and Y need to be of dimensions [n, p] and [n, q], respectively, where p can be equal to q"

    fitted = MANOVA(matrix_X, matrix_Y)
    stat_table = fitted.mv_test().results['x0']['stat']
    # Row 1, column 0 of the statistic table: the value of Pillai's trace.
    self.test_statistic_ = stat_table.values[1, 0]
    self.test_statistic_metadata_ = {}

    return self.test_statistic_, self.test_statistic_metadata_
def multivariate_anova():
    """Write the minimum sliding-window MANOVA p-value of every BoP to a file.

    For each BoP with more than two CpG columns, every window of three
    consecutive columns is tested with ``cpg1 + cpg2 + cpg3 ~ age``.  The
    p-value read is entry ``[3, 4]`` of the ``age`` statistic table (in the
    statsmodels layout that is the ``Pr > F`` of Roy's greatest root --
    confirm if the layout matters).  The smallest value over all windows is
    written to ``bop_manova.txt`` as ``<BoP name>\\t<p-value>``.

    Progress is printed as plain counters.  The output file is closed on
    every exit path.

    :return: an empty dict; the code that would fill it is disabled.
    """
    cpg_data = correct_cpg_data()
    print(1)
    cpg_to_bop = cpg_sites_to_bops.get_cpg_to_bop_dictionary(cpg_data)
    print(2)
    column_dict = get_column_dict(cpg_to_bop)
    print(3)
    # Release the large intermediates before the long loop.
    del cpg_data
    del cpg_to_bop

    p_val_dic = {}
    with open('bop_manova.txt', 'w', encoding='utf-8') as file:
        # NOTE(review): the header is space-separated while the rows are
        # tab-separated; kept as is because readers may rely on it.
        file.write('BoP_name p_value\n')
        ages = get_ages()
        j = 0
        for bop_name, column_lst in column_dict.items():
            size = len(column_lst)
            if size > 2:
                p_val_list = []
                for i in range(size - 2):
                    df = DataFrame({
                        'cpg1': column_lst[i],
                        'cpg2': column_lst[i + 1],
                        'cpg3': column_lst[i + 2],
                        'age': ages
                    })
                    model = MANOVA.from_formula('cpg1 + cpg2 + cpg3 ~ age', df)
                    test = model.mv_test()
                    p_val_list.append(
                        test.results['age']['stat'].values[3, 4])
                minimum = min(p_val_list)
                file.write(bop_name + '\t' + str(minimum) + '\n')
                # p_val_dic.update({bop_name: minimum})
            print(j)
            j += 1
    return p_val_dic
def randomization(self):
    """Repeat a shuffled-data MANOVA and model fit ``self.ni`` times.

    Per iteration: column 1 of ``self.df`` and the ``self.nc`` columns
    after it are passed through ``self.shuffling``; the frame is rebuilt
    and handed (without its last column) to the ``cal_<self.no>`` method;
    the last column of ``self.df`` is re-attached and only rows whose
    ``Set`` equals ``'Sub_train'`` are kept.  On those rows a MANOVA of the
    ``self.desc`` columns on the shuffled response is run, ``self.model``
    is refitted, and its training accuracy (in percent) is computed.

    :return: ``(C, A)`` -- per-iteration entry ``[0, 0]`` of the MANOVA
        statistic table, and per-iteration accuracy.
    """
    stat_values, accuracies = [], []
    for _ in range(0, self.ni):
        first_col = self.df.iloc[:, 0:1]
        yr = self.shuffling(self.df.iloc[:, 1:2])
        n_shuffled = self.nc
        shuffled_block = self.shuffling(self.df.iloc[:, 2:n_shuffled + 2])
        remainder = self.df.iloc[:, n_shuffled + 2:]
        rebuilt = pd.concat([first_col, yr, shuffled_block, remainder],
                            axis=1)

        # Pick the calculation variant; all variants get the frame minus
        # its last column.
        if self.no == 1:
            computed = self.cal_1(rebuilt.iloc[:, 0:-1], n_shuffled)
        elif self.no == 2:
            computed = self.cal_2(rebuilt.iloc[:, 0:-1], n_shuffled)
        elif self.no == 3:
            computed = self.cal_3(rebuilt.iloc[:, 0:-1], n_shuffled)
        elif self.no == 4:
            computed = self.cal_4(rebuilt.iloc[:, 0:-1], n_shuffled)

        last_col = self.df.iloc[:, -1:]
        computed = pd.concat([computed, last_col], axis=1)
        dfbjtr = computed[computed['Set'] == 'Sub_train']

        # ``xrd`` and ``yr`` are named inside the formula string, so these
        # two locals must keep exactly these names in this frame.
        xrd = dfbjtr[self.desc]
        yr = dfbjtr[yr.columns]
        table = MANOVA.from_formula(
            'xrd.values~ yr.values',
            data=dfbjtr).mv_test().results['yr.values']['stat']

        self.model.fit(xrd, yr)
        predicted = self.model.predict(xrd)
        acc = accuracy_score(yr, predicted) * 100

        stat_values.append(table.iloc[0, 0])
        accuracies.append(np.mean(acc))
    return stat_values, accuracies
def mmr_with_fig(endog, exog, dataset, basepath):
    """Run a MANOVA plus per-response OLS fits and save a coefficient heatmap.

    Steps:

    1. MANOVA of ``endog`` on ``exog``; the summary frame is written to
       ``<basepath>/multivariate_results.csv``.
    2. For every term, the first row of its statistic table gives the
       p-value (terms with p < 0.05 are recorded as significant) and a
       partial eta squared, which is printed.
    3. One OLS model per response (``<response> ~ <all exog columns>``) is
       fitted on ``dataset``; p-values are Bonferroni-adjusted.
    4. A heatmap of the coefficients (intercept excluded), annotated with
       the adjusted p-values, is saved to ``<basepath>/univeriate.png``.

    The multivariate tests are computed once and reused, and the result
    frames are assembled without ``DataFrame.append`` (removed in
    pandas 2.0).

    :param endog: response data; passed to ``MANOVA``.
    :param exog: predictor data frame; its column names build the OLS
        formula and label the result columns.
    :param dataset: frame the OLS formulas are evaluated against.
    :param basepath: output directory for the CSV and PNG files.
    :return: frame of OLS coefficients, one row per response and one
        column per term (``Intercept`` first).
    """
    manova = MANOVA(endog=endog, exog=exog)
    mv_results = manova.mv_test()
    mv_results.summary_frame.to_csv(f"{basepath}/multivariate_results.csv")

    sig_key = []
    for key, (_, output) in zip(mv_results.exog_names,
                                mv_results.results.items()):
        stat = output["stat"]
        # The statistic table is indexed by statistic name, so its first
        # row has to be taken by position.
        p_val = stat["Pr > F"].iloc[0]
        key = (" ").join(key.split("_"))
        if p_val < 0.05:
            sig_key.append((key, p_val))
        # partial eta square
        f_val = stat["F Value"].iloc[0]
        den_df = stat["Den DF"].iloc[0]
        num_df = stat["Num DF"].iloc[0]
        par_eta_sqr = num_df * f_val / (num_df * f_val + den_df)
        print("partical eta squared of {}: {}".format(key, par_eta_sqr))

    if not sig_key:
        sig_key.append(("None", "N/A"))

    term_names = ["Intercept"] + exog.columns.tolist()
    iv_formula = " + ".join(exog.columns.tolist())
    coef_rows = []
    pval_rows = []
    for dv in manova.endog_names:
        univeriate = smf.ols(formula=f"{dv} ~ {iv_formula}",
                             data=dataset).fit()
        print(univeriate.summary())
        # (reject flags, adjusted p-values, Sidak alpha, Bonferroni alpha)
        p_adjust = multipletests(univeriate.pvalues, alpha=0.05,
                                 method="bonferroni")
        coef_rows.append(univeriate.params)
        df_p_adjust = pd.DataFrame(
            np.array([p_adjust[0], p_adjust[1]]).T,
            index=term_names,
            columns=["Sig.", "p_adjusted"],
        )
        pval_rows.append(df_p_adjust.iloc[:, 1])
        print(df_p_adjust)
        print("Bonferroni corrected alpha (0.05): {}\n".format(p_adjust[-1]))

    # One row per response, in the order the models were fitted.
    df_coef = pd.DataFrame(coef_rows).reset_index(drop=True)
    df_pval = pd.DataFrame(pval_rows).reset_index(drop=True)
    df_coef.index = manova.endog_names
    df_pval.index = df_coef.index
    df_coef.columns = term_names

    plt.figure(figsize=(13, 7))
    sns.heatmap(
        df_coef.iloc[:, 1:],
        cmap="PiYG_r",
        square=False,
        center=0,
        annot=df_pval.iloc[:, 1:],
    )
    plt.title("Full univariate results")
    # Only the first significant term is reported in the caption.
    plt.annotate(
        f"""
* Value in each cell is Bonferroni corrected p-value.
** {sig_key[0][0]} is significant at multivatiate level.
    p = {sig_key[0][1]}""",
        (0, 0),
        (0, -70),
        xycoords="axes fraction",
        textcoords="offset points",
        va="top",
    )
    plt.tight_layout()
    plt.savefig(f"{basepath}/univeriate.png", dpi=300, transparent=True)
    return df_coef
def significanceTesting(featureDf2,
                        pairwiseClustersToCompare,
                        confidence=0.05,
                        foldchange=2,
                        responseCutoff=0.1,
                        errorCorrection='bonferroni'):
    """Find features that differ between pairs of clusters.

    A MANOVA of all feature columns against ``Cluster`` gates the analysis.
    If its p-value is below ``confidence``, every feature is tested for each
    cluster pair and a fold change is computed; features passing both the
    p-value and the fold-change criterion are reported.

    :param featureDf2: frame whose last column is ``Cluster`` and whose
        other columns are features (the column index is queried for a
        level named ``'Feature'``).
    :param pairwiseClustersToCompare: iterable of ``(comp1, comp2)`` pairs;
        each is compared to the ``Cluster`` column via ``str()``.
    :param confidence: significance level for the MANOVA gate and the base
        level for the per-feature tests.
    :param foldchange: minimum fold change; compared on a log2 scale.
    :param responseCutoff: group means/medians below this value are treated
        as "no response" and replaced by fixed fold changes (4 or 0.0001).
    :param errorCorrection: ``'bonferroni'`` divides ``confidence`` by the
        number of features; ``'holm-bonferroni'`` applies a rank-dependent
        level to the uncorrected ``confidence``; anything else applies no
        correction.
    :return: ``(dataMatrix, significantArray)``: the stacked per-pair
        ``[log2 fold change, -log10 p-value]`` rows and the list of
        significant feature names; ``([], [])`` when the MANOVA gate fails.

    NOTE(review): as written the function prints the input frame and calls
    ``sys.exit(0)`` before the MANOVA, so nothing after that line runs.
    This looks like leftover debugging -- confirm and remove.
    """
    # Number of feature columns (everything except the Cluster column).
    n = len(featureDf2.columns) - 1
    if errorCorrection == 'bonferroni':
        alpha = confidence / n
    else:
        alpha = confidence
    # De-duplicated cluster pairs; not used further down in this function.
    uniqueClusters = [
        list(x) for x in set(tuple(x) for x in pairwiseClustersToCompare)
    ]
    #Kruskal Wallis is unecessary; one way anova seems to be relatively robust to non-normality: http://www.biostathandbook.com/kruskalwallis.html
    # endog/exog are assigned but not used further down in this function.
    endog = featureDf2.iloc[:, :-1]
    exog = featureDf2.iloc[:, -1]
    # Q('...') quotes each column name so arbitrary names work in a formula.
    modelFormula = " + ".join("Q(\'" + featureDf2.columns[:-1] +
                              "\')") + " ~ Cluster"
    print(featureDf2)
    # NOTE(review): terminates the whole process; see the docstring.
    sys.exit(0)
    manova = MANOVA.from_formula(modelFormula, data=featureDf2)
    #Pillai's trace is most robust against deviations from assumptions of manova
    # Row 1 / column 4 of the statistic table (Pillai's trace p-value per
    # the comment above).
    manovapval = manova.mv_test().results['Cluster']['stat'].iloc[1, 4]
    print(manovapval)
    #Need to think about how to handle multiple clusters; for now just iterate through all pairs
    if manovapval < confidence:
        allDataMatrices = []
        allSignificantDifferences = []
        for clustersToCompare in pairwiseClustersToCompare:
            comp1 = clustersToCompare[0]
            comp2 = clustersToCompare[1]
            group1 = featureDf2[featureDf2['Cluster'] == str(
                comp1)].iloc[:, :-1]
            group2 = featureDf2[featureDf2['Cluster'] == str(
                comp2)].iloc[:, :-1]
            # Overall test between the two clusters; only used for the
            # informational print below.
            anova = scipy.stats.kruskal(group1, group2)
            pval2 = anova[1]
            stat = anova[0]
            if pval2 < 0.01:
                print('Different')
            significantArray = []
            allBoxPairs = []
            pvalList = []
            meanFoldChangeList = []
            medianFoldChangeList = []
            foldChangeList = []
            normalityList = []
            tempnormalityList = []
            # Pass 1: one p-value per feature for this cluster pair.
            for col in range(featureDf2.shape[1] - 1):
                group1 = featureDf2[featureDf2['Cluster'] == str(
                    comp1)].iloc[:, col]
                group2 = featureDf2[featureDf2['Cluster'] == str(
                    comp2)].iloc[:, col]
                normalitypval = shapiro(group1)[1]
                normalitypval2 = shapiro(group2)[1]
                normalityCondition = False
                # NOTE(review): a Shapiro p-value below 0.05 normally means
                # normality is rejected, yet this branch treats it as
                # "normal" and uses the t-test -- confirm the intent.
                if normalitypval < 0.05 and normalitypval2 < 0.05:
                    normalityCondition = True
                    try:
                        pval = scipy.stats.ttest_ind(group1, group2)[1]
                    # NOTE(review): bare except hides every error; a failed
                    # test silently becomes p = 0.5.
                    except:
                        pval = 0.5
                else:
                    try:
                        pval = scipy.stats.mannwhitneyu(group1, group2)[1]
                    # NOTE(review): same silent fallback as above.
                    except:
                        pval = 0.5
                pvalList.append(pval)
                tempnormalityList.append(normalityCondition)
            #For holm bonferroni
            ordered_pval_list = sorted(pvalList)
            # Pass 2: apply the significance level and compute fold changes.
            for col in range(featureDf2.shape[1] - 1):
                pvalCondition = False
                foldChangeCondition = False
                group1 = featureDf2[featureDf2['Cluster'] == str(
                    comp1)].iloc[:, col]
                group2 = featureDf2[featureDf2['Cluster'] == str(
                    comp2)].iloc[:, col]
                pval = pvalList[col]
                if errorCorrection != 'holm-bonferroni':
                    if pval < alpha:
                        pvalCondition = True
                else:
                    # Rank of this p-value among all features (1 = smallest);
                    # tied values all receive the rank of the first match.
                    rank = ordered_pval_list.index(pval) + 1
                    modifiedAlpha = alpha / (n - rank + 1)
                    if pval < modifiedAlpha:
                        pvalCondition = True
                normalityCondition = tempnormalityList[col]
                # Fold change from means ("normal" features) or medians
                # (all others).  Groups below responseCutoff get fixed
                # placeholder ratios instead of a real quotient.
                if normalityCondition:
                    if np.nanmean(group1) < responseCutoff:
                        if np.nanmean(group2) >= responseCutoff:
                            meanFoldChangeList.append(4)
                            foldChangeList.append(4)
                        else:
                            meanFoldChangeList.append(0.0001)
                            foldChangeList.append(0.0001)
                    else:
                        if np.nanmean(group2) < responseCutoff:
                            meanFoldChangeList.append(4)
                            foldChangeList.append(4)
                        else:
                            meanFoldChangeList.append(
                                np.nanmean(group1) / np.nanmean(group2))
                            foldChangeList.append(
                                np.nanmean(group1) / np.nanmean(group2))
                else:
                    if np.nanmedian(group1) < responseCutoff:
                        if np.nanmedian(group2) >= responseCutoff:
                            medianFoldChangeList.append(4)
                            foldChangeList.append(4)
                        else:
                            medianFoldChangeList.append(0.0001)
                            foldChangeList.append(0.0001)
                    else:
                        if np.nanmedian(group2) < responseCutoff:
                            medianFoldChangeList.append(4)
                            foldChangeList.append(4)
                        else:
                            medianFoldChangeList.append(
                                np.nanmedian(group1) / np.nanmedian(group2))
                            foldChangeList.append(
                                np.nanmedian(group1) / np.nanmedian(group2))
                # Significant = passes the p-value level AND the absolute
                # log2 fold change reaches log2(foldchange).
                if pvalCondition:
                    if abs(np.log2(foldChangeList[-1])) >= np.log2(foldchange):
                        significantArray.append(
                            featureDf2.columns.get_level_values('Feature')
                            [col])
                        allBoxPairs.append(
                            ((featureDf2.columns.get_level_values('Feature')
                              [col], str(comp1)),
                             (featureDf2.columns.get_level_values('Feature')
                              [col], str(comp2))))
                        normalityList.append(normalityCondition)
            # Two rows per cluster pair: log2 fold change and -log10 p-value.
            foldChangeArray = np.log2(np.array(foldChangeList))
            pvalArray = -np.log10(np.array(pvalList))
            dataMatrix = np.vstack([foldChangeArray, pvalArray])
            allSignificantDifferences.append(significantArray)
            allDataMatrices.append(dataMatrix)
        # Union of the significant features over all pairs (order not kept).
        significantArray = list(set().union(*allSignificantDifferences))
        dataMatrix = np.vstack(allDataMatrices)
    else:
        significantArray = []
        dataMatrix = []
    print(significantArray)
    return dataMatrix, significantArray
df["strike_count"] = df["strike_count"].str.replace("s_count_", "") # ------------ Check 2nd component --------------- n = [len(df.get_group(gr)) for gr in groups] c = 1 y = embeddings[:, c - 1] plt.scatter(n, y) plt.show() # ------------ MANOVA ------------------- manova = MANOVA.from_formula( "c0+c1+c2+c3+c4+c5+c6+c7+c8+c9~umpire+ball_count*strike_count", data=df) table = manova.mv_test() res = pd.DataFrame( {term: table.results[term]["stat"].iloc[0] for term in table.results}).T components_names = [ "Smaller", "Uncertain", "High inside excluded", "Wide bottom", "Wide middle", "Wide top", "NW/SE diagonal",
# Append the values of `indice` to the frame as column 'y'.
Series = pd.concat(
    [
        Series.reset_index(drop=True),
        pd.DataFrame(indice.tolist(), columns=['y']),
    ],
    axis=1,
)
#%%
####################################################
# MANOVA test (several test statistics)
####################################################
import pandas as pd
from statsmodels.multivariate.manova import MANOVA

# Every column listed before '~' is a response; 'y' is the only predictor.
maov = MANOVA.from_formula(
    'AA+AAL+AAP+AAPL+AB+ABBV+ABC+ABM+ABMD+ABT+ACAD+ACHN+ACIW+ACN+ACOR+'
    'ADBE+ADI+ADM+ADP+ADSK+AEE+AEO+AEP+AES+AFL+AG+AGIO+AGN+AIG+AINV+AIV+'
    'AKAM+AKS+ALK+ALL+ALNY+AMAT+AMD+AMGN+AMP+AMTD+AMZN+AN+ANTM+APA+APC+'
    'ARCC+ARLP+ARNA+ARR+ASH+ATI+ATVI+AUY+AVB+AVP+AVXL+AVY+AWK+AXP+AZN+'
    'BABA+BAC+BDX+BUD+CS+DAL+DD+FNMA+GOOG+GOOGL+LH+LLY+LUV+MO+MT+NAT+NLY+'
    'NVO+PAA+T+UA+UBS+WBA~ y',
    data=Series,
)
#%%
##############################################################################
# MANOVA result. H0: equality of the means, given the covariances
##############################################################################
print(maov.mv_test())
#%%
##########################
# Tracy-Widom test
##########################
##########################
# TW distribution, F1
##########################
# Smallest t_1 value among the entries where F1 is at least 0.90.
f90 = t_1[F1 >= .90].min()
def _manova_wilks_lambda(endog, exog):
    """Return entry [0][0] of the ``x0`` statistic table (Wilks' lambda)."""
    stat = np.array(MANOVA(endog=endog, exog=exog).mv_test()['x0']['stat'])
    return stat[0][0]


def manova(test_row, data, categorical):
    """Score a test row by how it changes Wilks' lambda of two baselines.

    ``data`` (without missing rows) and ``test_row`` are encoded together:
    columns named in ``categorical`` are label-encoded, all others are
    standardised.  The baseline rows are split on column ``10`` (0 = good,
    1 = bad).  Within each group a MANOVA of the remaining columns on
    column ``9`` is run twice: on the baseline alone and on the baseline
    plus the test row.

    The test row is now actually removed from the baseline after encoding;
    before, the result of ``DataFrame.drop`` was discarded, so the row
    stayed in one of the groups.  It is also appended under a fresh index
    label, so it can no longer overwrite an existing row after ``dropna``.

    :param test_row: values for one new row, in the column order of ``data``.
    :param data: baseline frame; columns ``9`` and ``10`` must exist.  The
        caller's frame is not modified.
    :param categorical: names of the columns to label-encode.
    :return: dict with keys ``method``, ``WL_good``, ``WL_test_good``,
        ``WL_bad`` and ``WL_test_bad``.
    """
    # reset_index yields a new frame with labels 0..n-1, so ``len(data)``
    # is guaranteed to be an unused label for the appended test row.
    data = data.dropna().reset_index(drop=True)
    data.loc[len(data)] = test_row

    le = LabelEncoder()
    for val in categorical:
        data[val] = le.fit_transform(data[val])
    for col in data.columns:
        if col not in categorical:
            data[col] = (data[col] - np.mean(data[col])) / np.std(data[col])

    # Separate the encoded test row from the baseline again.
    test_frame = data.iloc[[-1]]
    data = data.iloc[:-1]

    data_good = data[data[10] == 0]
    data_bad = data[data[10] == 1]
    x_good = data_good.drop([10, 9], axis=1)
    y_good = data_good[[9]]
    x_bad = data_bad.drop([10, 9], axis=1)
    y_bad = data_bad[[9]]
    x = test_frame.drop([10, 9], axis=1)
    y = test_frame[[9]]

    WL_good = _manova_wilks_lambda(x_good, y_good)
    WL_bad = _manova_wilks_lambda(x_bad, y_bad)
    WL_test_good = _manova_wilks_lambda(pd.concat([x_good, x]),
                                        pd.concat([y_good, y]))
    WL_test_bad = _manova_wilks_lambda(pd.concat([x_bad, x]),
                                       pd.concat([y_bad, y]))

    scorecard = {
        "method": "MANOVA",
        "WL_good": WL_good,
        "WL_test_good": WL_test_good,
        "WL_bad": WL_bad,
        "WL_test_bad": WL_test_bad
    }
    return scorecard
with open(encoder_path, "rb") as f: _, embeddings, groups, _, _ = pickle.load(f) ids = [groups.index(gr) for gr, _ in df if gr in groups] embeddings = embeddings[ids, :] groups = [groups[i] for i in ids] df = pd.DataFrame(embeddings, index=pd.MultiIndex.from_tuples(groups)).reset_index() df.columns = ["umpire", "score", "inning", *["c" + str(i) for i in range(10)]] # ------------ MANOVA ------------------- manova = MANOVA.from_formula( "c0+c1+c2+c3+c4+c5+c6+c7+c8+c9~umpire+score*inning", data=df) table = manova.mv_test() res = pd.DataFrame( {term: table.results[term]["stat"].iloc[0] for term in table.results}).T components_names = [ "Smaller", "Uncertain", "High inside excluded", "Wide bottom", "Wide middle", "Wide top", "NW/SE diagonal", "Irregular 1",
def save_top_manova(config, attributes_types, attribute_target, num_top=500,
                    window=3, test=MANOVATest.pillai_bartlett):
    """Rank BoPs by sliding-window MANOVA p-value and save the top hits.

    For every BoP, windows of ``window`` consecutive CpGs (restricted to
    CpGs present in the loaded data) are tested with
    ``cpg_0 + ... ~ <attributes>``.  The p-value of ``attribute_target``
    for the chosen statistic is taken per window and the minimum over the
    windows represents the BoP.  P-values are FDR-corrected
    (Benjamini-Hochberg) and the ``num_top`` best BoPs are saved together
    with their genes; the de-duplicated gene list is saved separately.

    The window bounds now follow ``window``; before, they were fixed for a
    width of three regardless of the argument.  Behaviour for the default
    ``window=3`` is unchanged.

    :param config: project configuration; ``approach_gd`` and ``dt`` are
        modified while saving and ``dt`` is reset to ``DataType.cpg``.
    :param attributes_types: ``Attribute`` / ``CellPop`` items forming the
        right-hand side of the formula.
    :param attribute_target: item whose ``value`` names the term whose
        p-value is extracted.
    :param num_top: number of best BoPs to keep.
    :param window: number of consecutive CpGs per MANOVA.
    :param test: ``MANOVATest`` member selecting the statistic; an
        unrecognised value falls back to the first row (Wilks).
    """
    dict_bop_cpgs = load_bop_cpg_dict(config)
    dict_bop_genes = get_dict_bop_genes(config, dict_bop_cpgs)
    cpgs, betas = load_cpg_data(config)

    # First position of every CpG: replaces repeated linear scans of the
    # list (``in`` / ``.index``) with constant-time lookups.
    cpg_position = {}
    for position, cpg in enumerate(cpgs):
        cpg_position.setdefault(cpg, position)

    atr_table = []
    atr_cols = []
    for atr_type in attributes_types:
        if isinstance(atr_type, Attribute):
            atr_table.append(get_attributes(config, atr_type))
        elif isinstance(atr_type, CellPop):
            atr_table.append(get_cell_pop(config, [atr_type]))
        atr_cols.append(atr_type.value)

    # Column names and formula are identical for every window.
    val_cols = ['cpg_' + str(cpg_id) for cpg_id in range(window)]
    cols = atr_cols + val_cols
    formula = ' + '.join(val_cols) + ' ~ ' + ' + '.join(atr_cols)

    # Row of the statistic table that holds the requested test.
    stat_row = 0
    statistic_order = (MANOVATest.wilks, MANOVATest.pillai_bartlett,
                       MANOVATest.lawley_hotelling, MANOVATest.roy)
    for row, candidate in enumerate(statistic_order):
        if test is candidate:
            stat_row = row
            break

    num_bops = 0
    bops_passed = []
    bops_pvals = []
    for bop in dict_bop_cpgs:
        cpgs_passed = [cpg for cpg in dict_bop_cpgs.get(bop)
                       if cpg in cpg_position]
        if len(cpgs_passed) < window:
            continue

        pvals_on_bop = []
        for win_id in range(len(cpgs_passed) - window + 1):
            val_table = [betas[cpg_position[cpg]]
                         for cpg in cpgs_passed[win_id:win_id + window]]
            # Transpose the list of columns into a list of rows.
            rows = list(map(list, zip(*(atr_table + val_table))))
            x = pd.DataFrame(rows, columns=cols)
            mv_test_res = MANOVA.from_formula(formula, x).mv_test()
            # Column 4 of the statistic table holds the p-values.
            pvals = mv_test_res.results[
                attribute_target.value]['stat'].values[0:4, 4]
            pvals_on_bop.append(pvals[stat_row])

        bops_passed.append(bop)
        bops_pvals.append(np.min(pvals_on_bop))
        num_bops += 1
        if num_bops % config.print_rate == 0:
            print('num_bops: ' + str(num_bops))

    _, pvals_corrected, _, _ = multipletests(bops_pvals, 0.05,
                                             method='fdr_bh')
    order = np.argsort(pvals_corrected)
    bops_opt = list(np.array(bops_passed)[order])[0:num_top]
    pvals_opt = list(np.array(pvals_corrected)[order])[0:num_top]

    genes_opt = []
    genes_from_bop = []
    seen_genes = set()
    for bop in bops_opt:
        curr_genes = dict_bop_genes.get(bop)
        genes_opt.append(';'.join(curr_genes))
        # Keep first-seen order while de-duplicating.
        for gene in curr_genes:
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_from_bop.append(gene)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [bops_opt, genes_opt, pvals_opt])

    # The gene list goes to the gene-level result path; ``dt`` is switched
    # back afterwards (``approach_gd`` is left as set here).
    config.approach_gd = GeneDataType.from_bop
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [genes_from_bop])
    config.dt = DataType.cpg
# MANOVA test in statsmodels
import pandas as pd
from statsmodels.multivariate.manova import MANOVA

# Data: the iris table from the Rdatasets mirror.
url = 'https://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv'
df = pd.read_csv(url, index_col=0)
# Replace the dots in the column names so they can be used as plain
# identifiers in the formula.  regex=False makes the '.' a literal dot on
# every pandas version instead of relying on the version-dependent default.
df.columns = df.columns.str.replace(".", "_", regex=False)
print(df.head())

# Run the MANOVA model: the four measurements against the species.
maov = MANOVA.from_formula(
    'Sepal_Length + Sepal_Width + '
    'Petal_Length + Petal_Width ~ Species',
    data=df)

# Print out the results.
print()  # print a blank line
print(maov.mv_test())

# source
# https://www.marsja.se/python-manova-made-easy-using-statsmodels/
def single(self, item, config, configs_child):
    """Compute the metrics of the configured method for one item.

    Dispatches on ``config.experiment.method`` and appends the computed
    values to the lists stored in ``config.metrics``. Metric names end with
    a suffix made of the first eight characters of ``config.hash``. After
    the method-specific part, the item and its auxiliary data are appended
    to ``config.metrics['item']`` and ``config.metrics['aux']``.

    Args:
        item: key of the item to process.
        config: configuration that provides the experiment settings, the
            data dictionaries and the ``metrics`` container.
        configs_child: child configurations; used by the polygon,
            z_test_linreg, ancova and aggregator methods.

    Raises:
        ValueError: in the formula method, if a requested cell type or
            observable is not present in the configuration.
        RuntimeError: in the pbc method, if the target does not have
            exactly two distinct values.
    """
    method = config.experiment.method
    suffix = f'_{config.hash[0:8]}'

    def _record_ols_fit(reg_res, y):
        # Shared bookkeeping of the two formula-based regression methods.
        params = dict(reg_res.params)
        bse = dict(reg_res.bse)
        pvalues = dict(reg_res.pvalues)
        config.metrics['mean' + suffix].append(np.mean(y))
        config.metrics['R2' + suffix].append(reg_res.rsquared)
        config.metrics['R2_adj' + suffix].append(reg_res.rsquared_adj)
        for key in params:
            config.metrics[key + suffix].append(params[key])
            config.metrics[key + '_std' + suffix].append(bse[key])
            config.metrics[key + '_p_value' + suffix].append(pvalues[key])

    if method == Method.heteroskedasticity:
        x = self.get_strategy.get_target(config, item)
        y = self.get_strategy.get_single_base(config, item)
        process_heteroscedasticity(x, y, config.metrics, suffix)

    elif method == Method.manova:
        bop_data = config.base_dict[item]
        passed_cpgs = [cpg for cpg in bop_data['cpg']
                       if cpg in config.target_dict]
        genes = list(bop_data['gene'])
        cl = bop_data['class']

        covariates = [
            val
            for values in config.experiment.method_params.values()
            for val in values
        ]
        covariates_rhs = ' + '.join(covariates)

        manova_dict = {}
        manova_dict.update(config.observables_dict.items())
        if len(config.cells_dict) > 0:
            manova_dict.update(config.cells_dict.items())
        for cpg_id, cpg in enumerate(passed_cpgs):
            manova_dict[f'cpg{cpg_id}'] = \
                self.get_strategy.get_single_base(config, cpg)
        df = pd.DataFrame(manova_dict)

        # Order matches rows 0..3 of the MANOVA statistics table.
        test_names = ('wilks', 'pillai_bartlett', 'lawley_hotelling', 'roy')
        # Smallest p-value seen so far per test and covariate; 1 is kept
        # when there is no CpG to test at all.
        best = {name: {cov: 1 for cov in covariates} for name in test_names}

        window = 3
        if len(passed_cpgs) >= window:
            # Slide a window of three consecutive CpGs over the item.
            for w_id in range(len(passed_cpgs) - window + 1):
                cpg_keys = [f'cpg{w_id + cpg_id}'
                            for cpg_id in range(window)]
                formula = ' + '.join(cpg_keys) + ' ~ ' + covariates_rhs
                mv_test_res = MANOVA.from_formula(formula, df).mv_test()
                for cov in covariates:
                    pvals = mv_test_res.results[cov]['stat'].values[0:4, 4]
                    for row, name in enumerate(test_names):
                        # Argument order is significant when a p-value is
                        # NaN, so it is kept as (new, current).
                        best[name][cov] = min(pvals[row], best[name][cov])
        elif len(passed_cpgs) > 0:
            # Too few CpGs for a window: run one ANOVA per CpG and report
            # the same p-value for all four tests.
            p_values = {cov: 1 for cov in covariates}
            for cpg_id in range(len(passed_cpgs)):
                formula = f'cpg{cpg_id}' + ' ~ ' + covariates_rhs
                anova = ols(formula, df).fit()
                anova_table = sm.stats.anova_lm(anova)
                for cov_id, cov in enumerate(covariates):
                    p_value = anova_table.values[cov_id, 4]
                    # Kept as (current, new); see the note above.
                    p_values[cov] = min(p_values[cov], p_value)
            best = {name: dict(p_values) for name in test_names}

        config.metrics['class' + suffix].append(cl)
        config.metrics['genes' + suffix].append(';'.join(genes))
        for cov in covariates:
            for name in test_names:
                config.metrics[f'{cov}_p_value_{name}' + suffix].append(
                    best[name][cov])

    elif method == Method.linreg:
        x = self.get_strategy.get_target(config, item)
        y = self.get_strategy.get_single_base(config, item)
        process_linreg(x, y, config.metrics, suffix)

    elif method == Method.cluster:
        x = self.get_strategy.get_target(config, item)
        y = self.get_strategy.get_single_base(config, item)
        process_cluster(x, y, config.experiment.method_params,
                        config.metrics, suffix)

    elif method == Method.formula:
        y = self.get_strategy.get_single_base(config, item)
        method_params = config.experiment.method_params

        exog_dict = {}
        for key, values in method_params.items():
            if key == 'cells':
                for val in values:
                    if val in config.cells_dict:
                        exog_dict[val] = self.get_strategy.get_cell(
                            config, key=val, item=item)
                    else:
                        raise ValueError(
                            f'Wrong cell type in formula: {val}')
            if key == 'observables':
                for val in values:
                    if val in config.observables_dict:
                        exog_dict[val] = self.get_strategy.get_observalbe(
                            config, key=val, item=item)
                    else:
                        raise ValueError(
                            f'Wrong observable in formula: {val}')

        # Categorical observables are wrapped in C(...) for the formula.
        exog_keys = []
        for exog_type in exog_dict:
            if config.is_observables_categorical.get(exog_type, False):
                exog_keys.append('C(' + exog_type + ')')
            else:
                exog_keys.append(exog_type)
        formula = 'cpg ~ ' + ' + '.join(exog_keys)

        exog_dict['cpg'] = y
        data_df = pd.DataFrame(exog_dict)
        reg_res = smf.ols(formula=formula, data=data_df).fit()
        _record_ols_fit(reg_res, y)

    elif method == Method.formula_new:
        y = self.get_strategy.get_single_base(config, item)
        formula = config.experiment.method_params['formula']

        dict_global = {}
        dict_global.update(config.observables_dict.items())
        if len(config.cells_dict) > 0:
            dict_global.update(config.cells_dict.items())
        dict_global['cpg'] = y

        data_df = pd.DataFrame(dict_global)
        reg_res = smf.ols(formula=formula, data=data_df).fit()
        _record_ols_fit(reg_res, y)

    elif method == Method.oma:
        x = self.get_strategy.get_target(config, item)
        y = self.get_strategy.get_single_base(config, item)

        lin_x = minmax_scale(x, feature_range=(0.0, 1.0))
        lin_y = minmax_scale(y, feature_range=(0.0, 1.0))
        # Rescale to [1, 10] first so that log10 maps onto [0, 1].
        log_x = np.log10(minmax_scale(x, feature_range=(1.0, 10.0)))
        log_y = np.log10(minmax_scale(y, feature_range=(1.0, 10.0)))

        scale_pairs = (
            ('lin_lin', lin_x, lin_y),
            ('lin_log', lin_x, log_y),
            ('log_lin', log_x, lin_y),
            ('log_log', log_x, log_y),
        )
        for name, scaled_x, scaled_y in scale_pairs:
            corr_coeff, p_value = pearsonr(scaled_x, scaled_y)
            config.metrics[name + '_corr_coeff' + suffix].append(corr_coeff)
            config.metrics[name + '_p_value' + suffix].append(p_value)

    elif method == Method.pbc:
        x = self.get_strategy.get_target(config, item)
        y = self.get_strategy.get_single_base(config, item)

        if len(set(x)) != 2:
            raise RuntimeError('x variable is not binary in pbc')

        # Split y into the two groups defined by the binary target.
        keys = list(set(x))
        d = {k: [] for k in keys}
        for x_id, x_val in enumerate(x):
            d[x_val].append(y[x_id])

        corr_coeff, p_value = pointbiserialr(x, y)
        if np.isnan(corr_coeff) or np.isnan(p_value):
            # Undefined correlation is recorded as "no effect".
            corr_coeff = 0.0
            p_value = 1.0
            anova_p_value = 1.0
            kw_p_value = 1.0
        else:
            _, anova_p_value = f_oneway(d[keys[0]], d[keys[1]])
            _, kw_p_value = kruskal(d[keys[0]], d[keys[1]])

        config.metrics['pbc_corr_coeff' + suffix].append(corr_coeff)
        config.metrics['pbc_p_value' + suffix].append(p_value)
        config.metrics['anova_p_value' + suffix].append(anova_p_value)
        config.metrics['kw_p_value' + suffix].append(kw_p_value)

    elif method == Method.polygon:
        xs = []
        ys = []
        metrics_keys = get_method_metrics_keys(config)
        for config_child in configs_child:
            update_parent_dict_with_children(metrics_keys, item, config,
                                             config_child)
            x = self.get_strategy.get_target(config_child, item)
            y = self.get_strategy.get_single_base(config_child, item)
            xs.append(x)
            ys.append(y)

        polygon_method = config.experiment.method_params['method']
        if polygon_method == Method.linreg:
            process_linreg_polygon(configs_child, item, xs, config.metrics,
                                   suffix)
        elif polygon_method == Method.variance:
            process_variance_polygon(configs_child, item, xs,
                                     config.metrics, suffix)

    elif method == Method.z_test_linreg:
        slopes = []
        slopes_std = []
        num_subs = []
        metrics_keys = get_method_metrics_keys(config)
        for config_child in configs_child:
            update_parent_dict_with_children(metrics_keys, item, config,
                                             config_child)
            item_id = config_child.advanced_dict[item]
            # Child metrics are stored under the child's own hash suffix.
            child_suffix = f'_{config_child.hash[0:8]}'
            slopes.append(
                config_child.advanced_data['slope' + child_suffix][item_id])
            slopes_std.append(
                config_child.advanced_data['slope_std' + child_suffix][
                    item_id])
            num_subs.append(
                len(config_child.observables_dict[
                    config_child.attributes.target]))
        process_z_test_slope(slopes, slopes_std, num_subs, config.metrics,
                             suffix)

    elif method == Method.ancova:
        x_all = []
        y_all = []
        category_all = []
        # NOTE(review): the result is not used in this branch; the call is
        # kept in case it has side effects - confirm and remove.
        metrics_keys = get_method_metrics_keys(config)
        for child_id, config_child in enumerate(configs_child):
            x = self.get_strategy.get_target(config_child, item,
                                             categorical=False)
            y = self.get_strategy.get_single_base(config_child, item)
            x_all += list(x)
            y_all += list(y)
            # Each child configuration is one category: 'a', 'b', ...
            category_all += [string.ascii_lowercase[child_id]] * len(x)

        df = pd.DataFrame({'x': x_all, 'y': y_all,
                           'category': category_all})
        results = ols('y ~ x * C(category)', df).fit()

        config.metrics['R2' + suffix].append(results.rsquared)
        config.metrics['R2_adj' + suffix].append(results.rsquared_adj)
        config.metrics['f_stat' + suffix].append(results.fvalue)
        config.metrics['prob(f_stat)' + suffix].append(results.f_pvalue)

        # The fitted terms are read by position. `.iloc` is used because
        # integer keys on a label-indexed Series are deprecated in pandas.
        term_names = ('intercept', 'category', 'x', 'x:category')
        term_stats = (
            ('', results.params),
            ('_std', results.bse),
            ('_pval', results.pvalues),
        )
        for stat_suffix, series in term_stats:
            for position, term in enumerate(term_names):
                config.metrics[term + stat_suffix + suffix].append(
                    series.iloc[position])

    elif method == Method.aggregator:
        metrics_keys = get_method_metrics_keys(config)
        for config_child in configs_child:
            update_parent_dict_with_children(metrics_keys, item, config,
                                             config_child)

    elif method == Method.variance:
        x = self.get_strategy.get_target(config, item)
        y = self.get_strategy.get_single_base(config, item)
        semi_window = config.experiment.method_params['semi_window']
        box_b = config.experiment.method_params['box_b']
        box_t = config.experiment.method_params['box_t']
        process_variance(x, y, semi_window, box_b, box_t, config.metrics,
                         suffix)

        xs = get_box_xs(x)
        ys_b, ys_t = fit_variance(xs, config.metrics, suffix)

        # Gap between the two fitted curves at the first and last point.
        diff_begin = abs(ys_t[0] - ys_b[0])
        diff_end = abs(ys_t[-1] - ys_b[-1])
        # NOTE(review): the ratio is undefined when the smaller gap is
        # zero - confirm this cannot happen for the fitted curves.
        config.metrics['increasing_div' + suffix].append(
            max(diff_begin, diff_end) / min(diff_begin, diff_end))
        config.metrics['increasing_sub' + suffix].append(
            abs(diff_begin - diff_end))
        if diff_end > diff_begin:
            config.metrics['increasing_type' + suffix].append(+1)
        else:
            config.metrics['increasing_type' + suffix].append(-1)

    config.metrics['item'].append(item)
    aux = self.get_strategy.get_aux(config, item)
    config.metrics['aux'].append(aux)
def MANOVA_analysis(dict_cpg_bop, dict_bop_cpg):
    """Compute one MANOVA p-value per BoP from ``average_beta.txt``.

    The file is read line by line (the first line is skipped as a header);
    the first field of a line is the CpG name and the remaining fields are
    its values. Rows are grouped by the BoP of their CpG. For every three
    consecutive rows of a BoP a MANOVA ``cpg1 + cpg2 + cpg3 ~ age`` is run
    and the p-value of Roy's greatest root for ``age`` is taken; the BoP is
    assigned the smallest of these p-values.

    Args:
        dict_cpg_bop: maps a CpG name to its BoP.
        dict_bop_cpg: maps a BoP to a ``;``-separated string of its CpGs;
            BoPs listing fewer than three CpGs are ignored.

    Returns:
        dict mapping each tested BoP to its minimal p-value. BoPs with
        fewer than three rows in the file cannot form a window and are
        left out.
    """
    age = get_ages()

    # Rows of every BoP in file order, already converted to floats so each
    # row is parsed once even though it takes part in up to three windows.
    bop_rows = {}
    with open("average_beta.txt", "r") as beta_file:
        beta_file.readline()  # skip the header line
        for line in beta_file:
            fields = line.split()
            if not fields:
                continue  # blank line
            name_cpg = fields.pop(0)
            if name_cpg not in dict_cpg_bop:
                continue
            bop = dict_cpg_bop[name_cpg]
            if len(dict_bop_cpg[bop].split(";")) < 3:
                continue
            bop_rows.setdefault(bop, []).append(
                np.asarray(fields, dtype=float))

    print(len(bop_rows))

    dict_BoP_PValue = {}
    for num, (bop, rows) in enumerate(bop_rows.items()):
        print(num)  # progress
        p_values = []
        for cpg1, cpg2, cpg3 in zip(rows, rows[1:], rows[2:]):
            frame = pd.DataFrame({
                'age': age,
                'cpg1': cpg1,
                'cpg2': cpg2,
                'cpg3': cpg3
            })
            model = MANOVA.from_formula('cpg1 + cpg2 + cpg3 ~ age',
                                        data=frame)
            test = model.mv_test()
            # Row 3 is Roy's greatest root, column 4 its p-value.
            p_values.append(test.results['age']['stat'].values[3, 4])

        if not p_values:
            # Fewer than three rows were found in the file for this BoP.
            continue
        dict_BoP_PValue[bop] = min(p_values)

    return dict_BoP_PValue
all_feat_names = list(PARAMS['all_featName'])
opFile = PARAMS['opDir'] + '/MANOVA.csv'
for feat_i in all_feat_names:
    feat_train = All_feature_data[feat_i]['train_data']
    feat_test = All_feature_data[feat_i]['test_data']
    feat_train_label = All_feature_data[feat_i]['train_label']
    print('MANOVA ', feat_i, np.shape(feat_train),
          np.shape(feat_train_label), np.shape(feat_test))
    PARAMS_temp = PARAMS.copy()
    print('feat_train: ', np.shape(feat_train))

    # endog~dependent variables, exog~independent variables
    try:
        moav = MANOVA(endog=feat_train, exog=feat_train_label)
        test_results = moav.mv_test()
    except Exception:
        # Retry once with a tiny random jitter when the test cannot be
        # computed on the raw features. `Exception` (not a bare except)
        # lets KeyboardInterrupt / SystemExit propagate. The jitter is
        # added to a new array so the data stored in All_feature_data is
        # left untouched and integer features are handled as well.
        print(feat_i, ' Noise added')
        feat_train = feat_train + \
            np.random.rand(*np.shape(feat_train)) * 1e-10
        moav = MANOVA(endog=feat_train, exog=feat_train_label)
        test_results = moav.mv_test()

    # Values of the four statistics for the term named 'x0'.
    stat_values = test_results.results['x0']['stat']['Value']
    WL = stat_values['Wilks\' lambda']
    PT = stat_values['Pillai\'s trace']
    HLT = stat_values['Hotelling-Lawley trace']
    RGR = stat_values['Roy\'s greatest root']
def get_pca_pvalue_manova(PC1, PC2, Y):
    """Return the MANOVA p-value of ``Y`` for the pair ``(PC1, PC2)``.

    Fits ``PC1 + PC2 ~ Y`` and returns the ``Pr > F`` entry of the first
    row of the statistics table reported for the term ``Y``.
    """
    frame = pd.DataFrame({'PC1': PC1, 'PC2': PC2, 'Y': Y})
    mv_results = MANOVA.from_formula('PC1 + PC2 ~ Y', frame).mv_test()
    y_stat_table = mv_results.results['Y']['stat']
    p_value_column = y_stat_table['Pr > F']
    return p_value_column.iloc[0]
def __init__(self, independent_variables, dependent_variables):
    """Initializes and fits the model.

    Args:
        independent_variables: passed to ``MANOVA`` as its second
            argument (exog).
        dependent_variables: passed to ``MANOVA`` as its first
            argument (endog).
    """
    self.model = MANOVA(dependent_variables, independent_variables)
    try:
        self.model.fit()
    except NotImplementedError:
        # statsmodels' MANOVA is fitted by its constructor and its fit()
        # only raises NotImplementedError; the model is already usable
        # (e.g. via mv_test), so the error must not abort construction.
        pass