def fit_linear_reg2(self, X, y):
    """Run a MANOVA of ``X`` against ``y`` and report the first test row.

    Returns a tuple ``(Wilks_lambda, F_value, p_value, table)`` where the
    three scalars are columns 0, 3 and 4 of the first row of the ``stat``
    table and ``table`` is that whole table.
    """
    # The formula string names ``X`` and ``y`` directly, so these local
    # names must stay exactly as they are.
    dp = pd.concat([X, y], axis=1)
    fitted = MANOVA.from_formula('X.values~ y.values', data=dp)
    table = fitted.mv_test().results['y.values']['stat']

    first_row = table.iloc[0]
    Wilks_lambda = first_row.iloc[0]
    F_value = first_row.iloc[3]
    p_value = first_row.iloc[4]
    return Wilks_lambda, F_value, p_value, table
def compute_manova_cvg(topdir: str, m: int):
    """Fit a MANOVA of ``K`` and ``N`` on tolerance and neighbour count.

    One table is loaded per (neighbour count, tolerance) case directory
    under ``topdir`` via ``ac.compute_stored_runs``; each table is tagged
    with its ``TOL`` and ``NNN`` values and all tables are stacked.

    :param topdir: directory containing the ``nn_<tol>_<n>`` case folders
    :param m: forwarded unchanged to ``ac.compute_stored_runs``
    :return: the (untested) ``MANOVA`` model object; the model and its
        ``mv_test`` result are printed as a side effect
    """
    # Assemble a large experiment table with all data
    neighbors = ["5", "10", "15", "20"]
    tolerances = ['0.0', '0.2', '0.4', '0.6', '0.8', '1.0']
    dfs = []
    for n in neighbors:
        for tol in tolerances:
            casedir = topdir + '/' + 'nn' + '_' + tol + '_' + n
            casetable = ac.compute_stored_runs(casedir, m, None)
            # Scalar assignment broadcasts to every row, so the table no
            # longer has to contain exactly five runs.
            casetable['TOL'] = float(tol)
            casetable['NNN'] = float(n)
            dfs.append(casetable)
    df = pd.concat(dfs).reset_index(drop=True)

    # Perform a regression with the data
    mod = MANOVA.from_formula('K + N ~ TOL + NNN + NNN:TOL', data=df)
    print(mod)
    result = mod.mv_test()
    print(result)
    return mod
def run_manova(self):
    """Fit a two-factor MANOVA on ``self.data.feature_df`` and print it.

    Prints the type of the ``mv_test`` result followed by its summary.
    """
    # https://stackoverflow.com/questions/51553355/how-to-get-pvalue-from-statsmodels-manova
    responses = 'cpt1 + dept1 + jelt1'
    factors = 'C(a01) + C(a08) + C(a01) * C(a08)'
    formula = responses + ' ~ ' + factors
    manova_model = MANOVA.from_formula(formula, self.data.feature_df).mv_test()
    print(type(manova_model))
    print(manova_model.summary())
def fit_linear_reg(self, X, y):
    """Append a constant column to ``X`` and run a MANOVA against ``y``.

    Returns a tuple ``(Wilks_lambda, F_value, p_value, table)`` where the
    three scalars are columns 0, 3 and 4 of the first row of the ``stat``
    table and ``table`` is that whole table.
    """
    # Column of ones, one per row of X, labelled 'constant'.
    ones_frame = pd.DataFrame(list(np.ones(X.shape[0])))
    ones_frame.columns = ['constant']

    # ``X`` is rebound on purpose: the formula string below refers to the
    # local names ``X`` and ``y``.
    X = pd.concat([X, ones_frame], axis=1)
    dp = pd.concat([X, y], axis=1)
    fitted = MANOVA.from_formula('X.values~ y.values', data=dp)
    table = fitted.mv_test().results['y.values']['stat']

    first_row = table.iloc[0]
    Wilks_lambda = first_row.iloc[0]
    F_value = first_row.iloc[3]
    p_value = first_row.iloc[4]
    return Wilks_lambda, F_value, p_value, table
def mvsExp(exps):
    """Print a MANOVA and three OLS summaries for the experiment table.

    The MANOVA models ``rise_times``, ``errors`` and ``energy`` on ``ce``;
    each response is then regressed separately on ``cr + ce + cs + cg``.
    """
    # MANOVA
    print(
        MANOVA.from_formula('rise_times + errors + energy ~ ce',
                            data=exps).mv_test())

    # Multiple Linear Regression, one model per response
    for response in ('rise_times', 'errors', 'energy'):
        est = ols(formula=response + ' ~ cr + ce + cs + cg', data=exps).fit()
        print(est.summary())
def test_manova_sas_example():
    """Compare ``mv_test`` output for ``Loc`` with the SAS reference values."""
    # Results should be the same as figure 4.5 of
    # https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/
    # viewer.htm#statug_introreg_sect012.htm
    mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)
    r = mod.mv_test()

    wilks = "Wilks' lambda"
    pillai = "Pillai's trace"
    hotelling = "Hotelling-Lawley trace"
    roy = "Roy's greatest root"

    # (row label, column label, expected value, decimal places)
    expectations = [
        (wilks, 'Value', 0.60143661, 8),
        (pillai, 'Value', 0.44702843, 8),
        (hotelling, 'Value', 0.58210348, 8),
        (roy, 'Value', 0.35530890, 8),
        (wilks, 'F Value', 0.77, 2),
        (pillai, 'F Value', 0.86, 2),
        (hotelling, 'F Value', 0.75, 2),
        (roy, 'F Value', 1.07, 2),
        (wilks, 'Num DF', 6, 3),
        (pillai, 'Num DF', 6, 3),
        (hotelling, 'Num DF', 6, 3),
        (roy, 'Num DF', 3, 3),
        (wilks, 'Den DF', 16, 3),
        (pillai, 'Den DF', 18, 3),
        (hotelling, 'Den DF', 9.0909, 4),
        (roy, 'Den DF', 9, 3),
        (wilks, 'Pr > F', 0.6032, 4),
        (pillai, 'Pr > F', 0.5397, 4),
        (hotelling, 'Pr > F', 0.6272, 4),
        (roy, 'Pr > F', 0.4109, 4),
    ]
    for row, column, expected, places in expectations:
        assert_almost_equal(r['Loc']['stat'].loc[row, column],
                            expected, decimal=places)
def test_manova_test_input_validation():
    """Check that ``mv_test`` rejects contrast matrices of the wrong shape."""
    mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)

    # L contrast: three columns are accepted, two columns must raise.
    hypothesis = [('test', np.array([[1, 1, 1]]), None)]
    mod.mv_test(hypothesis)
    hypothesis = [('test', np.array([[1, 1]]), None)]
    assert_raises(ValueError, mod.mv_test, hypothesis)
    # A stricter check on the error text was disabled in the original:
    #   assert_raises_regex(ValueError,
    #                       ('Contrast matrix L should have the same number of '
    #                        'columns as exog! 2 != 3'),
    #                       mod.mv_test, hypothesis)

    # M transform: three rows are accepted, two rows must raise.
    hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1], [1]]))]
    mod.mv_test(hypothesis)
    hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1]]))]
    assert_raises(ValueError, mod.mv_test, hypothesis)
def test_manova_test_input_validation():
    """Check that ``mv_test`` rejects contrast matrices of the wrong shape."""
    mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)

    # L contrast: three columns are accepted, two columns must raise.
    hypothesis = [('test', np.array([[1, 1, 1]]), None)]
    mod.mv_test(hypothesis)
    hypothesis = [('test', np.array([[1, 1]]), None)]
    assert_raises(ValueError, mod.mv_test, hypothesis)
    # A stricter check on the error text was disabled in the original:
    #   assert_raises_regex(ValueError,
    #                       ('Contrast matrix L should have the same number of '
    #                        'columns as exog! 2 != 3'),
    #                       mod.mv_test, hypothesis)

    # M transform: three rows are accepted, two rows must raise.
    hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1], [1]]))]
    mod.mv_test(hypothesis)
    hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1]]))]
    assert_raises(ValueError, mod.mv_test, hypothesis)
def test_manova_sas_example():
    """Compare ``mv_test`` output for ``Loc`` with the SAS reference values."""
    # Results should be the same as figure 4.5 of
    # https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/
    # viewer.htm#statug_introreg_sect012.htm
    mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)
    r = mod.mv_test()

    wilks = "Wilks' lambda"
    pillai = "Pillai's trace"
    hotelling = "Hotelling-Lawley trace"
    roy = "Roy's greatest root"

    # (row label, column label, expected value, decimal places)
    expectations = [
        (wilks, 'Value', 0.60143661, 8),
        (pillai, 'Value', 0.44702843, 8),
        (hotelling, 'Value', 0.58210348, 8),
        (roy, 'Value', 0.35530890, 8),
        (wilks, 'F Value', 0.77, 2),
        (pillai, 'F Value', 0.86, 2),
        (hotelling, 'F Value', 0.75, 2),
        (roy, 'F Value', 1.07, 2),
        (wilks, 'Num DF', 6, 3),
        (pillai, 'Num DF', 6, 3),
        (hotelling, 'Num DF', 6, 3),
        (roy, 'Num DF', 3, 3),
        (wilks, 'Den DF', 16, 3),
        (pillai, 'Den DF', 18, 3),
        (hotelling, 'Den DF', 9.0909, 4),
        (roy, 'Den DF', 9, 3),
        (wilks, 'Pr > F', 0.6032, 4),
        (pillai, 'Pr > F', 0.5397, 4),
        (hotelling, 'Pr > F', 0.6272, 4),
        (roy, 'Pr > F', 0.4109, 4),
    ]
    for row, column, expected, places in expectations:
        assert_almost_equal(r['Loc']['stat'].loc[row, column],
                            expected, decimal=places)
def manova(datacol, label, variable_cols):
    """
    Run a MANOVA with dataset membership as the factor, e.g. to assess batch
    effects: is a significant share of the variance in the selected features
    explained by which dataset a row came from?

    For more documentation see:
    https://www.statsmodels.org/stable/generated/statsmodels.multivariate.manova.MANOVA.html

    :param datacol: A DataCollection object storing the datasets
    :param label: The name of the label column that will be created and
        represents the factor in the MANOVA
    :param variable_cols: A subset of features which shall be used as
        variables in the MANOVA
    :return: The summary of the MANOVA's ``mv_test`` result.
    """
    # Build the model formula, then the combined frame whose ``label``
    # column records dataset membership.
    model_formula = construct_formula(label, variable_cols, label_side="r")
    combined = datacol.combine_dfs(label, variable_cols)

    fitted = MANOVA.from_formula(model_formula, combined)
    outcome = fitted.mv_test()
    return outcome.summary()
def Hypo5():
    """Print several group-comparison tests on comment counts.

    Builds a two-column frame from ``Luxury_vs_NonLuxury(False)``, prints a
    one-way ANOVA, a Kruskal-Wallis test, a MANOVA, an OLS summary and a
    type-2 ANOVA table, and returns the frame.
    """
    Groups, NbComments = Luxury_vs_NonLuxury(False)
    df = pd.DataFrame({'Groups': Groups, 'NbComments': NbComments})

    # NOTE(review): the group labels are spelled 'Luxary' / 'NonLuxuary'
    # here; confirm they match what Luxury_vs_NonLuxury produces.
    first_group = df['NbComments'][df['Groups'] == 'Luxary']
    second_group = df['NbComments'][df['Groups'] == 'NonLuxuary']
    print(stats.f_oneway(first_group, second_group))

    print(stats.kruskal(Groups, NbComments))

    maov = MANOVA.from_formula('Groups ~ C(NbComments)', data=df)
    print(maov.mv_test())

    results = ols('NbComments ~ Groups', data=df).fit()
    print(results.summary())
    print(sm.stats.anova_lm(results, typ=2))
    return df
def multivariate_anova():
    """Write the smallest sliding-window MANOVA p-value per BoP to a file.

    For every BoP with more than two CpG columns, a MANOVA of three
    consecutive columns on age is fitted for each window, the p-value at
    row 3, column 4 of the ``stat`` table is collected, and the minimum
    over all windows is written to ``bop_manova.txt``.

    :return: ``p_val_dic``, which is always empty because nothing is ever
        stored in it (kept for interface compatibility)
    """
    cpg_data = correct_cpg_data()
    print(1)
    cpg_to_bop = cpg_sites_to_bops.get_cpg_to_bop_dictionary(cpg_data)
    print(2)
    column_dict = get_column_dict(cpg_to_bop)
    print(3)
    # Release the large intermediates before the long loop.
    del cpg_data
    del cpg_to_bop

    p_val_dic = {}
    # The context manager guarantees the file is flushed and closed, even
    # if a model fit raises part-way through.
    with open('bop_manova.txt', 'w', encoding='utf-8') as file:
        # NOTE(review): the header is space-separated while the data rows
        # below are tab-separated; left unchanged for compatibility.
        file.write('BoP_name p_value\n')
        ages = get_ages()
        for j, (bop_name, column_lst) in enumerate(column_dict.items()):
            size = len(column_lst)
            if size > 2:
                p_val_list = []
                for i in range(size - 2):
                    df = DataFrame({
                        'cpg1': column_lst[i],
                        'cpg2': column_lst[i + 1],
                        'cpg3': column_lst[i + 2],
                        'age': ages
                    })
                    model = MANOVA.from_formula('cpg1 + cpg2 + cpg3 ~ age', df)
                    test = model.mv_test()
                    p_val_list.append(
                        test.results['age']['stat'].values[3, 4])
                minimum = min(p_val_list)
                file.write(bop_name + '\t' + str(minimum) + '\n')
            # Progress indicator.
            print(j)
    return p_val_dic
def randomization(self):
    """Run ``self.ni`` randomization rounds and collect two scores per round.

    Each round shuffles the second column and the next ``self.nc`` columns
    of ``self.df``, rebuilds the derived table with the ``cal_<self.no>``
    routine, keeps the rows whose ``Set`` is ``'Sub_train'``, then records
    the first cell of the MANOVA ``stat`` table and the accuracy (in
    percent) of ``self.model`` refitted on those rows.

    :return: tuple ``(C, A)`` of per-round MANOVA values and accuracies
    :raises ValueError: if ``self.no`` is not 1, 2, 3 or 4 (previously this
        surfaced as an unbound local variable error)
    """
    C, A = [], []
    for i in range(0, self.ni):
        inc = self.df.iloc[:, 0:1]
        yr = self.shuffling(self.df.iloc[:, 1:2])
        c = self.nc
        cr = self.shuffling(self.df.iloc[:, 2:c + 2])
        xr = self.df.iloc[:, c + 2:]
        ndr = pd.concat([inc, yr, cr, xr], axis=1)

        # The last column is dropped before calculation and re-attached
        # from the unshuffled frame afterwards.
        if self.no == 1:
            dfbjr = self.cal_1(ndr.iloc[:, 0:-1], c)
        elif self.no == 2:
            dfbjr = self.cal_2(ndr.iloc[:, 0:-1], c)
        elif self.no == 3:
            dfbjr = self.cal_3(ndr.iloc[:, 0:-1], c)
        elif self.no == 4:
            dfbjr = self.cal_4(ndr.iloc[:, 0:-1], c)
        else:
            raise ValueError(
                f'unsupported calculation selector self.no={self.no!r}; '
                'expected 1, 2, 3 or 4')

        s = self.df.iloc[:, -1:]
        dfbjr = pd.concat([dfbjr, s], axis=1)
        dfbjtr = dfbjr[dfbjr['Set'] == 'Sub_train']

        # The formula string refers to the local names ``xrd`` and ``yr``;
        # do not rename them.
        xrd = dfbjtr[self.desc]
        yr = dfbjtr[yr.columns]
        table = MANOVA.from_formula(
            'xrd.values~ yr.values',
            data=dfbjtr).mv_test().results['yr.values']['stat']

        self.model.fit(xrd, yr)
        ypr = self.model.predict(xrd)
        acc = accuracy_score(yr, ypr) * 100

        C.append(table.iloc[0, 0])
        A.append(np.mean(acc))
    return C, A
def get_pca_pvalue_manova(PC1, PC2, Y):
    """Return the first ``Pr > F`` entry of a MANOVA of (PC1, PC2) on Y."""
    frame = pd.DataFrame({'PC1': PC1, 'PC2': PC2, 'Y': Y})
    outcome = MANOVA.from_formula('PC1 + PC2 ~ Y', frame).mv_test()
    p_column = outcome.results['Y']['stat']['Pr > F']
    return p_column.iloc[0]
def single(self, item, config, configs_child):
    """Compute the metrics for one ``item`` and append them to ``config.metrics``.

    The branch taken depends on ``config.experiment.method``. Most metric
    keys are suffixed with ``_`` plus the first eight characters of
    ``config.hash``. After the selected branch runs, ``item`` and its aux
    value are appended under the un-suffixed keys ``'item'`` and ``'aux'``.

    ``configs_child`` is only read by the polygon, z_test_linreg, ancova
    and aggregator branches.
    """
    if config.experiment.method == Method.heteroskedasticity:
        x = self.get_strategy.get_target(config, item)
        y = self.get_strategy.get_single_base(config, item)
        process_heteroscedasticity(x, y, config.metrics,
                                   f'_{config.hash[0:8]}')
    elif config.experiment.method == Method.manova:
        bop_data = config.base_dict[item]
        raw_cpgs = bop_data['cpg']
        # Keep only the CpGs that are present in config.target_dict.
        passed_cpgs = [
            cpg for cpg in raw_cpgs if cpg in config.target_dict
        ]
        genes = list(bop_data['gene'])
        cl = bop_data['class']
        method_params = config.experiment.method_params
        # Flatten every value list in method_params into one covariate list.
        covariates = []
        for key, values in method_params.items():
            for val in values:
                covariates.append(val)
        manova_dict = {}
        manova_dict.update(config.observables_dict.items())
        if len(config.cells_dict) > 0:
            manova_dict.update(config.cells_dict.items())
        for cpg_id in range(0, len(passed_cpgs)):
            y = self.get_strategy.get_single_base(config,
                                                  passed_cpgs[cpg_id])
            manova_dict[f'cpg{cpg_id}'] = y
        df = pd.DataFrame(manova_dict)
        if len(passed_cpgs) > 0:
            if len(passed_cpgs) > 2:
                # Three or more CpGs: MANOVA over every window of three
                # consecutive CpGs, keeping the minimum p-value per
                # covariate and per test statistic (start value is 1).
                p_values = {}
                for cov in covariates:
                    p_values[cov] = 1
                p_values_wilks = copy.deepcopy(p_values)
                p_values_pillai_bartlett = copy.deepcopy(p_values)
                p_values_lawley_hotelling = copy.deepcopy(p_values)
                p_values_roy = copy.deepcopy(p_values)
                for w_id in range(0, len(passed_cpgs) - 2):
                    cpg_keys = []
                    for cpg_id in range(0, 3):
                        cpg_keys.append(f'cpg{w_id + cpg_id}')
                    formula = ' + '.join(cpg_keys) + ' ~ ' + ' + '.join(
                        covariates)
                    manova = MANOVA.from_formula(formula, df)
                    mv_test_res = manova.mv_test()
                    for cov in covariates:
                        # Rows 0-3 of column 4 of the stat table.
                        pvals = mv_test_res.results[cov]['stat'].values[
                            0:4, 4]
                        p_values_wilks[cov] = min(pvals[0],
                                                  p_values_wilks[cov])
                        p_values_pillai_bartlett[cov] = min(
                            pvals[1], p_values_pillai_bartlett[cov])
                        p_values_lawley_hotelling[cov] = min(
                            pvals[2], p_values_lawley_hotelling[cov])
                        p_values_roy[cov] = min(pvals[3], p_values_roy[cov])
            else:
                # One or two CpGs: per-CpG OLS + anova_lm instead; the same
                # minimum p-value is then reported for all four statistics.
                p_values = {}
                for cov in covariates:
                    p_values[cov] = 1
                for cpg_id in range(0, len(passed_cpgs)):
                    formula = f'cpg{cpg_id}' + ' ~ ' + ' + '.join(
                        covariates)
                    anova = ols(formula, df).fit()
                    anova_table = sm.stats.anova_lm(anova)
                    for cov_id, cov in enumerate(covariates):
                        p_value = anova_table.values[cov_id, 4]
                        p_values[cov] = min(p_values[cov], p_value)
                p_values_wilks = copy.deepcopy(p_values)
                p_values_pillai_bartlett = copy.deepcopy(p_values)
                p_values_lawley_hotelling = copy.deepcopy(p_values)
                p_values_roy = copy.deepcopy(p_values)
        else:
            # No usable CpGs: every p-value is reported as 1.
            p_values = {}
            for cov in covariates:
                p_values[cov] = 1
            p_values_wilks = copy.deepcopy(p_values)
            p_values_pillai_bartlett = copy.deepcopy(p_values)
            p_values_lawley_hotelling = copy.deepcopy(p_values)
            p_values_roy = copy.deepcopy(p_values)
        suffix = f'_{config.hash[0:8]}'
        config.metrics['class' + suffix].append(cl)
        config.metrics['genes' + suffix].append(';'.join(genes))
        for cov in covariates:
            config.metrics[f'{cov}_p_value_wilks' + suffix].append(
                p_values_wilks[cov])
            config.metrics[f'{cov}_p_value_pillai_bartlett' +
                           suffix].append(p_values_pillai_bartlett[cov])
            config.metrics[f'{cov}_p_value_lawley_hotelling' +
                           suffix].append(p_values_lawley_hotelling[cov])
            config.metrics[f'{cov}_p_value_roy' + suffix].append(
                p_values_roy[cov])
    elif config.experiment.method == Method.linreg:
        x = self.get_strategy.get_target(config, item)
        y = self.get_strategy.get_single_base(config, item)
        process_linreg(x, y, config.metrics, f'_{config.hash[0:8]}')
    elif config.experiment.method == Method.cluster:
        x = self.get_strategy.get_target(config, item)
        y = self.get_strategy.get_single_base(config, item)
        process_cluster(x, y, config.experiment.method_params,
                        config.metrics, f'_{config.hash[0:8]}')
    elif config.experiment.method == Method.formula:
        y = self.get_strategy.get_single_base(config, item)
        method_params = config.experiment.method_params
        # Collect the regressors named under 'cells' and 'observables';
        # an unknown name raises ValueError.
        exog_dict = {}
        for key, values in method_params.items():
            if key == 'cells':
                for val in values:
                    if val in config.cells_dict:
                        exog_dict[val] = self.get_strategy.get_cell(
                            config, key=val, item=item)
                    else:
                        raise ValueError(
                            f'Wrong cell type in formula: {val}')
            if key == 'observables':
                for val in values:
                    if val in config.observables_dict:
                        exog_dict[val] = self.get_strategy.get_observalbe(
                            config, key=val, item=item)
                    else:
                        raise ValueError(
                            f'Wrong observable in formula: {val}')
        # Regressors flagged in is_observables_categorical are wrapped
        # in C(...).
        exog_keys = []
        for exog_type, exog_data in exog_dict.items():
            if config.is_observables_categorical.get(exog_type, False):
                exog_keys.append('C(' + exog_type + ')')
            else:
                exog_keys.append(exog_type)
        formula = 'cpg ~ ' + ' + '.join(exog_keys)
        exog_dict['cpg'] = y
        data_df = pd.DataFrame(exog_dict)
        reg_res = smf.ols(formula=formula, data=data_df).fit()
        params = dict(reg_res.params)
        bse = dict(reg_res.bse)
        pvalues = dict(reg_res.pvalues)
        suffix = f'_{config.hash[0:8]}'
        config.metrics['mean' + suffix].append(np.mean(y))
        config.metrics['R2' + suffix].append(reg_res.rsquared)
        config.metrics['R2_adj' + suffix].append(reg_res.rsquared_adj)
        for key in params:
            config.metrics[key + suffix].append(params[key])
            config.metrics[key + '_std' + suffix].append(bse[key])
            config.metrics[key + '_p_value' + suffix].append(pvalues[key])
    elif config.experiment.method == Method.formula_new:
        # Same outputs as the 'formula' branch, but the formula string is
        # taken verbatim from method_params['formula'].
        y = self.get_strategy.get_single_base(config, item)
        method_params = config.experiment.method_params
        formula = method_params['formula']
        dict_global = {}
        dict_global.update(config.observables_dict.items())
        if len(config.cells_dict) > 0:
            dict_global.update(config.cells_dict.items())
        dict_global['cpg'] = y
        data_df = pd.DataFrame(dict_global)
        reg_res = smf.ols(formula=formula, data=data_df).fit()
        params = dict(reg_res.params)
        bse = dict(reg_res.bse)
        pvalues = dict(reg_res.pvalues)
        suffix = f'_{config.hash[0:8]}'
        config.metrics['mean' + suffix].append(np.mean(y))
        config.metrics['R2' + suffix].append(reg_res.rsquared)
        config.metrics['R2_adj' + suffix].append(reg_res.rsquared_adj)
        for key in params:
            config.metrics[key + suffix].append(params[key])
            config.metrics[key + '_std' + suffix].append(bse[key])
            config.metrics[key + '_p_value' +
                           suffix].append(pvalues[key])
    elif config.experiment.method == Method.oma:
        x = self.get_strategy.get_target(config, item)
        y = self.get_strategy.get_single_base(config, item)
        # Linear variants are scaled to [0, 1]; log variants are scaled to
        # [1, 10] first so that log10 maps them to [0, 1].
        lin_x = minmax_scale(x, feature_range=(0.0, 1.0))
        lin_y = minmax_scale(y, feature_range=(0.0, 1.0))
        tmp_x = minmax_scale(x, feature_range=(1.0, 10.0))
        tmp_y = minmax_scale(y, feature_range=(1.0, 10.0))
        log_x = np.log10(tmp_x)
        log_y = np.log10(tmp_y)
        lin_lin_corr_coeff, lin_lin_p_value = pearsonr(lin_x, lin_y)
        config.metrics['lin_lin_corr_coeff' +
                       f'_{config.hash[0:8]}'].append(lin_lin_corr_coeff)
        config.metrics['lin_lin_p_value' +
                       f'_{config.hash[0:8]}'].append(lin_lin_p_value)
        lin_log_corr_coeff, lin_log_p_value = pearsonr(lin_x, log_y)
        config.metrics['lin_log_corr_coeff' +
                       f'_{config.hash[0:8]}'].append(lin_log_corr_coeff)
        config.metrics['lin_log_p_value' +
                       f'_{config.hash[0:8]}'].append(lin_log_p_value)
        log_lin_corr_coeff, log_lin_p_value = pearsonr(log_x, lin_y)
        config.metrics['log_lin_corr_coeff' +
                       f'_{config.hash[0:8]}'].append(log_lin_corr_coeff)
        config.metrics['log_lin_p_value' +
                       f'_{config.hash[0:8]}'].append(log_lin_p_value)
        log_log_corr_coeff, log_log_p_value = pearsonr(log_x, log_y)
        config.metrics['log_log_corr_coeff' +
                       f'_{config.hash[0:8]}'].append(log_log_corr_coeff)
        config.metrics['log_log_p_value' +
                       f'_{config.hash[0:8]}'].append(log_log_p_value)
    elif config.experiment.method == Method.pbc:
        x = self.get_strategy.get_target(config, item)
        y = self.get_strategy.get_single_base(config, item)
        if len(set(x)) != 2:
            raise RuntimeError('x variable is not binary in pbc')
        # Split y into two lists keyed by the two distinct x values.
        keys = list(set(x))
        d = {k: [] for k in keys}
        for x_id, x_val in enumerate(x):
            d[x_val].append(y[x_id])
        corr_coeff, p_value = pointbiserialr(x, y)
        if np.isnan(corr_coeff) or np.isnan(p_value):
            # NaN result: fall back to neutral values for all four metrics.
            corr_coeff = 0.0
            p_value = 1.0
            anova_p_value = 1.0
            kw_p_value = 1.0
        else:
            _, anova_p_value = f_oneway(d[keys[0]], d[keys[1]])
            _, kw_p_value = kruskal(d[keys[0]], d[keys[1]])
        config.metrics['pbc_corr_coeff' +
                       f'_{config.hash[0:8]}'].append(corr_coeff)
        config.metrics['pbc_p_value' +
                       f'_{config.hash[0:8]}'].append(p_value)
        config.metrics['anova_p_value' +
                       f'_{config.hash[0:8]}'].append(anova_p_value)
        config.metrics['kw_p_value' +
                       f'_{config.hash[0:8]}'].append(kw_p_value)
    elif config.experiment.method == Method.polygon:
        xs = []
        ys = []
        metrics_keys = get_method_metrics_keys(config)
        for config_child in configs_child:
            update_parent_dict_with_children(metrics_keys, item, config,
                                             config_child)
            x = self.get_strategy.get_target(config_child, item)
            y = self.get_strategy.get_single_base(config_child, item)
            xs.append(x)
            ys.append(y)
        if config.experiment.method_params['method'] == Method.linreg:
            process_linreg_polygon(configs_child, item, xs, config.metrics,
                                   f'_{config.hash[0:8]}')
        elif config.experiment.method_params['method'] == Method.variance:
            process_variance_polygon(configs_child, item, xs,
                                     config.metrics,
                                     f'_{config.hash[0:8]}')
    elif config.experiment.method == Method.z_test_linreg:
        slopes = []
        slopes_std = []
        num_subs = []
        metrics_keys = get_method_metrics_keys(config)
        for config_child in configs_child:
            update_parent_dict_with_children(metrics_keys, item, config,
                                             config_child)
            item_id = config_child.advanced_dict[item]
            slopes.append(config_child.advanced_data[
                'slope' + f'_{config_child.hash[0:8]}'][item_id])
            slopes_std.append(config_child.advanced_data[
                'slope_std' + f'_{config_child.hash[0:8]}'][item_id])
            num_subs.append(
                len(config_child.observables_dict[
                    config_child.attributes.target]))
        process_z_test_slope(slopes, slopes_std, num_subs, config.metrics,
                             f'_{config.hash[0:8]}')
    elif config.experiment.method == Method.ancova:
        x_all = []
        y_all = []
        category_all = []
        metrics_keys = get_method_metrics_keys(config)
        for config_child in configs_child:
            x = self.get_strategy.get_target(config_child, item,
                                             categorical=False)
            y = self.get_strategy.get_single_base(config_child, item)
            x_all += list(x)
            y_all += list(y)
            # Each child is labelled with a lowercase letter chosen by its
            # position in configs_child.
            category_all += [
                list(string.ascii_lowercase)[configs_child.index(
                    config_child)]
            ] * len(x)
        data = {'x': x_all, 'y': y_all, 'category': category_all}
        df = pd.DataFrame(data)
        formula = 'y ~ x * C(category)'
        lm = ols(formula, df)
        results = lm.fit()
        suffix = f'_{config.hash[0:8]}'
        config.metrics['R2' + suffix].append(results.rsquared)
        config.metrics['R2_adj' + suffix].append(results.rsquared_adj)
        config.metrics['f_stat' + suffix].append(results.fvalue)
        config.metrics['prob(f_stat)' + suffix].append(results.f_pvalue)
        # NOTE(review): coefficients are read by position 0-3; the mapping
        # to intercept / category / x / x:category assumes a specific
        # parameter order (and only the first four terms) — confirm.
        config.metrics['intercept' + suffix].append(results.params[0])
        config.metrics['category' + suffix].append(results.params[1])
        config.metrics['x' + suffix].append(results.params[2])
        config.metrics['x:category' + suffix].append(results.params[3])
        config.metrics['intercept_std' + suffix].append(results.bse[0])
        config.metrics['category_std' + suffix].append(results.bse[1])
        config.metrics['x_std' + suffix].append(results.bse[2])
        config.metrics['x:category_std' + suffix].append(results.bse[3])
        config.metrics['intercept_pval' + suffix].append(
            results.pvalues[0])
        config.metrics['category_pval' + suffix].append(results.pvalues[1])
        config.metrics['x_pval' + suffix].append(results.pvalues[2])
        config.metrics['x:category_pval' + suffix].append(
            results.pvalues[3])
    elif config.experiment.method == Method.aggregator:
        metrics_keys = get_method_metrics_keys(config)
        for config_child in configs_child:
            update_parent_dict_with_children(metrics_keys, item, config,
                                             config_child)
    elif config.experiment.method == Method.variance:
        x = self.get_strategy.get_target(config, item)
        y = self.get_strategy.get_single_base(config, item)
        semi_window = config.experiment.method_params['semi_window']
        box_b = config.experiment.method_params['box_b']
        box_t = config.experiment.method_params['box_t']
        process_variance(x, y, semi_window, box_b, box_t, config.metrics,
                         f'_{config.hash[0:8]}')
        xs = get_box_xs(x)
        ys_b, ys_t = fit_variance(xs, config.metrics,
                                  f'_{config.hash[0:8]}')
        # Gap between the two fitted curves at the first and last point.
        diff_begin = abs(ys_t[0] - ys_b[0])
        diff_end = abs(ys_t[-1] - ys_b[-1])
        # NOTE(review): the ratio divides by the smaller gap, which can be
        # zero — confirm that case cannot occur.
        config.metrics['increasing_div' + f'_{config.hash[0:8]}'].append(
            max(diff_begin, diff_end) / min(diff_begin, diff_end))
        config.metrics['increasing_sub' + f'_{config.hash[0:8]}'].append(
            abs(diff_begin - diff_end))
        if diff_end > diff_begin:
            config.metrics['increasing_type' +
                           f'_{config.hash[0:8]}'].append(+1)
        else:
            config.metrics['increasing_type' +
                           f'_{config.hash[0:8]}'].append(-1)
    # Recorded for every method, without the hash suffix.
    config.metrics['item'].append(item)
    aux = self.get_strategy.get_aux(config, item)
    config.metrics['aux'].append(aux)
def MANOVA_analysis(dict_cpg_bop, dict_bop_cpg):
    """Return the smallest sliding-window MANOVA p-value for each BoP.

    Reads ``average_beta.txt`` (first line discarded; every other line is
    a CpG name followed by whitespace-separated numeric values), groups
    the rows by BoP, and for every three consecutive rows of a BoP fits
    ``cpg1 + cpg2 + cpg3 ~ age``. The value at row 3, column 4 of the
    ``stat`` table is collected per window and the minimum is kept.

    :param dict_cpg_bop: maps a CpG name to its BoP
    :param dict_bop_cpg: maps a BoP to a ``;``-separated string of CpG
        names; BoPs listing fewer than three names are ignored
    :return: dict mapping each retained BoP to its minimum p-value
    :raises IndexError: if a retained BoP has fewer than three rows in the
        input file (no window can be formed); unchanged from the original
    """
    dict_BoP_PValue = {}
    age = get_ages()

    with open("average_beta.txt", "r") as beta_file:
        beta_file.readline()  # discard the first line
        for line in beta_file:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            name_cpg = fields.pop(0)
            if name_cpg not in dict_cpg_bop:
                continue
            bop = dict_cpg_bop[name_cpg]
            if len(dict_bop_cpg[bop].split(";")) < 3:
                continue
            dict_BoP_PValue.setdefault(bop, []).append(fields)

    # NOTE(review): the original opened DataFrame.txt for writing and never
    # wrote to it. The resulting create/truncate side effect is preserved
    # in case something downstream relies on the file existing.
    with open("DataFrame.txt", "w"):
        pass

    print(len(dict_BoP_PValue))
    for num, key in enumerate(dict_BoP_PValue):
        print(num)
        # Convert each row once; np.float_ was removed in NumPy 2.0, so an
        # explicit float64 conversion is used instead.
        rows = [
            list(np.asarray(row, dtype=np.float64))
            for row in dict_BoP_PValue[key]
        ]
        pVal = []
        for i in range(0, len(rows) - 2):
            DatFrame = pd.DataFrame({
                'age': age,
                'cpg1': rows[i],
                'cpg2': rows[i + 1],
                'cpg3': rows[i + 2]
            })
            model = MANOVA.from_formula('cpg1 + cpg2 + cpg3 ~ age',
                                        data=DatFrame)
            test = model.mv_test()
            pVal.append(test.results['age']['stat'].values[3, 4])
        pVal.sort()
        # Replacing the value of an existing key is safe while iterating.
        dict_BoP_PValue[key] = pVal[0]
    return dict_BoP_PValue
def significanceTesting(featureDf2,
                        pairwiseClustersToCompare,
                        confidence=0.05,
                        foldchange=2,
                        responseCutoff=0.1,
                        errorCorrection='bonferroni'):
    """Find features that differ between pairs of clusters.

    ``featureDf2`` holds one column per feature plus a final ``Cluster``
    column. An overall MANOVA gate is followed, for each pair in
    ``pairwiseClustersToCompare``, by a per-feature test and a fold-change
    threshold.

    Returns ``(dataMatrix, significantArray)``: the stacked
    log2-fold-change / -log10-p rows for every compared pair, and the
    de-duplicated names of the features judged significant. Both are empty
    lists when the MANOVA gate fails.

    NOTE(review): as written the function prints ``featureDf2`` and calls
    ``sys.exit(0)`` before the MANOVA, so nothing after that call runs.
    """
    # Number of feature columns (the last column is the cluster label).
    n = len(featureDf2.columns) - 1
    if errorCorrection == 'bonferroni':
        alpha = confidence / n
    else:
        alpha = confidence

    uniqueClusters = [
        list(x) for x in set(tuple(x) for x in pairwiseClustersToCompare)
    ]

    #Kruskal Wallis is unecessary; one way anova seems to be relatively robust to non-normality: http://www.biostathandbook.com/kruskalwallis.html
    endog = featureDf2.iloc[:, :-1]
    exog = featureDf2.iloc[:, -1]
    # Every feature column name is wrapped in Q('...') for the formula.
    modelFormula = " + ".join("Q(\'" + featureDf2.columns[:-1] +
                              "\')") + " ~ Cluster"
    print(featureDf2)
    # NOTE(review): looks like a debugging leftover — this terminates the
    # process, making the rest of the function unreachable.
    sys.exit(0)
    manova = MANOVA.from_formula(modelFormula, data=featureDf2)
    #Pillai's trace is most robust against deviations from assumptions of manova
    manovapval = manova.mv_test().results['Cluster']['stat'].iloc[1, 4]
    print(manovapval)
    #Need to think about how to handle multiple clusters; for now just iterate through all pairs
    if manovapval < confidence:
        allDataMatrices = []
        allSignificantDifferences = []
        for clustersToCompare in pairwiseClustersToCompare:
            comp1 = clustersToCompare[0]
            comp2 = clustersToCompare[1]
            group1 = featureDf2[featureDf2['Cluster'] == str(
                comp1)].iloc[:, :-1]
            group2 = featureDf2[featureDf2['Cluster'] == str(
                comp2)].iloc[:, :-1]
            anova = scipy.stats.kruskal(group1, group2)
            pval2 = anova[1]
            stat = anova[0]
            if pval2 < 0.01:
                print('Different')
            significantArray = []
            allBoxPairs = []
            pvalList = []
            meanFoldChangeList = []
            medianFoldChangeList = []
            foldChangeList = []
            normalityList = []
            tempnormalityList = []
            # First pass: one p-value per feature column.
            for col in range(featureDf2.shape[1] - 1):
                group1 = featureDf2[featureDf2['Cluster'] == str(
                    comp1)].iloc[:, col]
                group2 = featureDf2[featureDf2['Cluster'] == str(
                    comp2)].iloc[:, col]
                normalitypval = shapiro(group1)[1]
                normalitypval2 = shapiro(group2)[1]
                normalityCondition = False
                # NOTE(review): a Shapiro p-value below 0.05 conventionally
                # indicates departure from normality, yet this branch marks
                # the data as normal and uses the t-test — confirm intent.
                if normalitypval < 0.05 and normalitypval2 < 0.05:
                    normalityCondition = True
                    try:
                        pval = scipy.stats.ttest_ind(group1,
                                                     group2)[1]
                    except:
                        # Any failure falls back to a p-value of 0.5.
                        pval = 0.5
                else:
                    try:
                        pval = scipy.stats.mannwhitneyu(group1, group2)[1]
                    except:
                        # Any failure falls back to a p-value of 0.5.
                        pval = 0.5
                pvalList.append(pval)
                tempnormalityList.append(normalityCondition)
            #For holm bonferroni
            ordered_pval_list = sorted(pvalList)
            # Second pass: significance threshold and fold change.
            for col in range(featureDf2.shape[1] - 1):
                pvalCondition = False
                foldChangeCondition = False
                group1 = featureDf2[featureDf2['Cluster'] == str(
                    comp1)].iloc[:, col]
                group2 = featureDf2[featureDf2['Cluster'] == str(
                    comp2)].iloc[:, col]
                pval = pvalList[col]
                if errorCorrection != 'holm-bonferroni':
                    if pval < alpha:
                        pvalCondition = True
                else:
                    # The threshold is divided by the number of p-values
                    # from this one's rank upwards.
                    rank = ordered_pval_list.index(pval) + 1
                    modifiedAlpha = alpha / (n - rank + 1)
                    if pval < modifiedAlpha:
                        pvalCondition = True
                normalityCondition = tempnormalityList[col]
                # Fold change uses means when flagged normal, medians
                # otherwise. Values below responseCutoff are replaced by the
                # sentinels 4 (one side below) or 0.0001 (both below).
                if normalityCondition:
                    if np.nanmean(group1) < responseCutoff:
                        if np.nanmean(group2) >= responseCutoff:
                            meanFoldChangeList.append(4)
                            foldChangeList.append(4)
                        else:
                            meanFoldChangeList.append(0.0001)
                            foldChangeList.append(0.0001)
                    else:
                        if np.nanmean(group2) < responseCutoff:
                            meanFoldChangeList.append(4)
                            foldChangeList.append(4)
                        else:
                            meanFoldChangeList.append(
                                np.nanmean(group1) / np.nanmean(group2))
                            foldChangeList.append(
                                np.nanmean(group1) / np.nanmean(group2))
                else:
                    if np.nanmedian(group1) < responseCutoff:
                        if np.nanmedian(group2) >= responseCutoff:
                            medianFoldChangeList.append(4)
                            foldChangeList.append(4)
                        else:
                            medianFoldChangeList.append(0.0001)
                            foldChangeList.append(0.0001)
                    else:
                        if np.nanmedian(group2) < responseCutoff:
                            medianFoldChangeList.append(4)
                            foldChangeList.append(4)
                        else:
                            medianFoldChangeList.append(
                                np.nanmedian(group1) / np.nanmedian(group2))
                            foldChangeList.append(
                                np.nanmedian(group1) / np.nanmedian(group2))
                if pvalCondition:
                    # Significant only if the absolute log2 fold change
                    # also reaches the threshold.
                    if abs(np.log2(foldChangeList[-1])) >= np.log2(foldchange):
                        significantArray.append(
                            featureDf2.columns.get_level_values('Feature')
                            [col])
                        allBoxPairs.append(
                            ((featureDf2.columns.get_level_values('Feature')
                              [col], str(comp1)),
                             (featureDf2.columns.get_level_values('Feature')
                              [col], str(comp2))))
                        normalityList.append(normalityCondition)
            foldChangeArray = np.log2(np.array(foldChangeList))
            pvalArray = -np.log10(np.array(pvalList))
            dataMatrix = np.vstack([foldChangeArray, pvalArray])
            allSignificantDifferences.append(significantArray)
            allDataMatrices.append(dataMatrix)
        # Union of the significant features over all compared pairs.
        significantArray = list(set().union(*allSignificantDifferences))
        dataMatrix = np.vstack(allDataMatrices)
    else:
        significantArray = []
        dataMatrix = []
    print(significantArray)
    return dataMatrix, significantArray
df["strike_count"] = df["strike_count"].str.replace("s_count_", "") # ------------ Check 2nd component --------------- n = [len(df.get_group(gr)) for gr in groups] c = 1 y = embeddings[:, c - 1] plt.scatter(n, y) plt.show() # ------------ MANOVA ------------------- manova = MANOVA.from_formula( "c0+c1+c2+c3+c4+c5+c6+c7+c8+c9~umpire+ball_count*strike_count", data=df) table = manova.mv_test() res = pd.DataFrame( {term: table.results[term]["stat"].iloc[0] for term in table.results}).T components_names = [ "Smaller", "Uncertain", "High inside excluded", "Wide bottom", "Wide middle", "Wide top", "NW/SE diagonal",
Series = pd.concat([ Series.reset_index(drop=True), pd.DataFrame(indice.tolist(), columns=['y']) ], axis=1) #%% #################################################### # Test MANOVA (diferentes estadísticos) #################################################### import pandas as pd from statsmodels.multivariate.manova import MANOVA maov = MANOVA.from_formula( 'AA+AAL+AAP+AAPL+AB+ABBV+ABC+ABM+ABMD+ABT+ACAD+ACHN+ACIW+ACN+ACOR+ADBE+ADI+ADM+ADP+ADSK+AEE+AEO+AEP+AES+AFL+AG+AGIO+AGN+AIG+AINV+AIV+AKAM+AKS+ALK+ALL+ALNY+AMAT+AMD+AMGN+AMP+AMTD+AMZN+AN+ANTM+APA+APC+ARCC+ARLP+ARNA+ARR+ASH+ATI+ATVI+AUY+AVB+AVP+AVXL+AVY+AWK+AXP+AZN+BABA+BAC+BDX+BUD+CS+DAL+DD+FNMA+GOOG+GOOGL+LH+LLY+LUV+MO+MT+NAT+NLY+NVO+PAA+T+UA+UBS+WBA~ y', data=Series) #%% ############################################################################## # resultado manova Ho igualdad en as medias dado covarianzas ############################################################################## print(maov.mv_test()) #%% ########################## # Test Traicy-Widom ########################## ########################## # Distribución TW F1 ########################## f90 = t_1[F1 >= .90].min()
with open(encoder_path, "rb") as f: _, embeddings, groups, _, _ = pickle.load(f) ids = [groups.index(gr) for gr, _ in df if gr in groups] embeddings = embeddings[ids, :] groups = [groups[i] for i in ids] df = pd.DataFrame(embeddings, index=pd.MultiIndex.from_tuples(groups)).reset_index() df.columns = ["umpire", "score", "inning", *["c" + str(i) for i in range(10)]] # ------------ MANOVA ------------------- manova = MANOVA.from_formula( "c0+c1+c2+c3+c4+c5+c6+c7+c8+c9~umpire+score*inning", data=df) table = manova.mv_test() res = pd.DataFrame( {term: table.results[term]["stat"].iloc[0] for term in table.results}).T components_names = [ "Smaller", "Uncertain", "High inside excluded", "Wide bottom", "Wide middle", "Wide top", "NW/SE diagonal", "Irregular 1",
# MANOVA test in statsmodels
import pandas as pd
from statsmodels.multivariate.manova import MANOVA

# iris data set used for the MANOVA
url = 'https://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv'
df = pd.read_csv(url, index_col=0)
# Column names contain dots, which are not valid in formula identifiers.
# regex=False makes "." a literal dot on every pandas version instead of
# relying on the version-dependent default.
df.columns = df.columns.str.replace(".", "_", regex=False)
print(df.head())

# run the manova model
maov = MANOVA.from_formula(
    'Sepal_Length + Sepal_Width + '
    'Petal_Length + Petal_Width ~ Species',
    data=df)

# print out the results
print()  # print a blank line
print(maov.mv_test())

# source
# https://www.marsja.se/python-manova-made-easy-using-statsmodels/
def save_top_manova(config, attributes_types, attribute_target, num_top=500, window=3, test=MANOVATest.pillai_bartlett):
    """Rank BOPs by a sliding-window MANOVA p-value and save the best ones.

    For every key of the BOP -> CpG dictionary, the CpGs that are present in
    the loaded CpG data are kept in their original order.  Every run of
    ``window`` consecutive kept CpGs is fitted with a MANOVA whose dependent
    variables are the beta rows of those CpGs and whose predictors are the
    columns built from ``attributes_types``.  The p-value of the
    ``attribute_target`` term under the chosen statistic is recorded, and the
    smallest p-value over all windows represents the BOP.  The per-BOP
    p-values are corrected with ``multipletests(..., 0.05, method='fdr_bh')``
    and the ``num_top`` smallest corrected values are written out.

    Args:
        config: Project configuration object; passed to the loaders and to
            ``get_result_path``.  Its ``print_rate`` attribute controls the
            progress output.
        attributes_types: Iterable of ``Attribute`` / ``CellPop`` members used
            as predictors; each member's ``.value`` is its column name.
        attribute_target: Member whose ``.value`` names the model term the
            p-value is read from.
        num_top: Maximum number of BOPs to save.
        window: Number of consecutive CpGs per MANOVA fit (at least 1).
        test: ``MANOVATest`` member selecting the statistic.  Any value other
            than the four handled members falls back to the first row of the
            statistics table (the same row ``MANOVATest.wilks`` selects).

    Raises:
        ValueError: If ``window`` is smaller than 1.

    Side effects:
        Writes two ``top.txt`` result files through ``save_features``, sets
        ``config.approach_gd`` to ``GeneDataType.from_bop`` and leaves
        ``config.dt`` set to ``DataType.cpg``.
    """
    if window < 1:
        raise ValueError('window must be at least 1, got ' + str(window))

    dict_bop_cpgs = load_bop_cpg_dict(config)
    dict_bop_genes = get_dict_bop_genes(config, dict_bop_cpgs)
    cpgs, betas = load_cpg_data(config)

    # Position of the first occurrence of every CpG name, built once so that
    # the membership test and the row lookup below are O(1) instead of a list
    # scan per CpG.  NOTE(review): assumes CpG names are hashable (strings).
    cpg_row = {}
    for row_id, cpg in enumerate(cpgs):
        cpg_row.setdefault(cpg, row_id)

    atr_table = []
    atr_cols = []
    for atr_type in attributes_types:
        if isinstance(atr_type, Attribute):
            atr_table.append(get_attributes(config, atr_type))
        elif isinstance(atr_type, CellPop):
            atr_table.append(get_cell_pop(config, [atr_type]))
        # NOTE(review): indentation was lost in the source; the column name is
        # appended for every entry here - confirm against the original layout.
        atr_cols.append(atr_type.value)

    # The column names and the formula do not depend on the window position,
    # so they are built a single time.
    val_cols = ['cpg_' + str(cpg_id) for cpg_id in range(window)]
    cols = atr_cols + val_cols
    formula = ' + '.join(val_cols) + ' ~ ' + ' + '.join(atr_cols)

    # Row of the statistics table to read the p-value from.
    pval_row = 0
    if test is MANOVATest.wilks:
        pval_row = 0
    elif test is MANOVATest.pillai_bartlett:
        pval_row = 1
    elif test is MANOVATest.lawley_hotelling:
        pval_row = 2
    elif test is MANOVATest.roy:
        pval_row = 3

    num_bops = 0
    bops_passed = []
    bops_pvals = []
    for bop in dict_bop_cpgs:
        cpgs_passed = [cpg for cpg in dict_bop_cpgs.get(bop) if cpg in cpg_row]

        # At least one full window is required.  For the default window of 3
        # this is the former "more than 2 CpGs" condition; other window sizes
        # now get the correct number of windows instead of an IndexError or
        # silently skipped positions.
        if len(cpgs_passed) >= window:
            pvals_on_bop = []
            for win_id in range(len(cpgs_passed) - window + 1):
                val_table = [
                    betas[cpg_row[cpg]]
                    for cpg in cpgs_passed[win_id:win_id + window]
                ]
                # Transpose the list of columns into a list of rows.
                table = list(map(list, zip(*(atr_table + val_table))))
                x = pd.DataFrame(table, columns=cols)
                mv_test_res = MANOVA.from_formula(formula, x).mv_test()
                # Column 4 of the 'stat' table holds the p-values of the four
                # multivariate statistics, one per row.
                pvals = mv_test_res.results[attribute_target.value]['stat'].values[0:4, 4]
                pvals_on_bop.append(pvals[pval_row])

            bops_passed.append(bop)
            bops_pvals.append(np.min(pvals_on_bop))

        # NOTE(review): indentation was lost in the source; every visited BOP
        # is counted here for the progress output - confirm.
        num_bops += 1
        if num_bops % config.print_rate == 0:
            print('num_bops: ' + str(num_bops))

    # Only the corrected p-values are needed from the multiple-testing result.
    _, pvals_corrected, _, _ = multipletests(bops_pvals, 0.05, method='fdr_bh')
    order = np.argsort(pvals_corrected)
    bops_opt = list(np.array(bops_passed)[order])[0:num_top]
    pvals_opt = list(np.array(pvals_corrected)[order])[0:num_top]

    genes_opt = []
    genes_from_bop = []
    seen_genes = set()
    for bop in bops_opt:
        curr_genes = dict_bop_genes.get(bop)
        genes_opt.append(';'.join(curr_genes))
        # Collect each gene once, in order of first appearance.
        for gene in curr_genes:
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_from_bop.append(gene)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [bops_opt, genes_opt, pvals_opt])

    # The gene list is saved under the gene data type; the data type is set
    # back to cpg afterwards even when saving fails.
    config.approach_gd = GeneDataType.from_bop
    config.dt = DataType.gene
    try:
        fn = get_result_path(config, 'top.txt')
        save_features(fn, [genes_from_bop])
    finally:
        config.dt = DataType.cpg
import sys

import pandas as pd
from statsmodels.multivariate.manova import MANOVA

if __name__ == '__main__':
    # Three positional arguments are required: the Excel file to read, the
    # left-hand side of the formula and the right-hand side of the formula.
    # Exit with a usage message instead of a bare IndexError when any of them
    # is missing; additional arguments are still ignored.
    if len(sys.argv) < 4:
        sys.exit(
            'usage: ' + sys.argv[0]
            + ' DATA_FILE DEPENDENT_VARIABLES INDEPENDENT_VARIABLE'
        )

    data_file_path = sys.argv[1]
    df = pd.read_excel(data_file_path)

    dependent_variables = sys.argv[2]
    independent_variable = sys.argv[3]

    # The two arguments are joined verbatim into "<dependent>~<independent>".
    maov = MANOVA.from_formula(dependent_variables + '~' + independent_variable, data=df)
    print(maov.mv_test())