def run(esetSC2, anoSC2, Control_exp, sPTD_exp_sample, PPROM_exp_sample, maxGeneCount):
    # Per-gene log2 fold changes (control minus case) and -log10 p-values from
    # row-wise t-tests; `ttest` is assumed to be scipy.stats.ttest_ind.
    sPTD_log2FC = [
        math.log2(np.mean(Control_exp.iloc[c])) - math.log2(np.mean(sPTD_exp_sample.iloc[c]))
        for c in range(Control_exp.shape[0])
    ]
    PPROM_log2FC = [
        math.log2(np.mean(Control_exp.iloc[c])) - math.log2(np.mean(PPROM_exp_sample.iloc[c]))
        for c in range(Control_exp.shape[0])
    ]
    sPTD_neglog10pval = [
        -math.log10(x) for x in ttest(Control_exp, sPTD_exp_sample, axis=1).pvalue
    ]
    PPROM_neglog10pval = [
        -math.log10(x) for x in ttest(Control_exp, PPROM_exp_sample, axis=1).pvalue
    ]
    sPTDDiffGenes = runDiffGenes(esetSC2, maxGeneCount, 'Control_vs_sPTD',
                                 sPTD_log2FC, sPTD_neglog10pval)
    PPROMDiffGenes = runDiffGenes(esetSC2, maxGeneCount, 'Control_vs_PPROM',
                                  PPROM_log2FC, PPROM_neglog10pval)
    return sPTDDiffGenes, PPROMDiffGenes

def run(self, num_iters: int):
    """
    :param num_iters: Number of iterations we want to run the experiment
    :return: A dictionary with t-test results on various metrics between the
             random graphs with and without sex-ed
    """
    without_taus, without_ginis, without_freemans, without_mgs = [], [], [], []
    with_taus, with_ginis, with_freemans, with_mgs = [], [], [], []
    for _ in range(num_iters):
        result = self.__run_one_iteration()
        without_taus.append(result['without']['tau'])
        without_ginis.append(result['without']['gini'])
        without_freemans.append(result['without']['freeman'])
        without_mgs.append(result['without']['mg'])
        with_taus.append(result['with']['tau'])
        with_ginis.append(result['with']['gini'])
        with_freemans.append(result['with']['freeman'])
        with_mgs.append(result['with']['mg'])
    ttest_results_tau = ttest(without_taus, with_taus)
    ttest_results_gini = ttest(without_ginis, with_ginis)
    ttest_results_freeman = ttest(without_freemans, with_freemans)
    ttest_results_mgs = ttest(without_mgs, with_mgs)
    return {
        'tau': ttest_results_tau,
        'gini': ttest_results_gini,
        'freeman': ttest_results_freeman,
        'mgs': ttest_results_mgs
    }

def tt(A, B):
    # Pre-test for equal variances, then choose Welch's (unequal-variance) or
    # Student's t-test accordingly; `f` and `ttest` are assumed to be imported
    # elsewhere (an F-test helper and an independent-samples t-test).
    f_p = f(A, B).pvalue
    if f_p <= 0.05:
        t_p = ttest(A, B, equal_var=False).pvalue
    else:
        t_p = ttest(A, B, equal_var=True).pvalue
    return t_p

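# Hedged usage sketch for tt() above, not part of the original source. It
# assumes `f` is an equal-variance pre-test returning an object with a
# `.pvalue` attribute (scipy.stats.levene is used here as a stand-in) and
# `ttest` is scipy.stats.ttest_ind; the sample data is made up.
import numpy as np
from scipy.stats import levene as f, ttest_ind as ttest

rng = np.random.default_rng(0)
A = rng.normal(0.0, 1.0, size=30)
B = rng.normal(0.5, 2.0, size=30)
# Welch's test is used when the pre-test rejects equal variances (p <= 0.05),
# Student's test otherwise.
print(tt(A, B))
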
def surface_glm_data(df, marker='coef_', output='t_stat'):
    '''
    Input: beta-weights averaged per label.

    1. Average across sessions
    2. Average / Subtract for mean response, lateral response, etc...
    3. ttest across subjects

    - Arguments:
        a) concatenated beta-weight data in a pd.DataFrame with the columns
           subject, session, parameter, hemisphere, cortical label, value
        b) parameters that shall be lateralized (e.g. response)
        c) column name of beta-weight values (normally 'coef_')
        d) output t-statistic or p-value
    '''
    if output == 't_stat':
        p_or_t = 0
    elif output == 'p_val':
        p_or_t = 1
    ses_mean = df.groupby(['subject', 'parameter', 'names', 'hemisphere']).\
        mean().reset_index()  # !! average across sessions per subject
    mean_response = ses_mean.loc[ses_mean.parameter.isin(
        ['response_left_rule_resp_A', 'response_left_rule_resp_B',
         'response_right_rule_resp_A', 'response_right_rule_resp_B'])].\
        groupby(['subject', 'names', 'hemisphere']).mean().reset_index()  # response average
    mean_response['parameter'] = 'response_average'
    response_left_minus_right = ses_mean.loc[ses_mean.parameter.isin(
        ['response_left_rule_resp_A', 'response_left_rule_resp_B'])].\
        groupby(['subject', 'names', 'hemisphere']).mean() -\
        ses_mean.loc[ses_mean.parameter.isin(
            ['response_right_rule_resp_A', 'response_right_rule_resp_B'])].\
        groupby(['subject', 'names', 'hemisphere']).mean()
    response_left_minus_right = response_left_minus_right.reset_index()
    response_right_minus_left = response_left_minus_right.copy()
    response_right_minus_left.coef_ = -response_right_minus_left.coef_
    response_left_minus_right['parameter'] = 'response_left-right'
    response_right_minus_left['parameter'] = 'response_right-left'  # response subtractions
    ses_mean = pd.concat([ses_mean, mean_response, response_left_minus_right,
                          response_right_minus_left], sort=False)
    ses_mean.to_hdf('/Users/kenohagena/Desktop/test.hdf', key='test')
    difference = (ses_mean.loc[ses_mean.hemisphere == 'R'].set_index(
        ['parameter', 'names', 'subject']).drop('hemisphere', axis=1) -
        ses_mean.loc[ses_mean.hemisphere == 'L'].set_index(
            ['parameter', 'names', 'subject']).drop('hemisphere', axis=1)).reset_index()
    difference = difference.groupby(['parameter', 'names']).\
        agg(lambda x: ttest(x, 0)[p_or_t]).reset_index()
    mag = ses_mean.groupby(['parameter', 'names', 'hemisphere']).\
        agg(lambda x: ttest(x, 0)[p_or_t]).reset_index()  # !! t-test across subjects
    average = mag.groupby(['parameter', 'names']).mean().reset_index()  # !! average across hemispheres
    average = average.pivot(columns='parameter', index='names', values=marker)
    left_H = mag.loc[mag.hemisphere == 'L'].\
        pivot(columns='parameter', index='names', values=marker)
    right_H = mag.loc[mag.hemisphere == 'R'].\
        pivot(columns='parameter', index='names', values=marker)
    difference = difference.pivot(columns='parameter', index='names', values=marker)
    return average, left_H, right_H, difference

def surface_plot_data(grouped_df, lateral_params, marker='coef_'):
    '''
    1. Average across sessions
    2a. Ttest per parameter, roi & hemisphere
    2b. Average across hemispheres
        --> magnitude values of coef_ per ROI
    3a. difference between hemispheres (contra - ipsi)
    3b. average across conditions (response left & right)
    3c. ttest across subjects
        --> lateralization values of coef_ for response / rresp
    '''
    df = grouped_df
    ses_mean = df.groupby(['subject', 'parameter', 'names',
                           'hemisphere']).mean().reset_index()
    mag = ses_mean.groupby(['parameter', 'names', 'hemisphere'
                            ]).agg(lambda x: ttest(x, 0)[1]).reset_index()
    mag = mag.groupby(['parameter', 'names']).mean().reset_index()
    mag = mag.pivot(columns='parameter', index='names', values=marker)
    for lateral_param in lateral_params:
        lat = ses_mean.loc[ses_mean.parameter.isin([
            '{}_left'.format(lateral_param), '{}_right'.format(lateral_param)
        ])]
        lat.set_index(['subject', 'parameter', 'names', 'hemisphere', 'labs'],
                      inplace=True)
        lat = lat.groupby(['parameter'],
                          group_keys=False).apply(lateralize).reset_index()
        lat = lat.groupby(['names', 'subject']).mean().reset_index()
        lat = lat.groupby(['names'
                           ]).agg(lambda x: ttest(x, 0)[1]).reset_index()
        mag['{}_lat'.format(lateral_param)] = lat[marker].values
    return mag

def kmeans(self, unscaled_data, k, random_state=42):
    """
    Get feature mean and feature description for each cluster.

    Params:
        unscaled_data: pandas dataframe of shape (n_samples, n_features) for
            data with outliers removed and not scaled
        k: number of clusters
        random_state: random_state of kmeans
    Returns:
        num_out, sum2
    """
    kmean = KMeans(n_clusters=k, random_state=random_state)
    k_cluster = kmean.fit_predict(self.data)
    unscaled_data['Cluster'] = k_cluster
    num_out = unscaled_data.groupby('Cluster').mean()
    sum2 = pd.DataFrame(np.zeros(num_out.shape),
                        columns=num_out.columns, index=num_out.index)
    for i in range(num_out.shape[0]):
        for j in range(num_out.shape[1]):
            var = unscaled_data.columns[j]
            in_cluster = unscaled_data.loc[unscaled_data['Cluster'] == i, var]
            overall = unscaled_data.loc[:, var]
            # Welch's t-test of the cluster against the full sample; label the
            # feature High/Low only when the difference is significant.
            pval = ttest(in_cluster, overall, equal_var=False)[1]
            if pval < 0.001 and in_cluster.mean() > overall.mean():
                sum2.iloc[i, j] = 'High'
            elif pval < 0.001 and in_cluster.mean() < overall.mean():
                sum2.iloc[i, j] = 'Low'
            else:
                sum2.iloc[i, j] = 'Average'
    return num_out, sum2

def run_ttest(calls, prty):
    call_list = _filter_priority(calls, prty)
    return pd.Series([
        ttest(count_it(call_list, 2015, prty), count_it(call_list, 2016, prty))[1],
        ttest(count_it(call_list, 2016, prty), count_it(call_list, 2017, prty))[1]
    ], index=[2016, 2017], name=prty)

def experiment(true_exp_mean: float, true_control_mean: float,
               inter_day_SD: float, intra_day_SD: float,
               N_clusters: int, N_per_cluster: int,
               data_method: str = 'pool', ttest_method: bool = True,
               show_figure: bool = False):
    """
    Generate data and process it. There are several types of processing; by
    default a simple t-test is run on the pooled data (ignoring clustering).

    INPUT:
        1) the parameters for data generation
        2) data_method = {'pool', 'cluster_means'}, optional
            choose the type of data to process further
            (if 'pool', use the pooled data;
             if 'cluster_means', use the means of clusters)
        3) ttest_method: bool, optional
            choose what type of t-test to apply
            (if True, use the simple t-test; else use the adjusted t-test)
        4) show_figure: bool, optional
            decide if you want to see a figure of your data; off by default
    OUTPUT:
        hypothesis and p-value of the experiment result
    EXAMPLE_OF_USE:
        experiment(1, 1, 0.1, 0.2, 3, 5)
    """
    # generate a matrix of data
    data_exp = generate_data(true_exp_mean, inter_day_SD, intra_day_SD,
                             N_clusters, N_per_cluster)
    data_control = generate_data(true_control_mean, inter_day_SD, intra_day_SD,
                                 N_clusters, N_per_cluster)
    # do the processing
    # process_data()  # FIXME
    # ipdb.set_trace()
    if data_method == 'pool':
        # use pooled data for processing
        data_exp_pooled = data_exp.reshape(-1).tolist()  # pool the data into a list
        data_control_pooled = data_control.reshape(-1).tolist()
        # print(data_exp, data_control)
        if ttest_method:
            t, p_value = ttest(data_exp_pooled, data_control_pooled)  # use simple t-test
        else:
            # use adjusted t-test
            t, p_value = adj_ttest(N_per_cluster, N_clusters, inter_day_SD,
                                   intra_day_SD, data_exp_pooled, data_control_pooled)
    elif data_method == 'cluster_means':
        # use means of clusters for processing
        data_exp_mean = data_exp.mean(axis=0)
        data_control_mean = data_control.mean(axis=0)
        if ttest_method:
            t, p_value = ttest(data_exp_mean, data_control_mean)  # t-test on cluster means
        else:
            print("can't do adjusted t-test on means of clusters. Need pooled data")
            return
    # display data
    if show_figure:
        display_data(data_exp, data_control, N_clusters, N_per_cluster)
    return t, p_value

def make_comparisons(df, fi, cond_di, out_path):
    '''
    Open the comparison file, where each line is a comparison to do. For
    example, to compare treatmentA to control, the comparison file line would
    be:

        treatmentA,control

    The comparison divides the average treatmentA value by the average
    control value.
    '''
    with open(fi, 'r') as f:
        li = [l for l in f.read().split('\n') if l]
    tdf = pd.DataFrame()
    for comp in li:
        cdf = pd.DataFrame()
        n, d = comp.split(',')
        name = 'vs'.join([n, d])
        # Calculate fold change
        cdf['foldChange'] = df['AVG_{}'.format(n)] / df['AVG_{}'.format(d)]
        # Run t-test to get p-value
        # - will spit errors if expression is all zeros, but these can be ignored
        cdf['pValue'] = ttest(df[cond_di[n]], df[cond_di[d]],
                              axis=1, equal_var=True)[1]
        cdf = cdf.replace([np.inf, np.nan], 'NA')
        tdf = pd.concat([tdf, cdf], axis=1)
        cdf['AVG_{}'.format(n)] = df['AVG_{}'.format(n)]
        cdf['AVG_{}'.format(d)] = df['AVG_{}'.format(d)]
        cdf = cdf.sort_values(by=['pValue'], ascending=[True])
        cdf.to_csv(path_or_buf='{}/{}.csv'.format(out_path, name), sep=',')
    df = pd.concat([tdf, df], axis=1)
    return df

def get_pval(scores1, scores2):
    scores_joined = scores1.join(scores2, how='inner', lsuffix='_1', rsuffix='_2')
    ttest_results = ttest(scores_joined['score_1'], scores_joined['score_2'])
    pval = ttest_results[1]
    if np.isnan(pval):
        pval = 0
    return pval

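# Hedged usage sketch for get_pval() above, not part of the original source.
# It assumes `ttest` is a paired test such as scipy.stats.ttest_rel (the two
# score frames are joined on their index); the toy DataFrames are illustrative.
import numpy as np
import pandas as pd
from scipy.stats import ttest_rel as ttest

scores1 = pd.DataFrame({'score': [0.71, 0.69, 0.75, 0.73]}, index=list('abcd'))
scores2 = pd.DataFrame({'score': [0.68, 0.66, 0.74, 0.70]}, index=list('abcd'))
print(get_pval(scores1, scores2))
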
def post(self):
    """
    Analyses the results from a data dissemination quiz and gives the result.
    The result can be either that stereotypes were found, or that they
    weren't found.
    """
    validators = {
        "data": valid.validate_dissemination_answers,
    }
    data = valid.validate(valid.read_form_data(request), validators)
    if not data:
        return ANSWERS[400], 400
    data = valid.read_form_data(request)
    question3, block_3_answers = self.get_block_information(2, data)
    question5, block_5_answers = self.get_block_information(4, data)
    t_statistic, p_value = ttest(block_3_answers, block_5_answers, equal_var=False)
    response = DISSEMINATION_NO_ASSOCIATION
    if p_value <= 0.1:
        if t_statistic < 0:
            response = DISSEMINATION_RESULT_MALE
        else:
            response = DISSEMINATION_RESULT_FEMALE
    if 'email' in data and valid.validate_email(data['email']):
        self.send_email(res=response, email=data['email'])
    return response, 200

def summarize(self, question_scores=None, summary_stats=('mean', )):
    if not question_scores:
        question_scores = self.question_scores
    summary = defaultdict(lambda: defaultdict(lambda: defaultdict()))
    for summary_stat in summary_stats:
        print '=========== %s ===========' % summary_stat
        for question, agent_scores in question_scores.iteritems():
            if self.question_type(question) == 'str' or question not in self.questions:
                continue
            for agent, scores in agent_scores.iteritems():
                print agent, np.histogram([x[2] for x in scores], bins=5)[0]
            results = [(agent, self.summarize_scores(scores, summary_stat),
                        self.get_total(scores))
                       for agent, scores in agent_scores.iteritems()]
            results = sorted(results, key=lambda x: x[1][0], reverse=True)
            agent_ratings = {}
            for i, (agent, stat, total) in enumerate(results):
                agent_ratings[agent] = stat[1]
                summary[question][agent]['score'] = stat[0]
                summary[question][agent]['sem'] = sem(stat[1]) if len(stat[1]) > 1 else 0
                summary[question][agent]['total'] = total
                summary[question][agent]['ttest'] = ''
            # T-test
            agents = self.agents
            for i in range(len(agents)):
                for j in range(i + 1, len(agents)):
                    try:
                        result = ttest(agent_ratings[agents[i]],
                                       agent_ratings[agents[j]])
                    except KeyError:
                        continue
                    # print agents[i], agents[j], result
                    t, p = result
                    if p < 0.05:
                        if t > 0:
                            win_agent, lose_agent = agents[i], agents[j]
                        else:
                            win_agent, lose_agent = agents[j], agents[i]
                        summary[question][win_agent]['ttest'] += lose_agent[0]
    # Print
    for question, agent_stats in summary.iteritems():
        print '============= %s ===============' % self.question_labels[question]
        print '{:<12s} {:<10s} {:<10s} {:<10s} {:<10s}'.format(
            'agent', 'avg_score', 'error', '#score', 'win')
        print '---------------------------------------'
        for i, agent in enumerate(agents):
            stats = agent_stats[agent]
            try:
                print '{:<12s} {:<10.1f} {:<10.2f} {:<10d} {:<10s}'.format(
                    self.agent_labels[agent], stats['score'], stats['sem'],
                    stats['total'], stats['ttest'])
            except KeyError:
                continue
    return summary

def second_level(first_level_tests):
    mu = np.zeros(shape=(len(list(first_level_tests))))
    t, p = ttest(first_level_tests.values, popmean=mu, axis=0)
    res_df = pd.DataFrame(index=['t', 'p'], columns=list(first_level_tests))
    res_df.loc['t'] = t
    res_df.loc['p'] = p
    return res_df

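# Hedged usage sketch for second_level() above, not part of the original
# source. It assumes `ttest` is scipy.stats.ttest_1samp (one-sample test of
# the first-level values against zero, column-wise); the random data is
# illustrative only.
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp as ttest

rng = np.random.default_rng(1)
first_level_tests = pd.DataFrame(rng.normal(0.2, 1.0, size=(20, 3)),
                                 columns=['roi_a', 'roi_b', 'roi_c'])
print(second_level(first_level_tests))
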
def test_ttest(nrep, nqs):
    pvec = []
    g1 = np.round(5 * np.random.random([nrep, nqs]))
    g2 = np.round(5 * np.random.random([nrep, nqs]))
    for i in range(nqs):
        t, p = ttest(g1[:, i], g2[:, i])
        pvec.append(p)
    return pvec

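# Hedged usage sketch for test_ttest() above, not part of the original source.
# It assumes `ttest` is scipy.stats.ttest_ind. With two random groups the
# p-values are roughly uniform, so about 5% fall below 0.05 by chance alone.
import numpy as np
from scipy.stats import ttest_ind as ttest

pvec = test_ttest(nrep=50, nqs=20)
print(sum(p < 0.05 for p in pvec), 'of', len(pvec), 'comparisons significant by chance')
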
def eval_scores_vs(baseline_scores, model_scores, badness_threshold):
    '''
    Print and display a comparison of two sets of scores against each other.
    '''
    diffs = np.array(model_scores) - np.array(baseline_scores)
    print(np.mean(baseline_scores), np.mean(model_scores))
    print("t-test", ttest(baseline_scores, model_scores))
    print("z-score", np.mean(diffs) / np.std(diffs))
    maxx = np.max(np.abs(diffs))
    print("baseline below thresh", (np.array(baseline_scores) < badness_threshold).mean())
    print("model below thresh", (np.array(model_scores) < badness_threshold).mean())

def pulsevarUpdate(self, acc_vec):
    # --- Test if higher than accthr
    try:
        r = ttest(acc_vec, self.accthr)
    except ZeroDivisionError:
        # Degenerate case (e.g. zero variance): fall back to comparing the
        # raw values element-wise.
        if all(a <= 0 for a in acc_vec):
            return self.pulse_decrease
        elif all(a >= 0 for a in acc_vec):
            return self.pulse_increase
        else:
            return self.pulse_nochange
    if r[0] > 0 and r[1] < self.halfa:
        return self.pulse_increase
    # --- Test if lower than accthr
    r = ttest(acc_vec, max(0.01, self.accthr - 0.15))
    if r[0] < 0 and r[1] < self.halfa:
        return self.pulse_decrease
    # --- Return default
    return self.pulse_nochange

def process_data(data_exp, data_control, N_per_cluster, N_clusters,
                 inter_cluster_SD, intra_cluster_SD, data_method, ttest_method):
    """
    Process the data. There are several types of processing; by default a
    simple t-test is run on the pooled data (ignoring clustering).

    INPUT:
        1) the parameters for data generation
        2) data_method = {'pool', 'cluster'}, optional
            choose the type of data to process further
            (if 'pool', use the pooled data;
             if 'cluster', use the means of clusters)
        3) ttest_method = {'simple', 'adjusted'}, optional
            choose what type of t-test to apply

    For more information read methods.md
    """
    if data_method == 'pool':
        # use pooled data for processing; pool the data into a flat array
        data_exp_pooled = data_exp.reshape(-1)
        data_control_pooled = data_control.reshape(-1)
        # print(data_exp, data_control)
        if ttest_method == 'simple':
            # use simple t-test
            t, p_value = ttest(data_exp_pooled, data_control_pooled)
        elif ttest_method == 'adjusted':
            # use adjusted t-test
            t, p_value = adj_ttest(N_per_cluster, N_clusters, inter_cluster_SD,
                                   intra_cluster_SD, data_exp_pooled, data_control_pooled)
        else:
            print('insert correct t-test method')
    elif data_method == 'cluster':
        # use means of clusters for processing
        data_exp_mean = data_exp.mean(axis=0)
        data_control_mean = data_control.mean(axis=0)
        if ttest_method == 'simple':
            t, p_value = ttest(data_exp_mean, data_control_mean)
        elif ttest_method == 'adjusted':
            print("can't do adjusted t-test. Need pooled data")
            return
        else:
            print('insert correct t-test method')
    return p_value

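# Hedged usage sketch for process_data() above (simple t-test branch only),
# not part of the original source. It assumes `ttest` is scipy.stats.ttest_ind
# and that the data matrices hold one cluster per column; the numbers are
# illustrative.
import numpy as np
from scipy.stats import ttest_ind as ttest

rng = np.random.default_rng(3)
data_exp = rng.normal(1.2, 0.2, size=(5, 3))       # 5 samples x 3 clusters
data_control = rng.normal(1.0, 0.2, size=(5, 3))
p = process_data(data_exp, data_control, N_per_cluster=5, N_clusters=3,
                 inter_cluster_SD=0.1, intra_cluster_SD=0.2,
                 data_method='pool', ttest_method='simple')
print(p)
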
def run_second_level(group_df):
    mus = np.zeros(len(list(group_df)))
    tvals, pvals = ttest(proj_utils.clean_df_to_numpy(group_df), popmean=mus, axis=0)
    res_df = pd.DataFrame(index=['t_values', 'p_values'], columns=list(group_df))
    res_df.iloc[0] = tvals
    del tvals
    res_df.iloc[1] = pvals
    del pvals
    return res_df

def main():
    cv_scores = pd.read_csv('./aty_best_arch_cv.csv', index_col=0)
    cv_scores.columns = ['model_name', 'cv_loss', 'cv_std']
    best_assay = cv_scores[cv_scores['cv_loss'] == cv_scores['cv_loss'].min()]
    equal_combinations = []
    for index, combin in cv_scores.iterrows():
        # print(combin)
        # `ttest` here takes summary statistics (mean, std, n) for each group,
        # presumably in the scipy.stats.ttest_ind_from_stats signature.
        t, p = ttest(best_assay['cv_loss'], best_assay['cv_std'], 10,
                     combin['cv_loss'], combin['cv_std'], 10)
        if p >= 0.05:
            equal_combinations.append(combin)
    print(best_assay)
    print(pd.DataFrame(equal_combinations).sort_values(['cv_loss']))

def run_test(self, num_timesteps_back, alpha=0.05):
    results = []
    for fn, data in self.stats.items():
        try:
            adf_res = adf(data[-num_timesteps_back:])[1] < alpha
        except ValueError as e:
            adf_res = None
        try:
            ttest_res = ttest(
                data[int(-num_timesteps_back):int(-num_timesteps_back / 2)],
                data[int(-num_timesteps_back / 2):])[1] > alpha
        except ValueError as e:
            ttest_res = None
        results.append(adf_res and ttest_res)
    return np.all(results)

def pulsevarUpdate(self, acc_vec):
    # --- Test if mean(acc_vec) is equal to accthr
    try:
        r = ttest(acc_vec, self.accthr)
    except ZeroDivisionError:
        # Degenerate case (e.g. zero variance): fall back to comparing the
        # raw values element-wise.
        if all(a <= 0 for a in acc_vec) and self.allow_pdecr:
            return self.pulse_decrease
        elif all(a >= 0 for a in acc_vec) and self.allow_pincr:
            return self.pulse_increase
        else:
            return self.pulse_nochange
    # ---
    if r[1] >= self.halfa:
        return self.pulse_nochange
    if r[0] > 0 and self.allow_pincr:
        return self.pulse_increase
    if r[0] < 0 and self.allow_pdecr:
        return self.pulse_decrease
    # --- Return default
    return self.pulse_nochange

def H4_testing(exp_names, trials, load_dir):
    write_file = 'hLogs/H4.txt'
    mean_sims = []
    std_sims = []
    for name in exp_names:
        sims = []
        for i in range(trials):
            print(i)
            load_file = load_dir + name + str(i) + '.pkl'
            with open(load_file, 'rb') as f:
                if 'ST' in name:
                    pop, _, _, _, _, _, _, _ = pickle.load(f)
                else:
                    pop, _, _, _, _, _, _ = pickle.load(f)
            sim = get_trainset_similarity(pop)
            sims.append(sim)
        mean_sim = np.mean(sims)
        mean_sims.append(mean_sim)
        std_sim = np.std(sims)
        std_sims.append(std_sim)
        if 'DIF_SF' in name:
            mean_sim_naive = mean_sim
            std_sim_naive = std_sim
    for i in range(len(exp_names)):
        name = exp_names[i]
        mean_sim = mean_sims[i]
        std_sim = std_sims[i]
        if 'DIF' in name:
            with open(write_file, 'a+') as f:
                f.write(name + '\n')
                f.write('Mean Hausdorffs: {} \n'.format(mean_sim))
                f.write('STD Hausdorffs: {} \n'.format(std_sim))
                f.write('\n')
        if 'DIT' in name:
            pct_improve = -(mean_sim - mean_sim_naive) / mean_sim_naive
            pval_naive = ttest(mean_sim, std_sim, trials,
                               mean_sim_naive, std_sim_naive, trials)
            with open(write_file, 'a+') as f:
                f.write(name + '\n')
                f.write('Mean Hausdorffs: {} \n'.format(mean_sim))
                f.write('STD Hausdorffs: {} \n'.format(std_sim))
                f.write('Mean Percent Improvement: {} \n'.format(pct_improve))
                f.write('Pval Naive: {} \n'.format(pval_naive))
                f.write('\n')

def update_stats(x_val, y_val, n):
    confidence = [.9, .95, .97, .99]
    a = round(1 - confidence[n], 2)
    df = pd.read_csv("../data/style_plus_beer.csv")
    df_style_abv = df[['Super Style', 'abv']]
    df_x = df_style_abv[df_style_abv['Super Style'] == x_val]
    df_y = df_style_abv[df_style_abv['Super Style'] == y_val]
    results = ttest(np.array(df_x['abv'].dropna()),
                    np.array(df_y['abv'].dropna()), equal_var=False)
    p_value = round(results[1], 4)
    if p_value > a:
        output_p = (f"p = {p_value}, which is greater than our alpha value of {a}, "
                    f"therefore we fail to reject the null hypothesis. We are "
                    f"{round(confidence[n]*100, 0)}% confident that there is no "
                    f"difference between the average values of the 2 populations.")
    else:
        output_p = (f"p = {p_value}, which is less than our alpha value of {a}, "
                    f"therefore we reject the null hypothesis. We are "
                    f"{round(confidence[n]*100, 0)}% confident that there is a "
                    f"statistical difference between the average values of the 2 populations.")
    return output_p

def summarize(question_scores):
    summary = defaultdict(lambda: defaultdict(lambda: defaultdict()))
    for summary_stat in ('mean', ):
        # print '=========== %s ===========' % summary_stat
        for question, agent_scores in question_scores.iteritems():
            if question == 'comments' or question.endswith('text'):
                continue
            results = [(agent, summarize_scores(scores, summary_stat), get_total(scores))
                       for agent, scores in agent_scores.iteritems()]
            results = sorted(results, key=lambda x: x[1][0], reverse=True)
            agent_ratings = {}
            for i, (agent, stat, total) in enumerate(results):
                agent_ratings[agent] = stat[1]
                summary[question][agent]['score'] = stat[0]
                summary[question][agent]['total'] = total
                summary[question][agent]['ttest'] = ''
            # T-test
            agents = ('human', 'rulebased', 'static-neural', 'dynamic-neural')
            for i in range(len(agents)):
                for j in range(i + 1, len(agents)):
                    result = ttest(agent_ratings[agents[i]], agent_ratings[agents[j]])
                    # print agents[i], agents[j], result
                    t, p = result
                    if p < 0.05:
                        if t > 0:
                            win_agent, lose_agent = agents[i], agents[j]
                        else:
                            win_agent, lose_agent = agents[j], agents[i]
                        summary[question][win_agent]['ttest'] += lose_agent[0]
    # Print
    agent_labels = ('Human', 'Rule-based', 'StanoNet', 'DynoNet')
    for question, agent_stats in summary.iteritems():
        print '============= %s ===============' % question.upper()
        print '{:<12s} {:<10s} {:<10s} {:<10s}'.format('agent', 'avg_score', '#score', 'win')
        print '---------------------------------------'
        for i, agent in enumerate(agents):
            stats = agent_stats[agent]
            print '{:<12s} {:<10.1f} {:<10d} {:<10s}'.format(
                agent_labels[i], stats['score'], stats['total'], stats['ttest'])
    return summary

def main():
    cv_scores = pd.read_csv('./aty_best_arch_cv.csv', index_col=0)
    cv_scores.columns = ['model_name', 'cv_loss', 'cv_std']
    ## Read the aty_best_arch_cv file, label the three columns 'model_name',
    ## 'cv_loss' and 'cv_std', and assign the result to cv_scores.
    best_assay = cv_scores[cv_scores['cv_loss'] == cv_scores['cv_loss'].min()]
    ## best_assay is a DataFrame holding the model with the smallest cv_loss.
    ## equal_combinations starts as an empty list.
    equal_combinations = []
    for index, combin in cv_scores.iterrows():
        ## Iterate through every row of the DataFrame.
        t, p = ttest(best_assay['cv_loss'], best_assay['cv_std'], 10,
                     combin['cv_loss'], combin['cv_std'], 10)
        ## A t-test from summary statistics is performed for each row against the
        ## row with the smallest cv_loss, assuming both data sets have equal
        ## variance, i.e. they passed the null hypothesis test (F-test).
        if p >= 0.05:
            equal_combinations.append(combin)
            ## The row is added to the list if it is not significantly different
            ## from best_assay according to the t-test.
    print(best_assay)
    print(pd.DataFrame(equal_combinations).sort_values(['cv_loss']))

def differential_analysis(test_norm, control_norm):
    '''
    Calculate fold change on log-transformed data.
    ========
    Parameters:
        test_norm: pandas.DataFrame, a dataframe of normalized test data,
            rows as cells and columns as features.
        control_norm: pandas.DataFrame, a dataframe of normalized control data,
            rows as cells and columns as features.
    '''
    report = pd.DataFrame(
        columns=['logFC', 'T_pValue', 'KS_pValue', 'adj_T_pVal', 'adj_KS_pVal'])
    for feature in control_norm.columns:
        fc = test_norm[feature].mean() - control_norm[feature].mean()
        pval = ttest(control_norm[feature], test_norm[feature])
        ks_pval = ks_2samp(control_norm[feature], test_norm[feature])[1]
        report.loc[feature, 'logFC'] = np.round(fc, 2)
        report.loc[feature, 'T_pValue'] = pval[1]
        report.loc[feature, 'KS_pValue'] = ks_pval
    report['adj_T_pVal'] = fdr(report.T_pValue)[1]
    report['adj_KS_pVal'] = fdr(report.KS_pValue)[1]
    return report

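# Hedged usage sketch for differential_analysis() above, not part of the
# original source. It assumes `ttest` is scipy.stats.ttest_ind, `ks_2samp` is
# scipy.stats.ks_2samp, and `fdr` is statsmodels.stats.multitest.fdrcorrection;
# the toy matrices are random and purely illustrative.
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind as ttest, ks_2samp
from statsmodels.stats.multitest import fdrcorrection as fdr

rng = np.random.default_rng(2)
control_norm = pd.DataFrame(rng.normal(0.0, 1.0, size=(100, 3)),
                            columns=['geneA', 'geneB', 'geneC'])
test_norm = pd.DataFrame(rng.normal(0.3, 1.0, size=(100, 3)),
                         columns=['geneA', 'geneB', 'geneC'])
print(differential_analysis(test_norm, control_norm))
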
def H1_testing(exp_names, trials, load_dir):
    write_file = 'hLogs/H1.txt'
    for name in exp_names:
        init_scores = []
        final_scores = []
        pct_improves = []
        for i in range(trials):
            load_file = load_dir + name + str(i) + '.pkl'
            with open(load_file, 'rb') as f:
                if 'ST' in name:
                    _, _, learn_curve, _, _, _, _, _ = pickle.load(f)
                else:
                    _, learn_curve, _, _, _, _, _ = pickle.load(f)
            init_score = learn_curve[0]
            final_score = learn_curve[-1]
            diff_score = final_score - init_score
            pct_improve = diff_score / init_score
            init_scores.append(init_score)
            final_scores.append(final_score)
            pct_improves.append(pct_improve)
        mean_init = np.mean(init_scores)
        std_init = np.std(init_scores)
        mean_final = np.mean(final_scores)
        std_final = np.std(final_scores)
        # Aggregate over the list of per-trial improvements (the original used
        # the loop variable `pct_improve` here, i.e. only the last trial).
        mean_pct_improve = -np.mean(pct_improves)
        std_pct_improve = np.std(pct_improves)
        p_diff = ttest(mean_final, std_final, trials, mean_init, std_init, trials)
        with open(write_file, 'a+') as f:
            f.write(name + '\n')
            f.write('Mean Initial Score: {} \n'.format(mean_init))
            f.write('STD Initial Score: {} \n'.format(std_init))
            f.write('Mean Final Score: {} \n'.format(mean_final))
            f.write('STD Final Score: {} \n'.format(std_final))
            f.write('Mean Percent Improvement: {} \n'.format(mean_pct_improve))
            f.write('STD Percent Improvement: {} \n'.format(std_pct_improve))
            f.write('Pval Initial-Final Difference: {} \n'.format(p_diff))
            f.write('\n')

def postPreTTest(self, df, isTreatmentGroup=1, equal_var=True, groupbyidentifier=True):
    """
    Run a t-test on the pre group versus the post group, grouped by the
    identifier and taking the mean. We are comparing the means of each
    identifier before and after the test.
    """
    postTreatmentDataFrame, preTreatmentDataFrame = \
        self._ttest_with_identifier_aggregation_setup(df, isTreatmentGroup)
    if groupbyidentifier:
        postTreatmentDataFrame = postTreatmentDataFrame.groupby(["identifier"]).mean()
        preTreatmentDataFrame = preTreatmentDataFrame.groupby(["identifier"]).mean()
    testStatistic, pValue = ttest(
        np.array(postTreatmentDataFrame["kpi"]),
        np.array(preTreatmentDataFrame["kpi"]),
        equal_var=equal_var
    )
    estimate = np.mean(np.array(postTreatmentDataFrame["kpi"])) - \
        np.mean(np.array(preTreatmentDataFrame["kpi"]))
    return {"test statistic": testStatistic,
            "p-value": pValue,
            "estimate": estimate}

def sampleDiff(sa, sb, fcmax, fcmin, size=1000):
    """
    Sample cell numbers down to a specific size and get the differential test results.
    @param sa: np.array
    @param sb: np.array
    @param size: int
    """
    # sampling
    csa = list(range(len(sa)))
    csb = list(range(len(sb)))
    random.shuffle(csa)
    random.shuffle(csb)
    csa = csa[:size]
    csb = csb[:size]
    # estimating
    a = sa[csa]
    b = sb[csb]
    ea = a[a > 0]
    eb = b[b > 0]
    ea = float(len(ea)) / len(a)
    eb = float(len(eb)) / len(b)
    t, p = ttest(a, b)
    ma = np.mean(a)
    mb = np.mean(b)
    fc = np.log2(ma) - np.log2(mb)
    if fc > fcmax:
        fc = fcmax
    if fc < fcmin:
        fc = fcmin
    rs = {
        "fc": fc,
        "meanExp": ma,
        "expRatio": ea,
        "meanExpOthers": mb,
        "expRatioOthers": eb,
        "p-value": p,
        "expRatioDiff": ea - eb
    }
    rs = pd.Series(rs)
    return rs

def checkDiff(mata, matb, fout, fccut=1, pcut=1e-5, expr=0.2):
    """
    @param mata: pd.DataFrame, matrix for cluster a
    @param matb: pd.DataFrame, matrix for other clusters
    @param fccut: float, log2 fold change cutoff
    @param pcut: float, t-test p-value cutoff
    @param expr: float, expressed cells
    """
    s = mata.sum(axis=1)
    s = s[s >= s.median()]
    ns = s.index
    rs = {}
    ts = []
    for n in tqdm(mata.index):
        a = mata.loc[n, ]
        b = matb.loc[n, ]
        ea = a[a > 0]
        eb = b[b > 0]
        ea = float(len(ea)) / len(a)
        eb = float(len(eb)) / len(b)
        t, p = ttest(a, b)
        ma = np.mean(a)
        mb = np.mean(b)
        fc = np.log2(ma) - np.log2(mb)
        rs[n] = {"fc": fc, "meanExp": ma, "expRatio": ea,
                 "meanExpOthers": mb, "expRatioOthers": eb,
                 "p-value": p, "expRatioDiff": ea - eb}
        if fc > fccut and p < pcut and ea >= expr and n in ns:
            rs[n]["sig"] = 1
            ts.append(n)
        else:
            rs[n]["sig"] = -1
    rs = pd.DataFrame(rs).T
    rs = rs.fillna(0)
    rs.to_csv(fout + ".txt", sep="\t")
    s = rs.loc[ts, "fc"]
    s = s.sort_values(ascending=False)
    print(s)
    with open(fout + ".list", "w") as fo:
        ns = [t.split("|")[0] for t in s.index]
        fo.write("\n".join(ns))
    return s.index

def p_table(data, interest, reference, intcap='Conditions of interest', refcap='',
            caption='', label='', width=0.9, mode='rel', means_only=True):
    # make a LaTeX table with t-test p-values
    if mode == 'rel':
        from scipy.stats import ttest_rel as ttest
    #~ if mode == 'ind':
    #~     from scipy.stats import ttest_ind as ttest
    len_compare = len(reference) - 1
    len_interest = len(interest) - 1
    table_form = '{r|' + 'Y|' * len_compare + '}'
    first_line = intcap + '& \\multicolumn{' + str(len_compare) + '}{c}{' + refcap + '}\\\\\n'
    second_line = ('&' + '&'.join([str(i) for i in reference[1:]]) + '\\\\\n' +
                   '\\cline{2-' + str(len(reference)) + '}\n')
    end_tabular = '\\end{tabularx}\n'
    caption = '\\caption{' + caption + '}\n'
    label = '\\label{' + label + '}\n'
    footer = '\\end{center}\n \\end{table}'
    latex = ('\\begin{table}[!htbp]\n \\begin{center}\n \\begin{tabularx}{' +
             str(width) + '\\textwidth}' + table_form)
    latex += first_line
    latex += second_line
    for i in interest[1:]:
        line = i
        for r in reference[1:]:
            cell = '&' + tex_nr(ttest(
                data[(data[interest[0]] == i)].groupby('ID')['RT'].mean(),
                data[(data[reference[0]] == r)].groupby('ID')['RT'].mean())[1])
            line += cell
        line += '\\\\\n'
        latex += line
    latex += end_tabular
    latex += caption
    latex += label
    latex += footer
    return latex

# using model_comparison without genre as a feature
recall, precision, f1_score = model_comparison(
    features, label_to_numbers(labels, dataset_class_histogram(dataset)),
    models, parameters_to_optimize, False)

# printing results without genre as a parameter
print ("without genre as a parameter \n Score \n")
print np.average(f1_score, axis=0)
if len(models) > 2:
    # print [f1_score[:,i].T for i in range(len(models))]
    print "Anova: ", f_oneway(*f1_score.T)[1]
else:
    print "T-test:", ttest(f1_score[:, 0].T, f1_score[:, 1].T)[1]

# # using model_comparison with genre as a feature
# recall, precision, f1_score = model_comparison(features, label_to_numbers(labels, dataset_class_histogram(dataset)), models, parameters_to_optimize, True)
# # printing results with genre as a parameter
# print ("genre is a feature \n")
# print np.average(f1_score, axis=0)
# if len(models) > 2:
#     # print [f1_score[:,i].T for i in range(len(models))]
#     print "Anova: ", f_oneway( *f1_score.T )[1]
# else:

burns_slice = burns_slice[['KEY', 'AGE', 'age_bins', 'DIED', 'DX_BURN']]
burns_agerangegrouping = burns_slice[['AGE', 'age_bins', 'DIED']].dropna().\
    sort_values('AGE').groupby('age_bins')
burns_deathrates_proto1 = burns_agerangegrouping.aggregate([np.sum, 'count'])
burns_deathrates_proto1['DIED']

# create death rate variable
burns_deathrates_proto1['death_rate'] = (burns_deathrates_proto1['DIED']['sum'] /
                                         burns_deathrates_proto1['DIED']['count'])

# plot
f, ax = plt.subplots(figsize=(6, 5))
agerange_mortality_plot = sns.barplot(x=burns_deathrates_proto1.index,
                                      y=burns_deathrates_proto1['death_rate'],
                                      color='maroon')
agerange_mortality_plot.axes.set(
    title="Mortality rate by age range (CA only, burn victims only)",
    xlabel='Age ranges', ylabel='mortality rate')

# do ttest on deathrate by age range
# NOTE: scipy.stats has no `ttest` function; this call needs to be replaced
# with a concrete test (e.g. stats.ttest_1samp against a chosen population
# mean, or stats.f_oneway across the age bins) before it will run.
stats.ttest(burns_deathrates_proto1[['death_rate']])

# or instead
burns_deathrates_proto2 = pd.crosstab(burns_slice.age_bins, burns_slice.DIED)
burns_deathrates_proto2['Total'] = burns_deathrates_proto2[0.0] + burns_deathrates_proto2[1.0]

# plot
sns.set(style="whitegrid")
f, ax = plt.subplots(figsize=(6, 5))
sns.set_color_codes("pastel")
sns.barplot(x=burns_deathrates_proto2['Total'], y=burns_deathrates_proto2.index,
            data=burns_deathrates_proto2, label="Total", color="b")
sns.set_color_codes("muted")
sns.barplot(x=burns_deathrates_proto2[1.0], y=burns_deathrates_proto2.index,

def main():
    with open('/mnt/scratch/noa/pclproj/results/args_lab_data.json') as data_file:
        args = json.load(data_file)
    with open('/mnt/scratch/noa/pclproj/results/labels_objects_dict.json') as data_file:
        labels = json.load(data_file)
    # tot_acc = np.array([0.0, 0.0, 0.0])
    tot_acc = [[], [], []]
    dates_list = ['1601', '1801', '2101', '2701', '0302', '0702', '1202']
    for date_folder in dates_list:
        print date_folder
        true_labels_rgb, predictions_rgb = get_truth_predictions(
            '/home/noa/pcl_proj/experiments/cifar10/every_fifth_view/' + date_folder + '/rgb/deploy.prototxt',
            '/home/noa/pcl_proj/experiments/cifar10/every_fifth_view/' + date_folder + '/rgb/snapshots/_iter_200000.caffemodel',
            args["views_files"], args["coors_files"],
            '/home/noa/pcl_proj/experiments/mean_image_files/' + date_folder + '/mean_image_training_fifth_rgb.binaryproto',
            labels)
        true_labels_hist, predictions_hist = get_truth_predictions(
            '/home/noa/pcl_proj/experiments/cifar10/every_fifth_view/' + date_folder + '/hist/deploy.prototxt',
            '/home/noa/pcl_proj/experiments/cifar10/every_fifth_view/' + date_folder + '/hist/snapshots/_iter_200000.caffemodel',
            args["views_files"], args["coors_files"],
            '/home/noa/pcl_proj/experiments/mean_image_files/' + date_folder + '/mean_image_training_fifth_hist.binaryproto',
            labels)
        true_labels_rgb_hist, predictions_rgb_hist = get_truth_predictions(
            '/home/noa/pcl_proj/experiments/cifar10/every_fifth_view/' + date_folder + '/rgb_hist/deploy.prototxt',
            '/home/noa/pcl_proj/experiments/cifar10/every_fifth_view/' + date_folder + '/rgb_hist/snapshots/_iter_200000.caffemodel',
            args["views_files"], args["coors_files"],
            ['/home/noa/pcl_proj/experiments/mean_image_files/' + date_folder + '/mean_image_training_fifth_hist.binaryproto',
             '/home/noa/pcl_proj/experiments/mean_image_files/' + date_folder + '/mean_image_training_fifth_rgb.binaryproto'],
            labels)
        acc = [0.0, 0.0, 0.0]
        for i in range(len(predictions_rgb)):
            if true_labels_rgb[i] == predictions_rgb[i]:
                acc[0] += 1.0
            if true_labels_hist[i] == predictions_hist[i]:
                acc[1] += 1.0
            if true_labels_rgb_hist[i] == predictions_rgb_hist[i]:
                acc[2] += 1.0
        tot_acc[0].append(acc[0] / len(predictions_rgb))
        tot_acc[1].append(acc[1] / len(predictions_hist))
        tot_acc[2].append(acc[2] / len(predictions_rgb_hist))
        print acc
    print tot_acc
    print 'ttests: '
    statistics_0, p_0 = ttest(np.array(tot_acc[0]), np.array(tot_acc[1]))
    statistics_1, p_1 = ttest(np.array(tot_acc[1]), np.array(tot_acc[2]))
    statistics_2, p_2 = ttest(np.array(tot_acc[0]), np.array(tot_acc[2]))
    print "final: "
    print 'rgb:'
    print np.array(tot_acc[0]).mean()
    print np.array(tot_acc[0]).std()
    print 'hist:'
    print np.array(tot_acc[1]).mean()
    print np.array(tot_acc[1]).std()
    print 'rgb_hist:'
    print np.array(tot_acc[2]).mean()
    print np.array(tot_acc[2]).std()
    print 'rgb vs. hist'
    print p_0
    print 'hist vs. rgb_hist'
    print p_1
    print 'rgb vs. rgb_hist'
    print p_2

def composite(hour):
    pool = multiprocessing.Pool(processes=5)
    file = cnst.MCS_POINTS_DOM  # MCS_TMIN
    path = cnst.network_data + '/figs/LSTA-bullshit/AGU'  # corrected_LSTA/wavelet/large_scale
    hour = hour

    msg = xr.open_dataarray(file)
    msg = msg[(msg['time.hour'] == hour) & (msg['time.minute'] == 0) &
              (msg['time.year'] >= 2006) & (msg['time.year'] <= 2010) &
              (msg['time.month'] >= 6)]
    msg = msg.sel(lat=slice(10.2, 19.3), lon=slice(-9.7, 9.7))

    res = pool.map(file_loop, msg)
    pool.close()
    print('return parallel')
    # res = []
    # for m in msg[0:30]:
    #     out = file_loop(m)
    #     res.append(out)

    res = [x for x in res if x is not None]

    snpos_list = []
    wepos_list = []
    rsnpos_list = []
    rwepos_list = []
    vkernel_list = []
    rkernel_list = []
    vkernel_cnt = []
    rkernel_cnt = []
    lsta_list = []

    for r in res:
        snpos_list.append(np.squeeze(r[0]))
        wepos_list.append(np.squeeze(r[1]))
        rsnpos_list.append(np.squeeze(r[2]))
        rwepos_list.append(np.squeeze(r[3]))
        vkernel_list.append(r[4])
        rkernel_list.append(r[5])
        scales = r[6]
        vkernel_cnt.append(r[7])
        rkernel_cnt.append(r[8])
        lsta_list.append(r[9])

    dic = collections.OrderedDict([('SN-pos', [snpos_list, rsnpos_list]),
                                   ('WE-pos', [wepos_list, rwepos_list]),
                                   ('kernel', [vkernel_list, rkernel_list]),
                                   ('lsta', [lsta_list]),
                                   ('cnt', [vkernel_cnt, rkernel_cnt]),
                                   ('scales', scales)])

    keys = list(dic.keys())

    for l in keys:
        if l == 'scales':
            continue
        (dic[l])[0] = np.squeeze(np.vstack((dic[l])[0]))
        try:
            (dic[l])[1] = np.squeeze(np.vstack((dic[l])[1]))
        except IndexError:
            continue

    dic['nbcores'] = dic['SN-pos'][0].shape[0]
    dic['nbrcores'] = dic['SN-pos'][1].shape[0]

    for l in keys:
        if (l == 'scales') | (l == 'lsta'):
            continue
        a = dic[l][0]
        b = dic[l][1]
        sa = a.shape
        sb = b.shape
        # if 'pos' in l:
        #     a = (a.swapaxes(0,1).reshape(sa[1], sa[0]*sa[2]).T/np.nanstd(dic[l][0], axis=(0,2))).T.reshape(sa[1], sa[0], sa[2]).swapaxes(0,1)
        #     b = (b.swapaxes(0,1).reshape(sb[1], sb[0]*sb[2]).T/np.nanstd(dic[l][1], axis=(0,2))).T.reshape(sb[1], sb[0], sb[2]).swapaxes(0,1)
        nsstat, nspvalue = ttest(a, b, axis=0, equal_var=False, nan_policy='omit')
        mask = nspvalue < 0.05
        dic[l].append(mask)
        if 'pos' in l:
            dic[l].append(np.nanstd(dic[l][0], axis=(0, 2)))
            dic[l].append(np.nanstd(dic[l][1], axis=(0, 2)))

    for l in keys:
        if l == 'scales':
            continue
        if 'pos' in l:
            (dic[l])[0] = np.nanmean((dic[l])[0], axis=0)
            (dic[l])[1] = np.nanmean((dic[l])[1], axis=0)
        else:
            (dic[l])[0] = np.nansum((dic[l])[0], axis=0)
            try:
                (dic[l])[1] = np.nansum((dic[l])[1], axis=0)
            except IndexError:
                continue

    pkl.dump(dic, open(path + "/coeffs_test_nans_stdkernel" + str(hour) + "UTC.p", "wb"))
    print('Save file written!')

import sys
from scipy.stats import ttest_ind as ttest

a = [float(x) for x in open(sys.argv[1])]
b = [float(x) for x in open(sys.argv[2])]
print ttest(a, b)

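# Hedged usage note for the script above, not part of the original source:
# it expects two file paths on the command line, each holding one numeric
# value per line, e.g. `python compare.py group_a.txt group_b.txt`
# (the script and file names here are illustrative only).
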
def composite(hour):
    pool = multiprocessing.Pool(processes=5)
    file = constants.MCS_POINTS_DOM
    path = '/users/global/cornkle/figs/LSTA-bullshit/AGU'  # corrected_LSTA/wavelet/large_scale
    hour = hour

    msg = xr.open_dataarray(file)
    msg = msg[(msg['time.hour'] == hour) & (msg['time.minute'] == 0) &
              (msg['time.year'] >= 2006) & (msg['time.year'] <= 2010) &
              (msg['time.month'] >= 6)]
    msg = msg.sel(lat=slice(10.2, 18.5), lon=slice(-9.7, 9.7))

    res = pool.map(file_loop, msg)
    pool.close()

    # for m in msg[2:5]:
    #     file_loop(m)
    # return

    res = [x for x in res if x is not None]

    snpos_list_dry = []
    wepos_list_dry = []
    rsnpos_list_dry = []
    rwepos_list_dry = []

    snpos_list_wet = []
    wepos_list_wet = []
    rsnpos_list_wet = []
    rwepos_list_wet = []

    for r in res:
        snpos_list_dry.append(np.squeeze(r[0]))
        wepos_list_dry.append(np.squeeze(r[1]))
        rsnpos_list_dry.append(np.squeeze(r[2]))
        rwepos_list_dry.append(np.squeeze(r[3]))
        scales = r[4]
        snpos_list_wet.append(np.squeeze(r[5]))
        wepos_list_wet.append(np.squeeze(r[6]))
        rsnpos_list_wet.append(np.squeeze(r[7]))
        rwepos_list_wet.append(np.squeeze(r[8]))

    dic = collections.OrderedDict([('SN-pos', [snpos_list_dry, rsnpos_list_dry]),
                                   ('WE-pos', [wepos_list_dry, rwepos_list_dry]),
                                   ('SN-pos_wet', [snpos_list_wet, rsnpos_list_wet]),
                                   ('WE-pos_wet', [wepos_list_wet, rwepos_list_wet]),
                                   ('scales', scales)])

    keys = list(dic.keys())

    for l in keys:
        if l == 'scales':
            continue
        (dic[l])[0] = np.squeeze(np.vstack((dic[l])[0]))
        (dic[l])[1] = np.squeeze(np.vstack((dic[l])[1]))

    for l in keys:
        if l == 'scales':
            continue
        nsstat, nspvalue = ttest(dic[l][0], dic[l][1], axis=0,
                                 equal_var=False, nan_policy='omit')
        mask = nspvalue < 0.05
        dic[l].append(mask)

    nsstat, nspvalue = ttest(dic['SN-pos'][0], dic['SN-pos_wet'][0], axis=0,
                             equal_var=False, nan_policy='omit')
    mask = nspvalue < 0.05
    dic['SN-dw_mask'] = mask

    nsstat, nspvalue = ttest(dic['WE-pos'][0], dic['WE-pos_wet'][0], axis=0,
                             equal_var=False, nan_policy='omit')
    mask = nspvalue < 0.05
    dic['WE-dw_mask'] = mask

    for l in keys:
        if l == 'scales':
            continue
        (dic[l])[0] = np.nanmean((dic[l])[0], axis=0)
        (dic[l])[1] = np.nanmean((dic[l])[1], axis=0)

    pkl.dump(dic, open(path + "/test_wet_dry_withzero" + str(hour) + "UTC.p", "wb"))
    print('Save file written!')

def print_stats():
    from scipy.stats import ttest_ind as ttest
    mask = get_mask()

    sar = np.ma.array(myio.read_nc(stats_location + onemm_loc + "SAR-ncep-pr-BC12km_full_res_annual_dryspell.nc").data, mask=mask)
    sdd = np.ma.array(myio.read_nc(stats_location + onemm_loc + "SD-ncep-pr-BC12km_full_res_annual_dryspell.nc").data, mask=mask)
    sdm = np.ma.array(myio.read_nc(stats_location + onemm_loc + "SDmon_c-ncep-pr-BC12km_full_res_annual_dryspell.nc").data, mask=mask)
    ca = np.ma.array(myio.read_nc(stats_location + onemm_loc + "CA-ncep-pr-BC12km_full_res_annual_dryspell.nc").data, mask=mask)
    obs = np.ma.array(myio.read_nc(stats_location + onemm_loc + "obs-maurer.125-pr_full_res_annual_dryspell.nc").data, mask=mask)
    print("AR dryspell= " + str(sar.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sar[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCSDd dryspell= " + str(sdd.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sdd[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCSDm dryspell= " + str(sdm.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sdm[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCCA dryspell= " + str(ca.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(ca[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("Obs dryspell= " + str(obs.mean())[:5])
    print("")

    sar = np.ma.array(myio.read_nc(stats_location + onemm_loc + "SAR-ncep-pr-BC12km_full_res_annual_wetspell.nc").data, mask=mask)
    sdd = np.ma.array(myio.read_nc(stats_location + onemm_loc + "SD-ncep-pr-BC12km_full_res_annual_wetspell.nc").data, mask=mask)
    sdm = np.ma.array(myio.read_nc(stats_location + onemm_loc + "SDmon_c-ncep-pr-BC12km_full_res_annual_wetspell.nc").data, mask=mask)
    ca = np.ma.array(myio.read_nc(stats_location + onemm_loc + "CA-ncep-pr-BC12km_full_res_annual_wetspell.nc").data, mask=mask)
    obs = np.ma.array(myio.read_nc(stats_location + onemm_loc + "obs-maurer.125-pr_full_res_annual_wetspell.nc").data, mask=mask)
    print("AR wetspell= " + str(sar.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sar[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCSDd wetspell= " + str(sdd.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sdd[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCSDm wetspell= " + str(sdm.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sdm[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCCA wetspell= " + str(ca.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(ca[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("Obs wetspell= " + str(obs.mean())[:5])
    print("")

    sar = np.ma.array(myio.read_nc(stats_location + onemm_loc + "SAR-ncep-pr-BC12km_full_res_annual_wetfrac.nc").data, mask=mask)
    sdd = np.ma.array(myio.read_nc(stats_location + onemm_loc + "SD-ncep-pr-BC12km_full_res_annual_wetfrac.nc").data, mask=mask)
    sdm = np.ma.array(myio.read_nc(stats_location + onemm_loc + "SDmon_c-ncep-pr-BC12km_full_res_annual_wetfrac.nc").data, mask=mask)
    ca = np.ma.array(myio.read_nc(stats_location + onemm_loc + "CA-ncep-pr-BC12km_full_res_annual_wetfrac.nc").data, mask=mask)
    obs = np.ma.array(myio.read_nc(stats_location + onemm_loc + "obs-maurer.125-pr_full_res_annual_wetfrac.nc").data, mask=mask)
    print("AR wetfrac= " + str(sar.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sar[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCSDd wetfrac= " + str(sdd.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sdd[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCSDm wetfrac= " + str(sdm.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sdm[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCCA wetfrac= " + str(ca.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(ca[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("Obs wetfrac= " + str(obs.mean())[:5])
    print("")

    sar = np.ma.array(myio.read_nc(stats_location + onemm_loc + "SAR-ncep-pr-BC12km_full_res_annual_MAP.nc").data, mask=mask)
    sdd = np.ma.array(myio.read_nc(stats_location + onemm_loc + "SD-ncep-pr-BC12km_full_res_annual_MAP.nc").data, mask=mask)
    sdd.mask[~np.isfinite(sdd)] = True
    sdd.mask[sdd > 1e5] = True
    sdm = np.ma.array(myio.read_nc(stats_location + onemm_loc + "SDmon_c-ncep-pr-BC12km_full_res_annual_MAP.nc").data, mask=mask)
    sdm.mask[~np.isfinite(sdm)] = True
    sdm.mask[sdm > 1e5] = True
    ca = np.ma.array(myio.read_nc(stats_location + onemm_loc + "CA-ncep-pr-BC12km_full_res_annual_MAP.nc").data, mask=mask)
    obs = np.ma.array(myio.read_nc(stats_location + onemm_loc + "obs-maurer.125-pr_full_res_annual_MAP.nc").data, mask=mask)
    print("AR MAP= " + str(sar.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sar[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCSDd MAP= " + str(sdd.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sdd[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCSDm MAP= " + str(sdm.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sdm[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCCA MAP= " + str(ca.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(ca[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("Obs MAP= " + str(obs.mean())[:5])
    print("")

    sar = np.ma.array(myio.read_nc(stats_location + zeromm_loc + "SAR-ncep-pr-BC12km_full_res_annual_extremes_nday1.nc").data[2, ...], mask=mask)
    sdd = np.ma.array(myio.read_nc(stats_location + zeromm_loc + "SD-ncep-pr-BC12km_full_res_annual_extremes_nday1.nc").data[2, ...], mask=mask)
    sdd.mask[~np.isfinite(sdd)] = True
    sdd.mask[sdd > 1e5] = True
    sdm = np.ma.array(myio.read_nc(stats_location + zeromm_loc + "SDmon_c-ncep-pr-BC12km_full_res_annual_extremes_nday1.nc").data[2, ...], mask=mask)
    sdm.mask[~np.isfinite(sdm)] = True
    sdm.mask[sdm > 1e5] = True
    ca = np.ma.array(myio.read_nc(stats_location + zeromm_loc + "CA-ncep-pr-BC12km_full_res_annual_extremes_nday1.nc").data[2, ...], mask=mask)
    obs = np.ma.array(myio.read_nc(stats_location + zeromm_loc + "obs-maurer.125-pr_full_res_annual_extremes_nday1.nc").data[2, ...], mask=mask)
    print("AR extremes_nday1= " + str(sar.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sar[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCSDd extremes_nday1= " + str(sdd.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sdd[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCSDm extremes_nday1= " + str(sdm.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(sdm[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("BCCA extremes_nday1= " + str(ca.mean())[:5])
    print(' pvalue = {0}'.format(round(ttest(ca[mask == False], obs[mask == False])[1] * 1000.0) / 1000.0))
    print("Obs extremes_nday1= " + str(obs.mean())[:5])
    print("")

def ttest(self, keys, dv, paired=True, collapse=None):
    """
    desc:
        Performs t-tests between groups defined by a list of keys.

    arguments:
        keys:
            desc: A list of keys to define the groups.
            type: list
        dv:
            desc: The dependent variable.
            type: [str, unicode]

    keywords:
        paired:
            desc: Determines whether a paired-samples t-test or an
                  independent-samples t-test should be conducted.
            type: bool
        collapse:
            desc: A key to collapse the data on, so that you can do t-tests
                  on (subject) means.
            type: [str, unicode, NoneType]

    returns:
        desc: A list of (desc, t, p) tuples.
        type: list
    """
    from itertools import combinations
    if paired:
        from scipy.stats import ttest_rel as ttest
    else:
        from scipy.stats import ttest_ind as ttest
    if collapse != None:
        dm = self.collapse(collapse + keys, dv)
        dv = 'mean'
    else:
        dm = self
    _l = [['group', 'N', 'M / t', 'SE / p']]
    lDm = dm.group(keys)
    for l in combinations(lDm, 2):
        group0 = ''
        for key in keys:
            group0 += str(l[0][key][0]) + '_'
        group0 = group0[:-1]
        group1 = ''
        for key in keys:
            group1 += str(l[1][key][0]) + '_'
        group1 = group1[:-1]
        N0 = len(l[0])
        M0 = l[0][dv].mean()
        SE0 = l[0][dv].std() / np.sqrt(len(l[0]))
        _l.append([group0, N0, M0, SE0])
        N1 = len(l[1])
        M1 = l[1][dv].mean()
        SE1 = l[1][dv].std() / np.sqrt(len(l[1]))
        _l.append([group1, N1, M1, SE1])
        t, p = ttest(l[0][dv], l[1][dv])
        _l.append([group0, group1, t, p])
    return DataMatrix(np.array(_l))