def simulate(agent, env, start=None, episodes=1_000_000):
    """
    Simulate an agent in an environment over a number of episodes.

    Args:
        agent (object): an agent with an act(obs, reward, done) method.
        env (object): an OpenAI Gym environment.
        start (object): an optional state to explore from at the start of each
            episode. Defaults to None (use env.reset()).
        episodes (int): the number of episodes to simulate.
            Defaults to 1,000,000.

    Returns:
        A DescrStatsW object with the episodic reward distribution.

    Notes:
        A progress bar is shown.
    """
    hist = defaultdict(int)
    for _ in tqdm(range(episodes)):
        total = 0.
        obs = env.reset() if start is None else env.explore(start)
        reward, done = 0., False
        while True:
            action = agent.act(obs, reward, done)
            obs, reward, done, _ = env.step(action)
            total += reward
            if done:
                hist[total] += 1
                break
    stats = ssw.DescrStatsW(data=list(hist.keys()), weights=list(hist.values()))
    return stats
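
# A minimal, self-contained sketch (editor's addition) of what the DescrStatsW
# object returned by simulate() provides. The reward histogram below is
# synthetic, and the ssw alias is assumed to be statsmodels.stats.weightstats.
import statsmodels.stats.weightstats as ssw

demo_hist = {-1.0: 480, 0.0: 90, 1.0: 410, 1.5: 20}  # hypothetical episode totals -> counts
demo_stats = ssw.DescrStatsW(data=list(demo_hist.keys()),
                             weights=list(demo_hist.values()))
print(demo_stats.mean)                       # frequency-weighted mean reward
print(demo_stats.std)                        # weighted standard deviation
print(demo_stats.tconfint_mean(alpha=0.05))  # 95% t-confidence interval for the mean
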
def t_test_two_samp(a, b, alpha, alternative='two-sided'):
    diff = a.mean() - b.mean()
    # use Welch's t-test (equal_var=False) so the statistic and p-value match
    # the unequal-variance confidence interval and Satterthwaite df below
    res = ss.ttest_ind(a, b, equal_var=False)
    means = ws.CompareMeans(ws.DescrStatsW(a), ws.DescrStatsW(b))
    confint = means.tconfint_diff(alpha=alpha, alternative=alternative, usevar='unequal')
    degfree = means.dof_satt()
    # note: the CI labels assume the default alpha=0.05
    index = ['DegFreedom', 'Difference', 'Statistic', 'PValue', 'Low95CI', 'High95CI']
    return pd.Series([degfree, diff, res[0], res[1], confint[0], confint[1]], index=index)
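
# Hedged usage sketch for t_test_two_samp(); the imports and the two random
# samples below are illustrative, not part of the original source.
import numpy as np
import pandas as pd
import scipy.stats as ss
import statsmodels.stats.weightstats as ws

rng = np.random.default_rng(0)
a = rng.normal(loc=10.0, scale=2.0, size=40)
b = rng.normal(loc=11.0, scale=2.0, size=35)
print(t_test_two_samp(a, b, alpha=0.05))
# expected: a pd.Series with Satterthwaite df, mean difference, t-statistic,
# p-value, and the Welch confidence-interval bounds
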
def test_ttest():
    x1, x2 = clinic[:15, 2], clinic[15:, 2]
    all_tests = []
    t1 = smws.ttest_ind(x1, x2, alternative='larger', usevar='unequal')
    all_tests.append((t1, ttest_clinic_indep_1_g))
    t2 = smws.ttest_ind(x1, x2, alternative='smaller', usevar='unequal')
    all_tests.append((t2, ttest_clinic_indep_1_l))
    t3 = smws.ttest_ind(x1, x2, alternative='smaller', usevar='unequal', value=1)
    all_tests.append((t3, ttest_clinic_indep_1_l_mu))

    for res1, res2 in all_tests:
        assert_almost_equal(res1[0], res2.statistic, decimal=13)
        assert_almost_equal(res1[1], res2.p_value, decimal=13)
        #assert_almost_equal(res1[2], res2.df, decimal=13)

    cm = smws.CompareMeans(smws.DescrStatsW(x1), smws.DescrStatsW(x2))
    ci = cm.tconfint_diff(alternative='two-sided', usevar='unequal')
    assert_almost_equal(ci, ttest_clinic_indep_1_two_mu.conf_int, decimal=13)
    ci = cm.tconfint_diff(alternative='two-sided', usevar='pooled')
    assert_almost_equal(ci, ttest_clinic_indep_1_two_mu_pooled.conf_int, decimal=13)
    ci = cm.tconfint_diff(alternative='smaller', usevar='unequal')
    assert_almost_equal_inf(ci, ttest_clinic_indep_1_l.conf_int, decimal=13)
    ci = cm.tconfint_diff(alternative='larger', usevar='unequal')
    assert_almost_equal_inf(ci, ttest_clinic_indep_1_g.conf_int, decimal=13)

    # test get_compare
    cm = smws.CompareMeans(smws.DescrStatsW(x1), smws.DescrStatsW(x2))
    cm1 = cm.d1.get_compare(cm.d2)
    cm2 = cm.d1.get_compare(x2)
    cm3 = cm.d1.get_compare(np.hstack((x2, x2)))
    # all use the same d1, no copying
    assert_(cm.d1 is cm1.d1)
    assert_(cm.d1 is cm2.d1)
    assert_(cm.d1 is cm3.d1)
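
# Quick illustration (editor's sketch) of the statsmodels API exercised above:
# weightstats.ttest_ind returns (tstat, pvalue, df), unlike scipy's 2-tuple.
import numpy as np
import statsmodels.stats.weightstats as smws

rng = np.random.default_rng(1)
g1, g2 = rng.normal(0, 1, 15), rng.normal(0.5, 1, 15)
tstat, pvalue, dof = smws.ttest_ind(g1, g2, alternative='two-sided', usevar='unequal')
print(tstat, pvalue, dof)  # Welch t-test with Satterthwaite degrees of freedom
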
def setup_class(cls):
    cls.res2 = tost_clinic_paired_1
    x1, x2 = clinic[:15, 2], clinic[15:, 2]
    cls.res1 = Holder()
    res = smws.ttost_paired(x1, x2, -0.6, 0.6, transform=None)
    cls.res1.pvalue = res[0]
    #cls.res1.df = res[1][-1] not yet
    res_ds = smws.DescrStatsW(x1 - x2, weights=None, ddof=0)
    # tost confint 2*alpha TODO: check again
    cls.res1.tconfint_diff = res_ds.tconfint_mean(0.1)
    cls.res1.confint_05 = res_ds.tconfint_mean(0.05)
    cls.res1.mean_diff = res_ds.mean
    cls.res1.std_mean_diff = res_ds.std_mean
    cls.res2b = ttest_clinic_paired_1
def __init__(self):
    self.res2 = tost_clinic_paired_1
    x1, x2 = clinic[:15, 2], clinic[15:, 2]
    self.res1 = Holder()
    # renamed from the old tost_paired / confint_mean API to the current
    # statsmodels names (ttost_paired / tconfint_mean)
    res = smws.ttost_paired(x1, x2, -0.6, 0.6, transform=None)
    self.res1.pvalue = res[0]
    #self.res1.df = res[1][-1] not yet
    res_ds = smws.DescrStatsW(x1 - x2, weights=None, ddof=0)
    # tost confint 2*alpha TODO: check again
    self.res1.confint_diff = res_ds.tconfint_mean(0.1)
    self.res1.confint_05 = res_ds.tconfint_mean(0.05)
    self.res1.mean_diff = res_ds.mean
    self.res1.std_mean_diff = res_ds.std_mean
    self.res2b = ttest_clinic_paired_1
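
# Hedged sketch of the equivalence (TOST) test the two fixtures above exercise:
# ttost_paired returns the overall p-value plus the two one-sided test results.
# The paired samples below are synthetic.
import numpy as np
import statsmodels.stats.weightstats as smws

rng = np.random.default_rng(2)
before = rng.normal(5.0, 1.0, 15)
after = before + rng.normal(0.0, 0.3, 15)  # nearly equivalent paired values
pvalue, res_low, res_upp = smws.ttost_paired(after, before, low=-0.6, upp=0.6)
print(pvalue)  # small p-value -> conclude equivalence within +/-0.6
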
def play(env, episodes=100, hint=False):
    """
    Play an interactive session of blackjack.

    Args:
        env (object): an OpenAI Gym blackjack environment,
            e.g. 'Blackjack-v1'.
        episodes (int): the number of episodes to play. Defaults to 100.
        hint (bool): whether to show the Basic Strategy (Thorp, 1966)
            as a hint. Defaults to False.

    Returns:
        A DescrStatsW object with the episodic reward distribution.

    Notes:
        The user can input either 0/1 or s/h (for stand/hit) as actions.
        If no valid input is entered, the default action is either 0
        (if hint=False) or the basic strategy (if hint=True).
    """
    if hint:
        agent = BasicStrategyAgent(env)
    hist = defaultdict(int)
    for _ in range(episodes):
        total = 0.
        obs, reward, done = env.reset(), 0., False
        while True:
            print(env.render(), end=' ')
            if hint:
                a0 = agent.act(obs, reward, done)
                k = input(f'action: [hint: {action_labels[a0].lower()}] ')
            else:
                a0 = 0
                k = input('action: ')
            try:
                a = Action(int(k))
            except ValueError:
                try:
                    # look the action up by its label; both a missing label
                    # (list.index) and an out-of-range enum value raise ValueError
                    a = Action(action_labels.index(k.upper()))
                except ValueError:
                    a = a0
            obs, reward, done, _ = env.step(a)
            total += reward
            if done:
                hist[total] += 1
                print(env.render())
                break
    stats = ssw.DescrStatsW(data=list(hist.keys()), weights=list(hist.values()))
    return stats
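
# A small, hypothetical helper (editor's sketch) isolating the input-parsing
# fallback used in play(): accept "0"/"1" or the first letter of an action
# label, and fall back to a default action on any invalid input. The Action
# enum and action_labels below are assumed shapes, not the original source.
from enum import Enum

class Action(Enum):  # assumed shape of the game's Action enum
    STAND = 0
    HIT = 1

action_labels = [a.name for a in Action]

def parse_action(key, default):
    try:
        return Action(int(key))                    # numeric input: "0" or "1"
    except ValueError:
        matches = [a for a in Action if a.name.startswith(key.upper())]
        return matches[0] if matches else default  # letter input: "s" or "h"

assert parse_action('1', Action.STAND) is Action.HIT
assert parse_action('s', Action.HIT) is Action.STAND
assert parse_action('?', Action.STAND) is Action.STAND
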
                       'rsq': new_rsq[indices4plot]})

    # sort values by eccentricity
    df = df.sort_values(by=['ecc'])
    bin_size = int(len(df) / n_bins)  # divide into equally sized bins
    mean_ecc = []
    mean_ecc_std = []
    mean_size = []
    mean_size_std = []
    # for each bin calculate rsq-weighted means and errors of binned ecc/size
    for j in range(n_bins):
        bin_df = df[bin_size * j:bin_size * (j + 1)]
        mean_size.append(weightstats.DescrStatsW(bin_df['size'], weights=bin_df['rsq']).mean)
        mean_size_std.append(weightstats.DescrStatsW(bin_df['size'], weights=bin_df['rsq']).std_mean)
        mean_ecc.append(weightstats.DescrStatsW(bin_df['ecc'], weights=bin_df['rsq']).mean)
        mean_ecc_std.append(weightstats.DescrStatsW(bin_df['ecc'], weights=bin_df['rsq']).std_mean)

    if idx == 0:
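
# An equivalent, more compact way (editor's sketch) to compute the rsq-weighted
# bin statistics above with a groupby; weighted_bin_stats is a hypothetical
# helper name, and df is assumed to carry 'ecc'/'size'/'rsq' columns sorted by
# 'ecc' as in the surrounding code.
import numpy as np
import pandas as pd
from statsmodels.stats import weightstats

def weighted_bin_stats(df, col, n_bins):
    bin_size = int(len(df) / n_bins)
    keys = np.arange(bin_size * n_bins) // bin_size  # bin index per row
    out = df.iloc[:bin_size * n_bins].groupby(keys).apply(
        lambda g: pd.Series({
            'mean': weightstats.DescrStatsW(g[col].to_numpy(), weights=g['rsq'].to_numpy()).mean,
            'sem': weightstats.DescrStatsW(g[col].to_numpy(), weights=g['rsq'].to_numpy()).std_mean,
        }))
    return out['mean'].tolist(), out['sem'].tolist()

# usage: mean_size, mean_size_std = weighted_bin_stats(df, 'size', n_bins)
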
# specify the timing of fMRI frames
frame_times = TR * (np.arange(data.shape[-1]))

# create the design matrix, hrf model containing Glover model
design_matrix = make_first_level_design_matrix(frame_times,
                                               events=events_avg,
                                               hrf_model='glover')
regressors = np.array(design_matrix.columns).astype(str)

# make and save average beta weights for different ROIs
for idx, roi in enumerate(ROIs):
    # plot beta weights for single voxel
    fig, axis = plt.subplots(1, figsize=(25, 7.5), dpi=100)

    # get the average values, weighting voxels by the rsq of the fit
    beta_avg = weightstats.DescrStatsW(betas[roi_verts[roi]],
                                       weights=rsq[roi_verts[roi]]).mean

    # weighted standard errors of the mean, computed per regressor
    beta_std = []
    for w in range(len(regressors)):
        beta_pred = np.array([betas[roi_verts[roi]][x][w]
                              for x in range(len(betas[roi_verts[roi]]))])
        beta_std.append(weightstats.DescrStatsW(beta_pred,
                                                weights=rsq[roi_verts[roi]]).std_mean)

    y_pos = np.arange(len(regressors))
    plt.bar(y_pos, beta_avg, yerr=np.array(beta_std), align='center')
    plt.xticks(y_pos, regressors)
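
# Editor's note as a runnable sketch: DescrStatsW accepts a 2-D array and
# applies the 1-D weights row-wise, returning one weighted mean and one
# weighted SEM per column -- i.e., per regressor in the loop above, which is
# why the per-regressor loop is only needed for the error bars here.
import numpy as np
from statsmodels.stats import weightstats

rng = np.random.default_rng(3)
demo_betas = rng.normal(size=(50, 4))  # 50 hypothetical voxels x 4 regressors
demo_rsq = rng.uniform(0.1, 1.0, 50)   # fit quality used as weights
d = weightstats.DescrStatsW(demo_betas, weights=demo_rsq)
print(d.mean)      # shape (4,): weighted mean beta per regressor
print(d.std_mean)  # shape (4,): weighted standard error per regressor
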
def test_mv_mean():
    # names = ['id', 'mpg1', 'mpg2', 'add']
    x = np.asarray([[1.0, 24.0, 23.5, 1.0], [2.0, 25.0, 24.5, 1.0],
                    [3.0, 21.0, 20.5, 1.0], [4.0, 22.0, 20.5, 1.0],
                    [5.0, 23.0, 22.5, 1.0], [6.0, 18.0, 16.5, 1.0],
                    [7.0, 17.0, 16.5, 1.0], [8.0, 28.0, 27.5, 1.0],
                    [9.0, 24.0, 23.5, 1.0], [10.0, 27.0, 25.5, 1.0],
                    [11.0, 21.0, 20.5, 1.0], [12.0, 23.0, 22.5, 1.0],
                    [1.0, 20.0, 19.0, 0.0], [2.0, 23.0, 22.0, 0.0],
                    [3.0, 21.0, 20.0, 0.0], [4.0, 25.0, 24.0, 0.0],
                    [5.0, 18.0, 17.0, 0.0], [6.0, 17.0, 16.0, 0.0],
                    [7.0, 18.0, 17.0, 0.0], [8.0, 24.0, 23.0, 0.0],
                    [9.0, 20.0, 19.0, 0.0], [10.0, 24.0, 22.0, 0.0],
                    [11.0, 23.0, 22.0, 0.0], [12.0, 19.0, 18.0, 0.0]])

    res = smmv.test_mvmean(x[:, 1:3], [21, 21])

    res_stata = Holder(p_F=1.25062334808e-09,
                       df_r=22,
                       df_m=2,
                       F=59.91609589041116,
                       T2=125.2791095890415)

    assert_allclose(res.statistic, res_stata.F, rtol=1e-10)
    assert_allclose(res.pvalue, res_stata.p_F, rtol=1e-10)
    assert_allclose(res.t2, res_stata.T2, rtol=1e-10)
    assert_equal(res.df, [res_stata.df_m, res_stata.df_r])

    # diff of paired sample
    mask = x[:, -1] == 1
    x1 = x[mask, 1:3]
    x0 = x[~mask, 1:3]
    res_p = smmv.test_mvmean(x1 - x0, [0, 0])

    # result Stata hotelling
    res_stata = Holder(T2=9.698067632850247,
                       df=10,
                       k=2,
                       N=12,
                       F=4.4082126,  # not in returned list
                       p_F=0.0424)   # not in returned list

    res = res_p
    assert_allclose(res.statistic, res_stata.F, atol=5e-7)
    assert_allclose(res.pvalue, res_stata.p_F, atol=5e-4)
    assert_allclose(res.t2, res_stata.T2, rtol=1e-10)
    assert_equal(res.df, [res_stata.k, res_stata.df])

    # mvtest means diff1 diff2, zero
    res_stata = Holder(p_F=.0423949782937231,
                       df_r=10,
                       df_m=2,
                       F=4.408212560386478,
                       T2=9.69806763285025)

    assert_allclose(res.statistic, res_stata.F, rtol=1e-12)
    assert_allclose(res.pvalue, res_stata.p_F, rtol=1e-12)
    assert_allclose(res.t2, res_stata.T2, rtol=1e-12)
    assert_equal(res.df, [res_stata.df_m, res_stata.df_r])

    dw = weightstats.DescrStatsW(x)
    ci0 = dw.tconfint_mean(alpha=0.05)
    nobs = len(x)
    ci1 = confint_mvmean_fromstats(dw.mean, np.diag(dw.var), nobs,
                                   lin_transf=np.eye(4), alpha=0.05)
    ci2 = confint_mvmean_fromstats(dw.mean, dw.cov, nobs,
                                   lin_transf=np.eye(4), alpha=0.05)
    assert_allclose(ci1[:2], ci0, rtol=1e-13)
    assert_allclose(ci2[:2], ci0, rtol=1e-13)

    # test from data
    res = smmv.confint_mvmean(x, lin_transf=np.eye(4), alpha=0.05)
    assert_allclose(res, ci2, rtol=1e-13)
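
# Worked check (editor's addition) of the T2 <-> F conversion that the Stata
# reference values above satisfy: F = T2 * (N - k) / (k * (N - 1)).
N, k, T2 = 12, 2, 9.698067632850247
F = T2 * (N - k) / (k * (N - 1))
print(F)  # ~4.4082126, matching res_stata.F for the paired-difference test
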
def RunSimulation2(weekday="Monday", weekend="Saturday"):
    # Initialise list of optimal solution routes
    weekRoutes = pd.read_csv("Data" + sep + "Routes" + sep + "optimalRoutes" + weekday + ".csv",
                             converters={"Route": literal_eval})
    wkndRoutes = pd.read_csv("Data" + sep + "Routes" + sep + "optimalRoutes" + weekend + ".csv",
                             converters={"Route": literal_eval})

    # Get cleaned dataframe
    demandData = pd.read_csv("Data" + sep + "demandData.csv")
    locationData = pd.read_csv("Data" + sep + "FoodstuffLocations.csv")
    demandData = clean_data(demandData, locationData)
    locationData.loc[locationData["Supermarket"] == "Fresh Collective Alberton", "Type"] = "Four Square"

    [a, b, c] = setupbootstrap(demandData, wknd=False)
    [d, e, f] = setupbootstrap(demandData, wknd=True)

    # 1000 runs of the Monte Carlo simulation for each day type
    wopt = []
    opt = []
    for i in range(1000):
        # Travel times between supermarkets + demand predictions per store
        opt.append(simulate(weekRoutes, locationData, a, b, c, wknd=False))
        wopt.append(simulate(wkndRoutes, locationData, d, e, f, wknd=True))

    # Seaborn plots (distplot is deprecated in recent seaborn; histplot/displot is the replacement)
    ax = sns.distplot(wopt, bins=100)
    ax.set_title("Weekend Optimal Solution")
    ax.set_xlabel("Optimal Solution Value ($)")
    ax.set_ylabel("Probability")
    plt.savefig("Pictures/wopt.png")
    plt.show()

    ax = sns.distplot(opt, bins=100)
    ax.set_title("Weekday Optimal Solution")
    ax.set_xlabel("Optimal Solution Value ($)")
    ax.set_ylabel("Probability")
    plt.savefig("Pictures/opt.png")
    plt.show()

    # Calculate confidence intervals for the mean
    optLower, optUpper = sms.DescrStatsW(opt).tconfint_mean(alpha=0.05)
    woptLower, woptUpper = sms.DescrStatsW(wopt).tconfint_mean(alpha=0.05)
    print("Mean and CI of Weekday")
    print(statistics.mean(opt))
    print(optLower, optUpper)
    print("Mean and CI of Weekend")
    print(statistics.mean(wopt))
    print(woptLower, woptUpper)

    # Sorting and getting prediction intervals for the optimal solution
    opt.sort()
    wopt.sort()

    # Histograms for optimal solutions, on weekday and weekend
    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 10))
    ax1.hist(opt, density=True, bins=100, histtype='stepfilled', alpha=0.2)
    ax1.axvline(x=optLower, color='r', linewidth=2)
    ax1.axvline(x=optUpper, color='r', linewidth=2)
    ax1.axvline(x=opt[25], color='b', linewidth=2)
    ax1.axvline(x=opt[975], color='b', linewidth=2)
    ax1.set_title("Weekday Optimal Solution")
    ax1.set_xlabel("Optimal Solution Value ($)")
    ax1.set_ylabel("Probability")
    ax2.hist(wopt, density=True, bins=100, histtype='stepfilled', alpha=0.2)
    ax2.axvline(x=woptLower, color='r', linewidth=2)
    ax2.axvline(x=woptUpper, color='r', linewidth=2)
    ax2.axvline(x=wopt[25], color='b', linewidth=2)
    ax2.axvline(x=wopt[975], color='b', linewidth=2)
    ax2.set_title("Weekend Optimal Solution")
    ax2.set_xlabel("Optimal Solution Value ($)")
    ax2.set_ylabel("Probability")
    plt.savefig("Pictures/Simulation.png")
    plt.show()

    # One-sample t-tests against the sample median
    tstat, pval = stats.ttest_1samp(opt, popmean=opt[500])  # Weekday
    print("The t-statistic is {}, and the p-value for weekdays is {}.".format(tstat, pval))
    tstat, pval = stats.ttest_1samp(wopt, popmean=wopt[500])  # Weekend
    print("The t-statistic is {}, and the p-value for weekends is {}.".format(tstat, pval))
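
# Editor's sketch distinguishing the two intervals drawn above: the t-based
# tconfint_mean is a confidence interval for the MEAN of the simulated values,
# while opt[25]/opt[975] approximate a 95% prediction interval for a single
# run. The opt_demo data below is a synthetic stand-in.
import numpy as np
import statsmodels.stats.api as sms

rng = np.random.default_rng(4)
opt_demo = np.sort(rng.normal(1000.0, 50.0, 1000))  # stand-in for 1000 simulation results
ci_mean = sms.DescrStatsW(opt_demo).tconfint_mean(alpha=0.05)
pred_int = (opt_demo[25], opt_demo[975])
print(ci_mean)   # narrow: uncertainty in the mean shrinks with n
print(pred_int)  # wide: spread of individual simulation outcomes
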
def gather_AAIMONs(pathdict, logging, s):
    """Gathers the AAIMON ratios and slopes for each protein, created by the
    run_calculate_AAIMONs scripts.

    To be compatible with multiprocessing, the run_calculate_AAIMONs script
    creates a separate output summary file for each protein. The gather_AAIMONs
    script simply concatenates all of these files together.

    Note that the gather_AAIMONs script does not do ANY filtering. This is all
    done earlier by run_calculate_AAIMONs. It is assumed that for homologues
    that did not pass the filter (e.g., because they had X in the sequence),
    no AAIMON or AAIMON slope was calculated.

    Parameters
    ----------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
    logging : logging.Logger
        Logger for printing to console and logfile.
    s : dict
        Settings dictionary extracted from excel settings file.

    Saved Files and Figures
    -----------------------
    list_cr_summary_csv : csv
        comma separated csv file with the AAIMON ratios etc
        contains all data within the {}_cr_mean.csv summary file for each protein
    pretty_alignments_csv : csv
        comma separated csv file with the pretty alignments of all the outliers
    data_characterising_each_homol_TMD.pickle : pickle
        Raw AAIMON and % identity (or % aa sub rate) datapoints for all TM of
        all homologues of all proteins.
        Used to create large scatterplot of all datapoints.

    Returns
    -------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
        In special cases, the pathdict is modified.
    """
    logging.info("~~~~~~~~~~~~ starting gather_AAIMONs ~~~~~~~~~~~~")
    df = pd.read_csv(pathdict["list_csv"], sep=",", quoting=csv.QUOTE_NONNUMERIC, index_col=0)
    # drop any proteins without a list of TMDs
    df = df.loc[df['list_of_TMDs'].notnull()].loc[df['list_of_TMDs'] != 'nan']
    # convert list_of_TMDs from string to python list
    df['list_of_TMDs'] = df.list_of_TMDs.apply(lambda x: ast.literal_eval(x))

    ###############################################################
    #                                                             #
    #                      Filter keywords                        #
    #                                                             #
    ###############################################################
    if s['filter_keywords_in_gather']:
        # filter list file by keywords for exclusion analysis, e.g. enzyme only
        list_number = s['list_number']
        # specify allowed and disallowed keywords
        allowed_KW = ast.literal_eval(s['gather_filter_allowed_keywords'])
        disallowed_KW = ast.literal_eval(s['gather_filter_forbidden_keywords'])
        # generate new pathdict
        base_filename_summaries = os.path.join(s["data_dir"], "summaries", '%02d' % list_number,
                                               'List%02d_filtered' % list_number,
                                               ' - '.join(allowed_KW), 'List%02d' % list_number)
        pathdict = korbinian.common.create_pathdict(base_filename_summaries, s)
        # create new folder with new pathdict
        if not os.path.exists(base_filename_summaries[:-7]):
            os.makedirs(base_filename_summaries[:-7])
        # copy keyword column, apply ast.literal_eval to the copied column
        df['KW'] = df['uniprot_KW']
        # apply ast.literal_eval to every item in df['KW']
        if isinstance(df['KW'][0], str):
            df['KW'] = df['KW'].apply(lambda x: ast.literal_eval(x))
        # get list of enzyme keywords
        list_enzyme_KW, list_ignored_KW, PFAM_dict = korbinian.cons_ratio.keywords.get_list_enzyme_KW_and_list_ignored_KW()
        # create column per allowed keyword that holds a bool if keyword is present in that protein
        for KW in allowed_KW:
            if KW == 'Enzyme':
                df['Enzyme'] = df['KW'].apply(korbinian.cons_ratio.keywords.KW_list_contains_any_desired_KW,
                                              args=(list_enzyme_KW,))
            else:
                df[KW] = df['KW'].apply(korbinian.cons_ratio.keywords.KW_list_contains_any_desired_KW,
                                        args=([KW],))
        # create column for every protein holding bool if protein contains at least one of the allowed keywords
        for acc in df.index:
            df.loc[acc, 'keep'] = df.loc[acc, allowed_KW].any()
        # drop all proteins whose keywords do not match the requirements
        df = df.loc[df['keep'] == True]
        # drop all proteins that contain one of the disallowed keywords
        for KW in disallowed_KW:
            if KW == 'Enzyme':
                df['Enzyme'] = df['KW'].apply(korbinian.cons_ratio.keywords.KW_list_contains_any_desired_KW,
                                              args=(list_enzyme_KW,))
            else:
                df[KW] = df['KW'].apply(korbinian.cons_ratio.keywords.KW_list_contains_any_desired_KW,
                                        args=([KW],))
            df = df.loc[df[KW] == False]
        # remove copied and edited keyword list (positional axis argument is deprecated in pandas)
        df = df.drop('KW', axis=1)
        df.to_csv(pathdict["list_csv"], sep=",", quoting=csv.QUOTE_NONNUMERIC)

    #############################################################################
    #                                                                           #
    #       Collate all the "_cr_mean.csv" files into a single dataframe        #
    #                                                                           #
    #############################################################################
    dfg = pd.DataFrame()
    # iterate over the dataframe for proteins with an existing list_of_TMDs
    for acc in df.index:
        protein_name = df.loc[acc, 'protein_name']
        #logging.info(protein_name)
        sys.stdout.write("{}, ".format(acc))
        sys.stdout.flush()
        if not os.path.exists(df.loc[acc, 'homol_cr_ratios_zip']):
            logging.info("{} skipped. homol_cr_ratios_zip does not exist".format(acc))
            continue
        if utils.file_is_old(df.loc[acc, 'homol_cr_ratios_zip'], s["oldest_acceptable_file_date"]):
            os.remove(df.loc[acc, 'homol_cr_ratios_zip'])
            logging.info("{} skipped, file is old and has been deleted".format(acc))
            continue
        # open csv as pandas dataframe (note, it was originally a series, and contains only one column and an index)
        # set delete_corrupt=True so that if the expected csv is not in the zip, the whole zipfile will be deleted
        mean_ser_filename = "{}_cr_mean.csv".format(acc)
        mean_ser = utils.open_df_from_csv_zip(df.loc[acc, 'homol_cr_ratios_zip'],
                                              filename=mean_ser_filename, delete_corrupt=True)
        dfg = pd.concat([dfg, mean_ser], axis=1)

    if dfg.empty:
        raise ValueError("\n\ndfg is an empty dataframe.\n"
                         "This means that none of the proteins had any correctly processed conservation ratios.\n"
                         "Suggest checking the output of all previous steps.")

    # transpose dataframe (flip index and columns)
    dfg = dfg.T.copy()

    # for the OMPdb dataset, there is no uniprot_entry_name
    uniprot_entry_name_in_df = "uniprot_entry_name" in df.columns
    if not uniprot_entry_name_in_df:
        dfg['uniprot_entry_name'] = "OMPdb_dataset"

    # drop any proteins in dfg without a list of TMDs
    dfg = dfg.loc[df['list_of_TMDs'].notnull()].loc[dfg['list_of_TMDs'] != 'nan']
    # if the list_of_TMDs is a stringlist, convert to a python list
    dfg['list_of_TMDs'] = dfg['list_of_TMDs'].dropna().apply(lambda x: ast.literal_eval(x))
    # # for singlepass datasets, leave row blank by default
    # dfg['AAIMON_slope_central_TMDs'] = np.nan

    # CONVERT STRINGS TO FLOATS FOR SELECTED COLUMNS
    # note that after saving dfg to CSV, pandas then gets the dtype correct upon reopening for figs.py etc
    cols_to_convert = ["AAIMON_mean_all_TM_res", "AAIMON_n_mean_all_TM_res",
                       "AAIMON_slope_all_TM_res", "AAIMON_n_slope_all_TM_res", 'AAIMON_n_homol']
    for col in cols_to_convert:
        dfg[col] = pd.to_numeric(dfg[col])

    # print out mean AAIMON values in dataset
    mean_AAIMON_in_dataset = dfg['AAIMON_mean_all_TM_res'].mean()
    mean_AAIMON_n_in_dataset = dfg['AAIMON_n_mean_all_TM_res'].mean()
    mean_AAIMON_slope_in_dataset = dfg['AAIMON_slope_all_TM_res'].mean()
    mean_AAIMON_n_slope_in_dataset = dfg['AAIMON_n_slope_all_TM_res'].mean()
    sys.stdout.write('\n\nmean AAIMON in dataset: {a:.05f}'
                     '\nmean AAIMON_n in dataset: {b:.05f}'
                     '\nmean AAIMON_slope in dataset: {c:.07f}'
                     '\nmean AAIMON_n_slope in dataset: {d:.07f}\n'.format(
                         a=mean_AAIMON_in_dataset, b=mean_AAIMON_n_in_dataset,
                         c=mean_AAIMON_slope_in_dataset, d=mean_AAIMON_n_slope_in_dataset))

    dfg.to_csv(pathdict["list_cr_summary_csv"], sep=",", quoting=csv.QUOTE_NONNUMERIC)

    ########################################################################################
    #                                                                                      #
    #                 Save a huge dataframe with the AAIMONs for                           #
    #                 all homologues of all TMDs of all proteins                           #
    #                                                                                      #
    ########################################################################################
    if s['save_df_characterising_each_homol_TMD']:
        # defining cutoff for max and min number of homologues for each protein
        min_num_homologues = s['min_homol']
        # filter summary file for min and max number of homologues based on TM01 number of homologues
        #sys.stdout.write('Dropped homologues after filtering: \n')
        list_of_acc_to_keep = []
        for acc in dfg.index:
            AAIMON_n_homol = dfg.loc[acc, 'AAIMON_n_homol']
            if AAIMON_n_homol > min_num_homologues:
                list_of_acc_to_keep.append(acc)
        # keep only proteins that have the desired number of homologues
        dfg = dfg.loc[list_of_acc_to_keep, :]
        df = df.loc[list_of_acc_to_keep, :]

        # # convert from string to python list
        # if isinstance(dfg['list_of_TMDs'][0], str):
        #     dfg['list_of_TMDs'] = dfg['list_of_TMDs'].dropna().apply(lambda x: ast.literal_eval(x))

        #sys.stdout.write("\nLoading data\n")
        # initiate empty numpy array
        data = np.empty([0, 3])
        # navigate through filesystem and open pickles from .zip
        n = 0
        for acc in dfg.index:
            n += 1
            if n % 20 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if n % 600 == 0:
                sys.stdout.write('\n')
                sys.stdout.flush()
            protein_name = df.loc[acc, "protein_name"]
            homol_cr_ratios_zip = df.loc[acc, "homol_cr_ratios_zip"]
            # Here we filter to take only datapoints where all TMDs were in the alignment
            AAIMON_all_TMD = protein_name + '_AAIMON_all_TMD.csv'
            df_AAIMON_all_TMD = utils.open_df_from_csv_zip(homol_cr_ratios_zip,
                                                           filename=AAIMON_all_TMD,
                                                           delete_corrupt=False)
            ########################################################################################
            #                                                                                      #
            #     CODE COPIED FROM cons_ratio.py. Delete the following two lines after             #
            #     re-running all calculated cons ratios                                            #
            #                                                                                      #
            ########################################################################################
            # first get a list of all the homologues that have AAIMON ratios for all TMDs
            df_AAIMON_all_TMD["AAIMON_avail_all_TMDs"] = \
                df_AAIMON_all_TMD.n_TMDs_with_measurable_AAIMON == df.loc[acc, "number_of_TMDs"]
            filt_index = df_AAIMON_all_TMD.loc[df_AAIMON_all_TMD["AAIMON_avail_all_TMDs"] == True].index.tolist()
            #filt_index = [int(x) for x in filt_index]

            if not os.path.isfile(homol_cr_ratios_zip):
                # skip to next protein
                continue

            for TMD in df.loc[acc, "list_of_TMDs"]:
                # generate column names necessary for current file
                columns = ['obs_changes', '{}_AAIMON'.format(TMD), '{}_AAIMON_n'.format(TMD)]
                # Open pickle file with conservation-ratios.
                # NOTE that these have already been filtered according to cons_ratio.py.
                # If the homologues were not acceptable, AAIMON ratios WERE NOT CALCULATED.
                TM_cr_pickle = "{}_{}_cr_df.pickle".format(protein_name, TMD)
                # open dataframe with function from korbinian, extract required columns, convert to np array
                df_TMD = utils.open_df_from_pickle_zip(homol_cr_ratios_zip, TM_cr_pickle)
                if columns[2] not in df_TMD.columns:
                    # file is old, and should be deleted
                    logging.info("{} file is presumed out of date, and has been deleted".format(homol_cr_ratios_zip))
                    os.remove(homol_cr_ratios_zip)
                    # skip to next protein
                    break
                if set(filt_index).intersection(set(df_TMD.index)) == set():
                    # there is a mismatch between the filt_index for df_AAIMON_all_TMD, and the columns in df_TMD
                    # replace filt_index with empty list
                    logging.warning("Indexing Error in gather script. "
                                    "set(filt_index).intersection(set(df_TMD.index)) == set(). "
                                    "Try re-running calculate_AAIMON_ratios")
                    filt_index = []
                # use the filt_index above that shows homologues with AAIMON available for all TMDs
                df_TMD = df_TMD.loc[filt_index, :]
                # convert to numpy array (as_matrix was removed in pandas 1.0)
                df_TMD = df_TMD[columns].to_numpy()
                # join output data file with currently opened dataframe
                data = np.concatenate((data, df_TMD))

        # drop every row with nan
        data = data[~np.isnan(data).any(axis=1)]

        # create bins, calculate mean and confidence interval in bin
        sys.stdout.write('\nBinning data - calculating confidence interval\n')
        number_of_bins = s['specify_number_of_bins_characterising_TMDs']
        # process confidence interval value to appropriate input format for function
        confidence_interval = (100 - s['CI']) / 100
        linspace_binlist = np.linspace(1, 100, number_of_bins)
        binned_data = np.empty([0, 8])
        binwidth = 100 / number_of_bins
        for percentage in linspace_binlist:
            bin_for_mean = data[(percentage >= data[:, 0]) & (data[:, 0] > percentage - binwidth)]
            if bin_for_mean.size != 0:
                # calculate conf. interv. in bin, alpha describes the significance level in the style 1-alpha
                conf = sms.DescrStatsW(bin_for_mean[:, 1]).tconfint_mean(alpha=confidence_interval)
                # calculate conf. interv. in bin _n, alpha describes the significance level in the style 1-alpha
                conf_norm = sms.DescrStatsW(bin_for_mean[:, 2]).tconfint_mean(alpha=confidence_interval)
                mean_data_in_bin = np.array([percentage - binwidth / 2,
                                             # calculate mean in bin
                                             bin_for_mean[:, 1].mean(),
                                             # calculate mean in bin _n
                                             bin_for_mean[:, 2].mean(),
                                             # add conf. interv. results to np array
                                             conf[0], conf[1], conf_norm[0], conf_norm[1],
                                             # add the number of TMDs in bin to bin
                                             len(bin_for_mean[:, 0])])
                sys.stdout.write('.')
                sys.stdout.flush()
                binned_data = np.concatenate((mean_data_in_bin.reshape(1, 8), binned_data))

        # # create bins, calculate mean and confidence interval in bin - use multiprocessing if possible
        # sys.stdout.write('\nBinning data - calculating confidence interval\n')
        #
        # use_multiprocessing = s['use_multiprocessing']
        # n_processes = s['multiprocessing_cores']
        # remove_from_binlist = int((1 - s['fa_min_identity_of_full_protein']) * 100)
        # number_of_bins = s['specify_number_of_bins_characterising_TMDs']
        # confidence_interval = (100 - s['CI']) / 100
        # linspace_binlist = np.linspace(1, 100, number_of_bins)[:remove_from_binlist]
        # binned_data = np.empty([0, 8])
        # binwidth = 100 / number_of_bins
        # list_p = []
        # for percentage in linspace_binlist:
        #     data_as_dict = {'data': data, 'percentage': percentage, 'binwidth': binwidth,
        #                     'confidence_interval': confidence_interval}
        #     list_p.append(data_as_dict)
        #
        # if use_multiprocessing:
        #     with Pool(processes=n_processes) as pool:
        #         mean_data_in_bin = pool.map(binning_data_multiprocessing, list_p)
        # else:
        #     mean_data_in_bin = []
        #     for p in list_p:
        #         output = binning_data_multiprocessing(p)
        #         if type(output) is np.ndarray:
        #             mean_data_in_bin.append(output)
        #
        # for n, element in enumerate(mean_data_in_bin):
        #     if type(mean_data_in_bin[n]) is np.ndarray:
        #         binned_data = np.concatenate((mean_data_in_bin[n].reshape(1, 8), binned_data))

        # # create bins, calculate mean and 95% confidence interval
        # sys.stdout.write('\nBinning data - calculating confidence interval\n')
        # confidence_interval = (100 - s['CI'])/100
        # number_of_bins = s['specify_number_of_bins_characterising_TMDs']
        # linspace_binlist = np.linspace(1, 100, number_of_bins)
        # binwidth = 100/number_of_bins
        # binned_data = np.empty([0, 8])
        # # conf_95 = np.array([1, 2])
        # # conf95_norm = np.array([1, 2])
        # for percentage in linspace_binlist:
        #     if percentage % 5 == 0:
        #         sys.stdout.write('{}%, '.format(int(percentage))), sys.stdout.flush()
        #     bin_for_mean = np.empty([0, 3])
        #     for row in data:
        #         if row[0] < percentage and row[0] > percentage - binwidth:
        #             bin_for_mean = np.concatenate((bin_for_mean, row.reshape(1, 3)))
        #     if bin_for_mean.size != 0:
        #         # calculate conf. interv. in bin, alpha describes the significance level in the style 1-alpha
        #         conf = sms.DescrStatsW(bin_for_mean[:, 1]).tconfint_mean(alpha=confidence_interval)
        #         # calculate conf. interv. in bin _n, alpha describes the significance level in the style 1-alpha
        #         conf_norm = sms.DescrStatsW(bin_for_mean[:, 2]).tconfint_mean(alpha=confidence_interval)
        #         mean_data_in_bin = np.array([percentage - binwidth/2,
        #                                      # calculate mean in bin
        #                                      bin_for_mean[:, 1].mean(),
        #                                      # calculate mean in bin _n
        #                                      bin_for_mean[:, 2].mean(),
        #                                      # add conf. interv. results to np array
        #                                      conf[0], conf[1], conf_norm[0], conf_norm[1],
        #                                      # add the number of TMDs in bin to bin
        #                                      len(bin_for_mean[:, 0])])
        #         # merge data from bin to the others
        #         binned_data = np.concatenate((mean_data_in_bin.reshape(1, 8), binned_data))
        # # drop every row containing nan in array
        # binned_data = binned_data[~np.isnan(binned_data).any(axis=1)]

        '''
        description of columns in numpy arrays:

        numpy array data:
        |       0       |   1    |    2     |
        | % obs_changes | AAIMON | AAIMON_n |

        numpy array binned_data:
        |       0       |      1      |       2       |   3    |   4   |    5     |    6    |         7          |
        | % obs_changes | mean AAIMON | mean AAIMON_n | CI_low | CI_hi | CI_low_n | CI_hi_n | number of proteins |
        '''
        # save data and binned_data as zipped pickle
        with zipfile.ZipFile(pathdict['save_df_characterising_each_homol_TMD'],
                             mode="w", compression=zipfile.ZIP_DEFLATED) as zipout:
            # save numpy array "data" as pickle
            with open('data_characterising_each_homol_TMD.pickle', "wb") as f:
                pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
            zipout.write('data_characterising_each_homol_TMD.pickle',
                         arcname='data_characterising_each_homol_TMD.pickle')
            os.remove('data_characterising_each_homol_TMD.pickle')
            # save numpy array "binned_data" as pickle
            with open('binned_data_characterising_each_homol_TMD.pickle', "wb") as f:
                pickle.dump(binned_data, f, protocol=pickle.HIGHEST_PROTOCOL)
            zipout.write('binned_data_characterising_each_homol_TMD.pickle',
                         arcname='binned_data_characterising_each_homol_TMD.pickle')
            os.remove('binned_data_characterising_each_homol_TMD.pickle')

    logging.info("\n~~~~~~~~~~~~ finished gather_AAIMONs ~~~~~~~~~~~~")
    return pathdict
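
# A compact, self-contained restatement (editor's sketch) of the active
# binning loop in gather_AAIMONs: group the (obs_changes, AAIMON, AAIMON_n)
# rows into bins and record means plus t-confidence intervals per bin, in the
# 8-column layout documented above. bin_with_ci is a hypothetical helper name.
import numpy as np
import statsmodels.stats.api as sms

def bin_with_ci(data, number_of_bins=100, ci_percent=95):
    alpha = (100 - ci_percent) / 100
    binwidth = 100 / number_of_bins
    rows = []
    for upper in np.linspace(1, 100, number_of_bins):
        sel = data[(data[:, 0] <= upper) & (data[:, 0] > upper - binwidth)]
        if sel.size == 0:
            continue
        conf = sms.DescrStatsW(sel[:, 1]).tconfint_mean(alpha=alpha)
        conf_n = sms.DescrStatsW(sel[:, 2]).tconfint_mean(alpha=alpha)
        rows.append([upper - binwidth / 2, sel[:, 1].mean(), sel[:, 2].mean(),
                     conf[0], conf[1], conf_n[0], conf_n[1], len(sel)])
    return np.array(rows)
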
plt.ylabel('Median value of owner-occupied homes in $1000s')
plt.show()

print('\n\nPart 2\n---------------------------')
print("The null hypothesis test asks how often we could get the observed data by chance alone.")
print("Conventionally, we reject the null hypothesis if such a result would occur by chance less than 5% of the time.")
dfchas1 = df.MEDV[df.CHAS == 1]
dfchas0 = df.MEDV[df.CHAS == 0]
ttest, pval = stats.ttest_ind(dfchas1, dfchas0)
print('P-val:', pval, 't-test value:', ttest)
means = ws.CompareMeans(ws.DescrStatsW(dfchas1), ws.DescrStatsW(dfchas0))
confint = means.tconfint_diff(alpha=0.05, alternative='two-sided', usevar='unequal')
print('Confidence interval:', confint[0], confint[1])
ratio = len(dfchas0) / len(dfchas1)
gsize = tt_ind_solve_power(effect_size=0.6, nobs1=None, alpha=0.05,
                           power=0.8, ratio=ratio, alternative='two-sided')
print('Assume an effect size (Cohen\u2019s d) of 0.6. If you want 80% power, what group size is necessary?', gsize)
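
# Editor's sketch verifying the power-calculation round-trip used above:
# solving for power with the group size returned by tt_ind_solve_power should
# give back ~0.80 (here with ratio=1.0 for simplicity).
from statsmodels.stats.power import tt_ind_solve_power

n1 = tt_ind_solve_power(effect_size=0.6, nobs1=None, alpha=0.05,
                        power=0.8, ratio=1.0, alternative='two-sided')
power = tt_ind_solve_power(effect_size=0.6, nobs1=n1, alpha=0.05,
                           power=None, ratio=1.0, alternative='two-sided')
print(n1, power)  # ~44.6 per group, power ~0.80
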