Example #1
from collections import defaultdict

import statsmodels.stats.weightstats as ssw  # assumed alias for the module providing DescrStatsW
from tqdm import tqdm


def simulate(agent, env, start=None, episodes=1_000_000):
    """
    Simulate an agent in an environment over a number of episodes.

    Args:
        agent (object): an agent with an act(obs, reward, done) method.
        env (object): an OpenAI Gym environment.
        start (object): an optional state passed to env.explore() to start each episode from;
            if None, each episode starts from env.reset(). Defaults to None.
        episodes (int): the number of episodes to simulate. Defaults to 1,000,000.

    Returns:
        A DescrStatsW object with the episodic reward distribution.

    Notes:
        A progress bar is shown.
    """
    hist = defaultdict(int)
    for _ in tqdm(range(episodes)):
        total = 0.
        obs = env.reset() if start is None else env.explore(start)
        reward, done = 0., False
        while True:
            action = agent.act(obs, reward, done)
            obs, reward, done, _ = env.step(action)
            total += reward
            if done:
                hist[total] += 1
                break
    stats = ssw.DescrStatsW(
        data=list(hist.keys()),
        weights=list(hist.values())
    )
    return stats
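A minimal usage sketch of the returned object (the histogram values here are invented for illustration): DescrStatsW treats each distinct episodic reward as one observation weighted by its count, so the weighted mean, standard deviation, and confidence interval summarize the whole simulation.

import statsmodels.stats.weightstats as ssw

# hypothetical episodic-reward histogram: {total reward: number of episodes}
hist = {-1.0: 520, 0.0: 90, 1.0: 380, 1.5: 10}
stats = ssw.DescrStatsW(data=list(hist.keys()), weights=list(hist.values()))
print(stats.mean)                       # weighted mean reward per episode
print(stats.std)                        # weighted standard deviation
print(stats.tconfint_mean(alpha=0.05))  # 95% confidence interval for the mean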
Example #2
import pandas as pd
import scipy.stats as ss  # assumed alias: scipy.stats
import statsmodels.stats.weightstats as ws  # assumed alias: statsmodels weighted statistics


def t_test_two_samp(a, b, alpha, alternative='two-sided'):
    diff = a.mean() - b.mean()

    # note: ss.ttest_ind defaults to the pooled-variance (equal_var=True) t-test, and the
    # `alternative` argument is applied only to the Welch-style confidence interval below
    res = ss.ttest_ind(a, b)

    means = ws.CompareMeans(ws.DescrStatsW(a), ws.DescrStatsW(b))
    confint = means.tconfint_diff(alpha=alpha,
                                  alternative=alternative,
                                  usevar='unequal')
    degfree = means.dof_satt()

    index = [
        'DegFreedom', 'Difference', 'Statistic', 'PValue', 'Low95CI',
        'High95CI'
    ]
    return pd.Series([degfree, diff, res[0], res[1], confint[0], confint[1]],
                     index=index)
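A hedged usage sketch of the function above (sample data invented for illustration); `a` and `b` only need to be array-likes with a .mean() method.

import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(loc=5.0, scale=1.0, size=40)
b = rng.normal(loc=4.5, scale=1.5, size=35)

# returns degrees of freedom, mean difference, t-statistic, p-value, and the CI bounds
print(t_test_two_samp(a, b, alpha=0.05))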
Example #3
def test_ttest():
    x1, x2 = clinic[:15, 2], clinic[15:, 2]
    all_tests = []
    t1 = smws.ttest_ind(x1, x2, alternative='larger', usevar='unequal')
    all_tests.append((t1, ttest_clinic_indep_1_g))
    t2 = smws.ttest_ind(x1, x2, alternative='smaller', usevar='unequal')
    all_tests.append((t2, ttest_clinic_indep_1_l))
    t3 = smws.ttest_ind(x1,
                        x2,
                        alternative='smaller',
                        usevar='unequal',
                        value=1)
    all_tests.append((t3, ttest_clinic_indep_1_l_mu))

    for res1, res2 in all_tests:
        assert_almost_equal(res1[0], res2.statistic, decimal=13)
        assert_almost_equal(res1[1], res2.p_value, decimal=13)
        #assert_almost_equal(res1[2], res2.df, decimal=13)

    cm = smws.CompareMeans(smws.DescrStatsW(x1), smws.DescrStatsW(x2))
    ci = cm.tconfint_diff(alternative='two-sided', usevar='unequal')
    assert_almost_equal(ci, ttest_clinic_indep_1_two_mu.conf_int, decimal=13)
    ci = cm.tconfint_diff(alternative='two-sided', usevar='pooled')
    assert_almost_equal(ci,
                        ttest_clinic_indep_1_two_mu_pooled.conf_int,
                        decimal=13)
    ci = cm.tconfint_diff(alternative='smaller', usevar='unequal')
    assert_almost_equal_inf(ci, ttest_clinic_indep_1_l.conf_int, decimal=13)
    ci = cm.tconfint_diff(alternative='larger', usevar='unequal')
    assert_almost_equal_inf(ci, ttest_clinic_indep_1_g.conf_int, decimal=13)

    #test get_compare
    cm = smws.CompareMeans(smws.DescrStatsW(x1), smws.DescrStatsW(x2))
    cm1 = cm.d1.get_compare(cm.d2)
    cm2 = cm.d1.get_compare(x2)
    cm3 = cm.d1.get_compare(np.hstack((x2, x2)))
    #all use the same d1, no copying
    assert_(cm.d1 is cm1.d1)
    assert_(cm.d1 is cm2.d1)
    assert_(cm.d1 is cm3.d1)
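For orientation, a standalone sketch of the API this test exercises (toy data, not the clinic dataset): ttest_ind returns the t-statistic, p-value, and degrees of freedom, and CompareMeans.tconfint_diff gives the matching confidence interval for the mean difference.

import numpy as np
import statsmodels.stats.weightstats as smws

rng = np.random.default_rng(1)
x1 = rng.normal(10.0, 2.0, size=15)
x2 = rng.normal(11.0, 2.5, size=15)

tstat, pvalue, dof = smws.ttest_ind(x1, x2, alternative='two-sided', usevar='unequal')
cm = smws.CompareMeans(smws.DescrStatsW(x1), smws.DescrStatsW(x2))
low, upp = cm.tconfint_diff(alternative='two-sided', usevar='unequal')
print(tstat, pvalue, dof, (low, upp))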
Example #4
    def setup_class(cls):
        cls.res2 = tost_clinic_paired_1
        x1, x2 = clinic[:15, 2], clinic[15:, 2]
        cls.res1 = Holder()
        res = smws.ttost_paired(x1, x2, -0.6, 0.6, transform=None)
        cls.res1.pvalue = res[0]
        #cls.res1.df = res[1][-1] not yet
        res_ds = smws.DescrStatsW(x1 - x2, weights=None, ddof=0)
        #tost confint 2*alpha TODO: check again
        cls.res1.tconfint_diff = res_ds.tconfint_mean(0.1)
        cls.res1.confint_05 = res_ds.tconfint_mean(0.05)
        cls.res1.mean_diff = res_ds.mean
        cls.res1.std_mean_diff = res_ds.std_mean

        cls.res2b = ttest_clinic_paired_1
Example #5
    def __init__(self):
        self.res2 = tost_clinic_paired_1
        x1, x2 = clinic[:15, 2], clinic[15:, 2]
        self.res1 = Holder()
        # note: tost_paired/confint_mean appear to be the older statsmodels names for
        # what example #4 calls ttost_paired/tconfint_mean
        res = smws.tost_paired(x1, x2, -0.6, 0.6, transform=None)
        self.res1.pvalue = res[0]
        #self.res1.df = res[1][-1] not yet
        res_ds = smws.DescrStatsW(x1 - x2, weights=None, ddof=0)
        #tost confint 2*alpha TODO: check again
        self.res1.confint_diff = res_ds.confint_mean(0.1)
        self.res1.confint_05 = res_ds.confint_mean(0.05)
        self.res1.mean_diff = res_ds.mean
        self.res1.std_mean_diff = res_ds.std_mean

        self.res2b = ttest_clinic_paired_1
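Examples #4 and #5 set up the same paired equivalence test; #5 simply uses the older method names. A minimal standalone sketch with the current names (toy data):

import numpy as np
import statsmodels.stats.weightstats as smws

rng = np.random.default_rng(2)
x1 = rng.normal(3.0, 0.4, size=15)
x2 = x1 + rng.normal(0.0, 0.3, size=15)

# TOST: is the paired mean difference equivalent to zero within (-0.6, 0.6)?
pvalue, res_low, res_upp = smws.ttost_paired(x1, x2, -0.6, 0.6, transform=None)
ci_90 = smws.DescrStatsW(x1 - x2).tconfint_mean(alpha=0.1)  # the 2*alpha interval TOST relies on
print(pvalue, ci_90)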
Example #6
def play(env, episodes=100, hint=False):
    """
    Play an interactive session of blackjack.

    Args:
        env (object): an OpenAI Gym blackjack environment (e.g. 'Blackjack-v1').
        episodes (int): the number of episodes to play. Defaults to 100.
        hint (bool): whether to show the Basic Strategy (Thorp, 1966) as a hint. Defaults to False.

    Returns:
        A DescrStatsW object with the episodic reward distribution.

    Notes:
        The user can input either 0/1 or s/h (for stand/hit) as actions.
        If no valid input is entered, the default action is either 0 (if hint=False) or the basic strategy (if hint=True).
    """
    if hint:
        agent = BasicStrategyAgent(env)
    hist = defaultdict(int)
    for _ in range(episodes):
        total = 0.
        obs, reward, done = env.reset(), 0., False
        while True:
            print(env.render(), end=' ')
            if hint:
                a0 = agent.act(obs, reward, done)
                k = input(f'action: [hint: {action_labels[a0].lower()}] ')
            else:
                a0 = 0
                k = input('action: ')
            try:
                # numeric input: '0'/'1' map directly onto an Action value
                a = Action(int(k))
            except ValueError:
                try:
                    # letter input: map the label's position onto the corresponding Action
                    a = Action(action_labels.index(k.upper()))
                except ValueError:
                    # no valid input: fall back to the default action
                    a = a0
            obs, reward, done, _ = env.step(a)
            total += reward
            if done:
                hist[total] += 1
                print(env.render())
                break
    stats = ssw.DescrStatsW(data=list(hist.keys()),
                            weights=list(hist.values()))
    return stats
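The snippet assumes a project-level Action enum, an action_labels sequence, and a BasicStrategyAgent defined elsewhere; a purely hypothetical definition consistent with the 0/1 and s/h input handling might look like this:

from enum import IntEnum

class Action(IntEnum):  # hypothetical; the real enum lives in the original project
    STAND = 0
    HIT = 1

# label positions match the Action values, so '0'/'1' and 's'/'h' resolve to the same action
action_labels = ['S', 'H']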
Example #7
            'rsq': new_rsq[indices4plot]
        })
    # sort values by eccentricity
    df = df.sort_values(by=['ecc'])

    bin_size = int(len(df) / n_bins)  #divide in equally sized bins
    mean_ecc = []
    mean_ecc_std = []
    mean_size = []
    mean_size_std = []
    # for each bin calculate rsq-weighted means and standard errors of binned ecc/size
    for j in range(n_bins):
        df_bin = df[bin_size * j:bin_size * (j + 1)]
        size_stats = weightstats.DescrStatsW(df_bin['size'], weights=df_bin['rsq'])
        ecc_stats = weightstats.DescrStatsW(df_bin['ecc'], weights=df_bin['rsq'])
        mean_size.append(size_stats.mean)
        mean_size_std.append(size_stats.std_mean)
        mean_ecc.append(ecc_stats.mean)
        mean_ecc_std.append(ecc_stats.std_mean)

    if idx == 0:
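A self-contained sketch of the binning pattern above (synthetic data; the column names follow the fragment): each eccentricity bin gets an rsq-weighted mean size and its weighted standard error.

import numpy as np
import pandas as pd
from statsmodels.stats import weightstats

rng = np.random.default_rng(3)
df = pd.DataFrame({'ecc': rng.uniform(0, 10, 200),
                   'size': rng.uniform(0.5, 5.0, 200),
                   'rsq': rng.uniform(0.0, 1.0, 200)}).sort_values(by=['ecc'])

n_bins = 10
bin_size = int(len(df) / n_bins)
for j in range(n_bins):
    df_bin = df.iloc[bin_size * j:bin_size * (j + 1)]
    stats = weightstats.DescrStatsW(df_bin['size'], weights=df_bin['rsq'])
    print(stats.mean, stats.std_mean)  # rsq-weighted mean size and its standard error per bin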
Example #8
# specifying the timing of fMRI frames
frame_times = TR * (np.arange(data.shape[-1]))
# Create the design matrix, hrf model containing Glover model
design_matrix = make_first_level_design_matrix(frame_times,
                                               events=events_avg,
                                               hrf_model='glover')

regressors = np.array(design_matrix.keys()).astype(str)
# make and save average beta weights for different ROIs
for idx, roi in enumerate(ROIs):
    # plot beta weights for single voxel
    fig, axis = plt.subplots(1, figsize=(25, 7.5), dpi=100)

    # get the average values
    # weight voxels based on rsq of fit
    beta_avg = weightstats.DescrStatsW(betas[roi_verts[roi]],
                                       weights=rsq[roi_verts[roi]]).mean

    # need to do this to get weighted standard deviations of the mean for each regressor
    beta_std = []
    for w in range(len(regressors)):
        beta_pred = np.array([
            betas[roi_verts[roi]][x][w]
            for x in range(len(betas[roi_verts[roi]]))
        ])
        beta_std.append(
            weightstats.DescrStatsW(beta_pred,
                                    weights=rsq[roi_verts[roi]]).std_mean)

    y_pos = np.arange(len(regressors))
    plt.bar(y_pos, beta_avg, yerr=np.array(beta_std), align='center')
    plt.xticks(y_pos, regressors)
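A toy sketch of the weighting step above: DescrStatsW accepts a 2-D array (here voxels x regressors) with one weight per row, so the rsq-weighted mean and its standard error come out per regressor.

import numpy as np
from statsmodels.stats import weightstats

rng = np.random.default_rng(4)
betas = rng.normal(size=(50, 4))      # hypothetical: 50 voxels x 4 regressors
rsq = rng.uniform(0.0, 1.0, size=50)  # fit quality per voxel, used as weights

stats = weightstats.DescrStatsW(betas, weights=rsq)
print(stats.mean)      # rsq-weighted mean beta, one value per regressor
print(stats.std_mean)  # weighted standard error of the mean per regressor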
Example #9
def test_mv_mean():
    # names = ['id', 'mpg1', 'mpg2', 'add']
    x = np.asarray([[1.0, 24.0, 23.5, 1.0], [2.0, 25.0, 24.5, 1.0],
                    [3.0, 21.0, 20.5, 1.0], [4.0, 22.0, 20.5, 1.0],
                    [5.0, 23.0, 22.5, 1.0], [6.0, 18.0, 16.5, 1.0],
                    [7.0, 17.0, 16.5, 1.0], [8.0, 28.0, 27.5, 1.0],
                    [9.0, 24.0, 23.5, 1.0], [10.0, 27.0, 25.5, 1.0],
                    [11.0, 21.0, 20.5, 1.0], [12.0, 23.0, 22.5, 1.0],
                    [1.0, 20.0, 19.0, 0.0], [2.0, 23.0, 22.0, 0.0],
                    [3.0, 21.0, 20.0, 0.0], [4.0, 25.0, 24.0, 0.0],
                    [5.0, 18.0, 17.0, 0.0], [6.0, 17.0, 16.0, 0.0],
                    [7.0, 18.0, 17.0, 0.0], [8.0, 24.0, 23.0, 0.0],
                    [9.0, 20.0, 19.0, 0.0], [10.0, 24.0, 22.0, 0.0],
                    [11.0, 23.0, 22.0, 0.0], [12.0, 19.0, 18.0, 0.0]])

    res = smmv.test_mvmean(x[:, 1:3], [21, 21])

    res_stata = Holder(p_F=1.25062334808e-09,
                       df_r=22,
                       df_m=2,
                       F=59.91609589041116,
                       T2=125.2791095890415)

    assert_allclose(res.statistic, res_stata.F, rtol=1e-10)
    assert_allclose(res.pvalue, res_stata.p_F, rtol=1e-10)
    assert_allclose(res.t2, res_stata.T2, rtol=1e-10)
    assert_equal(res.df, [res_stata.df_m, res_stata.df_r])

    # diff of paired sample
    mask = x[:, -1] == 1
    x1 = x[mask, 1:3]
    x0 = x[~mask, 1:3]
    res_p = smmv.test_mvmean(x1 - x0, [0, 0])

    # result Stata hotelling
    res_stata = Holder(
        T2=9.698067632850247,
        df=10,
        k=2,
        N=12,
        F=4.4082126,  # not in return List
        p_F=0.0424)  # not in return List

    res = res_p
    assert_allclose(res.statistic, res_stata.F, atol=5e-7)
    assert_allclose(res.pvalue, res_stata.p_F, atol=5e-4)
    assert_allclose(res.t2, res_stata.T2, rtol=1e-10)
    assert_equal(res.df, [res_stata.k, res_stata.df])

    # mvtest means diff1 diff2, zero
    res_stata = Holder(p_F=.0423949782937231,
                       df_r=10,
                       df_m=2,
                       F=4.408212560386478,
                       T2=9.69806763285025)

    assert_allclose(res.statistic, res_stata.F, rtol=1e-12)
    assert_allclose(res.pvalue, res_stata.p_F, rtol=1e-12)
    assert_allclose(res.t2, res_stata.T2, rtol=1e-12)
    assert_equal(res.df, [res_stata.df_m, res_stata.df_r])

    dw = weightstats.DescrStatsW(x)
    ci0 = dw.tconfint_mean(alpha=0.05)

    nobs = len(x[:, 1:])
    ci1 = confint_mvmean_fromstats(dw.mean,
                                   np.diag(dw.var),
                                   nobs,
                                   lin_transf=np.eye(4),
                                   alpha=0.05)
    ci2 = confint_mvmean_fromstats(dw.mean,
                                   dw.cov,
                                   nobs,
                                   lin_transf=np.eye(4),
                                   alpha=0.05)

    assert_allclose(ci1[:2], ci0, rtol=1e-13)
    assert_allclose(ci2[:2], ci0, rtol=1e-13)

    # test from data
    res = smmv.confint_mvmean(x, lin_transf=np.eye(4), alpha=0.05)
    assert_allclose(res, ci2, rtol=1e-13)
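A compact sketch of the multivariate API checked above (toy data): test_mvmean runs Hotelling's T-squared test against a hypothesized mean vector, and confint_mvmean returns confidence intervals for linear combinations of the means.

import numpy as np
import statsmodels.stats.multivariate as smmv

rng = np.random.default_rng(5)
x = rng.normal(loc=[21.0, 20.0], scale=3.0, size=(24, 2))

res = smmv.test_mvmean(x, [21, 21])
print(res.statistic, res.pvalue, res.t2, res.df)

ci = smmv.confint_mvmean(x, lin_transf=np.eye(2), alpha=0.05)
print(ci)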
Example #10
def RunSimulation2(weekday="Monday", weekend="Saturday"):
    # Initialise list of optimal solution routes
    weekRoutes = pd.read_csv("Data" + sep + "Routes" + sep + "optimalRoutes" + weekday + ".csv", converters = {"Route": literal_eval})
    wkndRoutes = pd.read_csv("Data" + sep + "Routes" + sep + "optimalRoutes" + weekend + ".csv", converters = {"Route": literal_eval})

    # Get cleaned dataframe
    demandData = pd.read_csv("Data" + sep + "demandData.csv")
    locationData = pd.read_csv("Data" + sep + "FoodstuffLocations.csv")
    demandData = clean_data(demandData,locationData)
    locationData.loc[locationData["Supermarket"] == "Fresh Collective Alberton", "Type"] = "Four Square"
    [a,b,c] = setupbootstrap(demandData,wknd=False)
    [d,e,f] = setupbootstrap(demandData,wknd=True)

    # 1000 runs for monte carlo simulation for weekday
    wopt = []
    opt = []
    for i in range(1000):
        # Import travel times between supermarkets + demand predictions per store
        opt.append(simulate(weekRoutes, locationData, a, b, c, wknd=False))
        wopt.append(simulate(wkndRoutes, locationData, d, e, f, wknd=True))

    # Seaborn plot 
    ax = sns.distplot(wopt, bins=100)
    ax.set_title("Weekend Optimal Solution")
    ax.set_xlabel("Optimal Solution Value ($)")
    ax.set_ylabel("Probablity")
    plt.savefig("Pictures/wopt.png")
    plt.show()
    ax = sns.distplot(opt, bins=100)
    ax.set_title("Weekday Optimal Solution")
    ax.set_xlabel("Optimal Solution Value ($)")
    ax.set_ylabel("Probablity")
    plt.savefig("Pictures/opt.png")
    plt.show()

    # Calculate confidence intervals 
    optLower, optUpper = sms.DescrStatsW(opt).tconfint_mean(alpha = 0.05)
    woptLower, woptUpper = sms.DescrStatsW(wopt).tconfint_mean(alpha = 0.05)

    print("Mean and CI of Weekday")
    print(statistics.mean(opt))
    print(optLower, optUpper)

    print("Mean and CI of Weekend")
    print(statistics.mean(wopt))
    print(woptLower, woptUpper)

    # Sorting and getting prediction intervals for the optimal solution
    opt.sort()
    wopt.sort()

    # Histograms for optimal solutions, on weekday and weekend
    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,10))
    ax1.hist(opt, density=True, bins = 100, histtype='stepfilled', alpha=0.2)
    ax1.axvline(x=optLower, color='r', linewidth=2)
    ax1.axvline(x=optUpper, color='r', linewidth=2)
    ax1.axvline(x=opt[25], color='b', linewidth=2)
    ax1.axvline(x=opt[975], color='b', linewidth=2)
    ax1.set_title("Weekday Optimal Solution")
    ax1.set_xlabel("Optimal Solution Value ($)")
    ax1.set_ylabel("Probablity")
    ax2.hist(wopt, density=True, bins = 100, histtype='stepfilled', alpha=0.2)
    ax2.axvline(x=woptLower, color='r', linewidth=2)
    ax2.axvline(x=woptUpper, color='r', linewidth=2)
    ax2.axvline(x=wopt[25], color='b', linewidth=2)
    ax2.axvline(x=wopt[975], color='b', linewidth=2)
    ax2.set_title("Weekend Optimal Solution")
    ax2.set_xlabel("Optimal Solution Value ($)")
    ax2.set_ylabel("Probablity")
    plt.savefig("Pictures/Simulation.png")
    plt.show()

    # One-sample t-test
    tstat, pval = stats.ttest_1samp(opt,popmean=opt[500]) # Weekday
    print("The t-statistic is {}, and the p-value for weekdays is {}.".format(tstat,pval))
    tstat, pval = stats.ttest_1samp(wopt,popmean=wopt[500]) # Weekend
    print("The t-statistic is {}, and the p-value for weekends is {}.".format(tstat,pval))
Example #11
def gather_AAIMONs(pathdict, logging, s):
    """ Gathers the AAIMON ratios and slopes for each protein, created by the run_calculate_AAIMONs scripts.

    To be compatible with multiprocessing, the run_calculate_AAIMONs script creates a separate output summary file
    for each protein. gather_AAIMONs simply concatenates all of these files together.

    Note that the gather_AAIMONs script does not do ANY filtering. This is all done earlier by run_calculate_AAIMONs.
    It is assumed that for homologues that did not pass the filter (e.g., because they had X in the sequence),
    no AAIMON or AAIMON slope was calculated.

    Parameters
    ----------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
    logging : logging.Logger
        Logger for printing to console and logfile.
    s : dict
        Settings dictionary extracted from excel settings file.

    Saved Files and Figures
    -----------------------
    list_cr_summary_csv : csv
        comma separated csv file with the AAIMON ratios etc
        contains all data within the {}_cr_mean.csv summary file for each protein
    pretty_alignments_csv : csv
        comma separated csv file with the pretty alignments of all the outliers
    data_characterising_each_homol_TMD.pickle : pickle
        Raw AAIMON and % identity (or % aa sub rate) datapoints for all TM of all homologues of all proteins
        Used to create large scatterplot of all datapoints.

    Returns
    -------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
        In special cases, the pathdict is modified.
    """
    logging.info(
        "~~~~~~~~~~~~                           starting gather_AAIMONs                      ~~~~~~~~~~~~"
    )
    df = pd.read_csv(pathdict["list_csv"],
                     sep=",",
                     quoting=csv.QUOTE_NONNUMERIC,
                     index_col=0)
    # drop any proteins without a list of TMDs
    df = df.loc[df['list_of_TMDs'].notnull()].loc[df['list_of_TMDs'] != 'nan']
    # convert list_of_TMDs from string to python list
    df['list_of_TMDs'] = df.list_of_TMDs.apply(lambda x: ast.literal_eval(x))

    ###############################################################
    #                                                             #
    #                        Filter keywords                      #
    #                                                             #
    ###############################################################
    if s['filter_keywords_in_gather']:
        # filter list file by keywords for exclusion analysis, e.g. enzyme only
        list_number = s['list_number']
        # specify allowed and disallowed keywords
        allowed_KW = ast.literal_eval(s['gather_filter_allowed_keywords'])
        disallowed_KW = ast.literal_eval(s['gather_filter_forbidden_keywords'])
        # generate new pathdict
        base_filename_summaries = os.path.join(
            s["data_dir"], "summaries", '%02d' % list_number,
            'List%02d_filtered' % list_number, ' - '.join(allowed_KW),
            'List%02d' % list_number)
        pathdict = korbinian.common.create_pathdict(base_filename_summaries, s)
        # create new folder with new pathdict
        if not os.path.exists(base_filename_summaries[:-7]):
            os.makedirs(base_filename_summaries[:-7])

        # copy keyword column, apply ast.literal_eval to the copied column
        df['KW'] = df['uniprot_KW']
        # apply ast.literal_eval to every item in df['uniprot_KW']
        if isinstance(df['KW'].iloc[0], str):
            df['KW'] = df['KW'].apply(lambda x: ast.literal_eval(x))
        # get list of enzyme keywords
        list_enzyme_KW, list_ignored_KW, PFAM_dict = korbinian.cons_ratio.keywords.get_list_enzyme_KW_and_list_ignored_KW(
        )
        # create column per allowed keyword that holds a bool if keyword is present in that protein
        for KW in allowed_KW:
            if KW == 'Enzyme':
                df['Enzyme'] = df['KW'].apply(korbinian.cons_ratio.keywords.
                                              KW_list_contains_any_desired_KW,
                                              args=(list_enzyme_KW, ))
            else:
                df[KW] = df['KW'].apply(korbinian.cons_ratio.keywords.
                                        KW_list_contains_any_desired_KW,
                                        args=([KW], ))
        # create column for every protein holding bool if protein contains at least one of the allowed keywords
        for acc in df.index:
            df.loc[acc, 'keep'] = df.loc[acc, allowed_KW].any()
        # drop all proteins whose keywords do not match the requirements
        df = df.loc[df['keep'] == True]

        # drop all proteins that contain one of the disallowed keywords
        for KW in disallowed_KW:
            if KW == 'Enzyme':
                df['Enzyme'] = df['KW'].apply(korbinian.cons_ratio.keywords.
                                              KW_list_contains_any_desired_KW,
                                              args=(list_enzyme_KW, ))
            else:
                df[KW] = df['KW'].apply(korbinian.cons_ratio.keywords.
                                        KW_list_contains_any_desired_KW,
                                        args=([KW], ))
            df = df.loc[df[KW] == False]
        # remove copied and edited keyword list
        df = df.drop(columns='KW')

        df.to_csv(pathdict["list_csv"], sep=",", quoting=csv.QUOTE_NONNUMERIC)

    #############################################################################
    #                                                                           #
    #       Collate all the "_cr_mean.csv" files into a single dataframe        #
    #                                                                           #
    #############################################################################

    dfg = pd.DataFrame()
    # iterate over the dataframe for proteins with an existing list_of_TMDs
    for acc in df.index:
        protein_name = df.loc[acc, 'protein_name']
        #logging.info(protein_name)
        sys.stdout.write("{}, ".format(acc)), sys.stdout.flush()
        if not os.path.exists(df.loc[acc, 'homol_cr_ratios_zip']):
            logging.info(
                "{} skipped. homol_cr_ratios_zip does not exist".format(acc))
            continue

        if utils.file_is_old(df.loc[acc, 'homol_cr_ratios_zip'],
                             s["oldest_acceptable_file_date"]):
            os.remove(df.loc[acc, 'homol_cr_ratios_zip']),
            logging.info(
                "{} skipped, file is old and has been deleted".format(acc))
            continue

        # open csv as pandas dataframe (note, it was originally a series, and contains only one column and an index)
        # set delete_corrupt=True so that if the expected csv is not in the zip, the whole zip file will be deleted
        mean_ser_filename = "{}_cr_mean.csv".format(acc)
        mean_ser = utils.open_df_from_csv_zip(df.loc[acc,
                                                     'homol_cr_ratios_zip'],
                                              filename=mean_ser_filename,
                                              delete_corrupt=True)
        dfg = pd.concat([dfg, mean_ser], axis=1)

    if dfg.empty:
        raise ValueError(
            "\n\ndfg is an empty dataframe.\nThis means that none of the proteins had any correctly processed conservation ratios.\nSuggest checking the output of all previous steps."
        )

    # transpose dataframe (flip index and columns)
    dfg = dfg.T.copy()

    # for the OMPdb dataset, there is no uniprot_entry_name
    uniprot_entry_name_in_df = "uniprot_entry_name" in df.columns
    if not uniprot_entry_name_in_df:
        dfg['uniprot_entry_name'] = "OMPdb_dataset"

    # drop any proteins in dfg without a list of TMDs
    dfg = dfg.loc[df['list_of_TMDs'].notnull()].loc[
        dfg['list_of_TMDs'] != 'nan']

    # if the list_of_TMDs is a stringlist, convert to a python list
    dfg['list_of_TMDs'] = dfg['list_of_TMDs'].dropna().apply(
        lambda x: ast.literal_eval(x))

    # # for singlepass datasets, leave row blank by default
    # dfg['AAIMON_slope_central_TMDs'] = np.nan

    # CONVERT STRINGS TO FLOATS FOR SELECTED COLUMNS
    # note that after saving dfg to CSV, pandas then gets the dtype correct upon reopening for figs.py etc
    cols_to_convert = [
        "AAIMON_mean_all_TM_res", "AAIMON_n_mean_all_TM_res",
        "AAIMON_slope_all_TM_res", "AAIMON_n_slope_all_TM_res",
        'AAIMON_n_homol'
    ]
    for col in cols_to_convert:
        dfg[col] = pd.to_numeric(dfg[col])
    # print out mean AAIMON values in dataset
    mean_AAIMON_in_dataset = dfg['AAIMON_mean_all_TM_res'].mean()
    mean_AAIMON_n_in_dataset = dfg['AAIMON_n_mean_all_TM_res'].mean()
    mean_AAIMON_slope_in_dataset = dfg['AAIMON_slope_all_TM_res'].mean()
    mean_AAIMON_n_slope_in_dataset = dfg['AAIMON_n_slope_all_TM_res'].mean()
    sys.stdout.write('\n\nmean AAIMON         in dataset: {a:.05f}'
                     '\nmean AAIMON_n       in dataset: {b:.05f}'
                     '\nmean AAIMON_slope   in dataset: {c:.07f}'
                     '\nmean AAIMON_n_slope in dataset: {d:.07f}\n'.format(
                         a=mean_AAIMON_in_dataset,
                         b=mean_AAIMON_n_in_dataset,
                         c=mean_AAIMON_slope_in_dataset,
                         d=mean_AAIMON_n_slope_in_dataset))

    dfg.to_csv(pathdict["list_cr_summary_csv"],
               sep=",",
               quoting=csv.QUOTE_NONNUMERIC)

    ########################################################################################
    #                                                                                      #
    #                    Save a huge dataframe with the AAIMONs for                        #
    #                    all homologues of all TMDs of all proteins                        #
    #                                                                                      #
    ########################################################################################

    if s['save_df_characterising_each_homol_TMD']:

        # defining cutoff for max and min number of homologues for each protein
        min_num_homologues = s['min_homol']

        # filter summary file for min and max number of homologues based on TM01 number of homologues
        #sys.stdout.write('Dropped homologues after filtering: \n')
        list_of_acc_to_keep = []
        for acc in dfg.index:
            AAIMON_n_homol = dfg.loc[acc, 'AAIMON_n_homol']
            if AAIMON_n_homol > min_num_homologues:
                list_of_acc_to_keep.append(acc)

        # keep only proteins that have the desired number of homologues
        dfg = dfg.loc[list_of_acc_to_keep, :]
        df = df.loc[list_of_acc_to_keep, :]

        # # convert from string to python list
        # if isinstance(dfg['list_of_TMDs'][0], str):
        #     dfg['list_of_TMDs'] = dfg['list_of_TMDs'].dropna().apply(lambda x: ast.literal_eval(x))

        #sys.stdout.write("\nLoading data\n")
        # initiate empty numpy array
        data = np.empty([0, 3])
        # navigate through filesystem and open pickles from .zip
        n = 0
        for acc in dfg.index:
            n += 1
            if n % 20 == 0:
                sys.stdout.write('.'), sys.stdout.flush()
                if n % 600 == 0:
                    sys.stdout.write('\n'), sys.stdout.flush()
            protein_name = df.loc[acc, "protein_name"]
            homol_cr_ratios_zip = df.loc[acc, "homol_cr_ratios_zip"]

            # Here we filter to take only datapoints where all TMDs were in the alignment
            AAIMON_all_TMD = protein_name + '_AAIMON_all_TMD.csv'
            df_AAIMON_all_TMD = utils.open_df_from_csv_zip(
                homol_cr_ratios_zip,
                filename=AAIMON_all_TMD,
                delete_corrupt=False)

            ########################################################################################
            #                                                                                      #
            #       CODE COPIED FROM cons_ratio.py. Delete the following two lines after           #
            #         re-running all calculated cons ratios                                         #
            #                                                                                      #
            ########################################################################################
            # first get a list of all the homologues that have AAIMON ratios for all TMDs
            df_AAIMON_all_TMD[
                "AAIMON_avail_all_TMDs"] = df_AAIMON_all_TMD.n_TMDs_with_measurable_AAIMON == df.loc[
                    acc, "number_of_TMDs"]
            filt_index = df_AAIMON_all_TMD.loc[
                df_AAIMON_all_TMD["AAIMON_avail_all_TMDs"] ==
                True].index.tolist()
            #filt_index = [int(x) for x in filt_index]

            if not os.path.isfile(homol_cr_ratios_zip):
                # skip to next protein
                continue
            for TMD in df.loc[acc, "list_of_TMDs"]:
                # generate column names necessary for current file
                columns = [
                    'obs_changes', '{}_AAIMON'.format(TMD),
                    '{}_AAIMON_n'.format(TMD)
                ]
                # Open pickle file with conservation-ratios.
                # NOTE that these have already been filtered according to cons_ratio.py.
                # if the homologues were not acceptable, AAIMON ratios WERE NOT CALCULATED
                TM_cr_pickle = "{}_{}_cr_df.pickle".format(protein_name, TMD)
                # open dataframe  with function from korbinian, extract required columns, convert to np array
                df_TMD = utils.open_df_from_pickle_zip(homol_cr_ratios_zip,
                                                       TM_cr_pickle)
                if columns[2] not in df_TMD.columns:
                    # file is old, and should be deleted
                    #os.remove(homol_cr_ratios_zip)
                    logging.info(
                        "{} file is presumed out of date, and has been deleted"
                        .format(homol_cr_ratios_zip))
                    os.remove(homol_cr_ratios_zip)
                    # skip to next protein
                    break

                if set(filt_index).intersection(set(df_TMD.index)) == set():
                    # there is a mismatch between the filt_index for df_AAIMON_all_TMD, and the columns in df_TMD
                    # replace filt_index with empty list
                    logging.warning(
                        "Indexing Error in gather script. set(filt_index).intersection(set(df_TMD.index)) == set(). Try re-running calculate_AAIMON_ratios"
                    )
                    filt_index = []
                # use the filt_index above that shows homologues with AAIMON available for all TMDs
                df_TMD = df_TMD.loc[filt_index, :]
                # convert the selected columns to a numpy array
                df_TMD = df_TMD[columns].to_numpy()
                # join output data file with currently opened dataframe
                data = np.concatenate((data, df_TMD))
        # drop every row with nan
        data = data[~np.isnan(data).any(axis=1)]

        # create bins, calculate mean and confidence interval in bin - use multiprocessing if possible
        sys.stdout.write('\nBinning data - calculating confidence interval\n')
        number_of_bins = s['specify_number_of_bins_characterising_TMDs']
        # process confidence interval value to appropriate input format for function
        confidence_interval = (100 - s['CI']) / 100
        linspace_binlist = np.linspace(1, 100, number_of_bins)
        binned_data = np.empty([0, 8])
        binwidth = 100 / number_of_bins
        for percentage in linspace_binlist:
            bin_for_mean = data[(percentage >= data[:, 0])
                                & (data[:, 0] > percentage - binwidth)]
            if bin_for_mean.size != 0:
                # calculate conf. interv. in bin, alpha describes the significance level in the style 1-alpha
                conf = sms.DescrStatsW(
                    bin_for_mean[:,
                                 1]).tconfint_mean(alpha=confidence_interval)
                # calculate conf. interv. in bin _n, alpha describes the significance level in the style 1-alpha
                conf_norm = sms.DescrStatsW(
                    bin_for_mean[:,
                                 2]).tconfint_mean(alpha=confidence_interval)
                mean_data_in_bin = np.array([
                    percentage - binwidth / 2,
                    # calculate mean in bin
                    bin_for_mean[:, 1].mean(),
                    # calculate mean in bin _n
                    bin_for_mean[:, 2].mean(),
                    # add conf. interv. results to np array
                    conf[0],
                    conf[1],
                    conf_norm[0],
                    conf_norm[1],
                    # add the number of TMDs in bin to bin
                    len(bin_for_mean[:, 0])
                ])
                mean_data_in_bin = mean_data_in_bin.reshape(1, 8)
                sys.stdout.write('.'), sys.stdout.flush()
                binned_data = np.concatenate(
                    (mean_data_in_bin.reshape(1, 8), binned_data))

        # # create bins, calculate mean and confidence interval in bin - use multiprocessing if possible
        # sys.stdout.write('\nBinning data - calculating confidence interval\n')
        #
        # use_multiprocessing = s['use_multiprocessing']
        # n_processes = s['multiprocessing_cores']
        # remove_from_binlist = int((1 - s['fa_min_identity_of_full_protein']) * 100)
        # number_of_bins = s['specify_number_of_bins_characterising_TMDs']
        # confidence_interval = (100 - s['CI']) / 100
        # linspace_binlist = np.linspace(1, 100, number_of_bins)[:remove_from_binlist]
        # binned_data = np.empty([0, 8])
        # binwidth = 100 / number_of_bins
        # list_p = []
        # for percentage in linspace_binlist:
        #     data_as_dict = {'data': data, 'percentage': percentage, 'binwidth': binwidth, 'confidence_interval': confidence_interval}
        #     list_p.append(data_as_dict)
        #
        # if use_multiprocessing:
        #     with Pool(processes=n_processes) as pool:
        #             mean_data_in_bin = pool.map(binning_data_multiprocessing, list_p)
        # else:
        #     mean_data_in_bin = []
        #     for p in list_p:
        #         output = binning_data_multiprocessing(p)
        #         if type(output) is np.ndarray:
        #             mean_data_in_bin.append(output)
        #
        # for n, element in enumerate(mean_data_in_bin):
        #     if type(mean_data_in_bin[n]) is np.ndarray:
        #         binned_data = np.concatenate((mean_data_in_bin[n].reshape(1, 8), binned_data))

        # create bins, calculate mean and 95% confidence interval
        # sys.stdout.write('\nBinning data - calculating confidence interval\n')
        # confidence_interval = (100 - s['CI'])/100
        # number_of_bins = s['specify_number_of_bins_characterising_TMDs']
        # linspace_binlist = np.linspace(1, 100, number_of_bins)
        # binwidth = 100/number_of_bins
        # binned_data = np.empty([0, 8])
        # # conf_95 = np.array([1, 2])
        # # conf95_norm = np.array([1, 2])
        # for percentage in linspace_binlist:
        #     if percentage % 5 == 0:
        #         sys.stdout.write('{}%, '.format(int(percentage))), sys.stdout.flush()
        #     bin_for_mean = np.empty([0, 3])
        #     for row in data:
        #         if row[0] < percentage and row[0] > percentage - binwidth:
        #             bin_for_mean = np.concatenate((bin_for_mean, row.reshape(1, 3)))
        #     if bin_for_mean.size != 0:
        #         # calculate conf. interv. in bin, alpha describes the significance level in the style 1-alpha
        #         conf = sms.DescrStatsW(bin_for_mean[:, 1]).tconfint_mean(alpha=confidence_interval)
        #         # calculate conf. interv. in bin _n, alpha describes the significance level in the style 1-alpha
        #         conf_norm = sms.DescrStatsW(bin_for_mean[:, 2]).tconfint_mean(alpha=confidence_interval)
        #         mean_data_in_bin = np.array([percentage - binwidth/2,
        #                                      # calculate mean in bin
        #                                      bin_for_mean[:, 1].mean(),
        #                                      # calculate mean in bin _n
        #                                      bin_for_mean[:, 2].mean(),
        #                                      # add conf. interv. results to np array
        #                                      conf[0], conf[1], conf_norm[0], conf_norm[1],
        #                                      # add the number of TMDs in bin to bin
        #                                      len(bin_for_mean[:, 0])])
        #         # merge data from bin to the others
        #         binned_data = np.concatenate((mean_data_in_bin.reshape(1, 8), binned_data))
        # # drop every row containing nan in array
        # binned_data = binned_data[~np.isnan(binned_data).any(axis=1)]
        '''
        description of columns in numpy arrays:

        numpy array data:
        |       0       |   1    |    2     |
        | % obs_changes | AAIMON | AAIMON_n |

        numpy array binned_data:
        |       0       |      1      |       2       |    3   |   4   |     5    |    6    |        7        |
        | % obs_changes | mean AAIMON | mean AAIMON_n | CI_low | CI_hi | CI_low_n | CI_hi_n | TMDs in the bin |
        '''
        # save data and binned_data as zipped pickle
        with zipfile.ZipFile(pathdict['save_df_characterising_each_homol_TMD'],
                             mode="w",
                             compression=zipfile.ZIP_DEFLATED) as zipout:

            # save dataframe "data_filt" as pickle
            with open('data_characterising_each_homol_TMD.pickle', "wb") as f:
                pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
            zipout.write('data_characterising_each_homol_TMD.pickle',
                         arcname='data_characterising_each_homol_TMD.pickle')
            os.remove('data_characterising_each_homol_TMD.pickle')

            # save dataframe "binned_data" as pickle
            with open('binned_data_characterising_each_homol_TMD.pickle',
                      "wb") as f:
                pickle.dump(binned_data, f, protocol=pickle.HIGHEST_PROTOCOL)
            zipout.write(
                'binned_data_characterising_each_homol_TMD.pickle',
                arcname='binned_data_characterising_each_homol_TMD.pickle')
            os.remove('binned_data_characterising_each_homol_TMD.pickle')

    logging.info(
        "\n~~~~~~~~~~~~                           finished gather_AAIMONs                      ~~~~~~~~~~~~"
    )
    return pathdict
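The DescrStatsW call buried inside the binning loop above reduces to the following pattern; a toy array stands in for `data` (column 0 = % observed changes, column 1 = AAIMON ratio).

import numpy as np
import statsmodels.stats.api as sms

rng = np.random.default_rng(7)
data = np.column_stack([rng.uniform(0, 100, 500), rng.normal(1.0, 0.1, 500)])

binwidth = 10.0
for upper_edge in np.arange(binwidth, 100.0 + binwidth, binwidth):
    bin_for_mean = data[(upper_edge >= data[:, 0]) & (data[:, 0] > upper_edge - binwidth)]
    if bin_for_mean.size != 0:
        # 95% confidence interval for the mean AAIMON within the bin
        conf = sms.DescrStatsW(bin_for_mean[:, 1]).tconfint_mean(alpha=0.05)
        print(upper_edge - binwidth / 2, bin_for_mean[:, 1].mean(), conf)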
Example #12
plt.ylabel('Median value of owner-occupied homes in $1000s')
plt.show()

print('\n\nPart 2\n---------------------------')
print(
    "Under the null hypothesis, we ask how often a difference at least this large could arise by chance alone."
)
print(
    "Conventionally, the null hypothesis is rejected when that probability (the p-value) is below 5%."
)
dfchas1 = df.MEDV[df.CHAS == 1]
dfchas0 = df.MEDV[df.CHAS == 0]
ttest, pval = stats.ttest_ind(dfchas1, dfchas0)
print('P-value:', pval, 't-statistic:', ttest)

means = ws.CompareMeans(ws.DescrStatsW(dfchas1), ws.DescrStatsW(dfchas0))
confint = means.tconfint_diff(alpha=0.05,
                              alternative='two-sided',
                              usevar='unequal')
print('Confidence interval:', confint[0], confint[1])
ratio = len(dfchas0) / len(dfchas1)
gsize = tt_ind_solve_power(effect_size=0.6,
                           nobs1=None,
                           alpha=0.05,
                           power=0.8,
                           ratio=ratio,
                           alternative='two-sided')
print(
    'Assume an effect size (Cohen’s d) of 0.6. If you want 80% power, what group size is necessary?',
    gsize)
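For reference, tt_ind_solve_power solves for whichever argument is left as None; a standalone sketch of the same calculation with an explicit import (ratio set to 1.0 here purely for illustration):

from statsmodels.stats.power import tt_ind_solve_power

# required size of group 1 for d = 0.6, alpha = 0.05, power = 0.8; group 2 has ratio * nobs1 observations
nobs1 = tt_ind_solve_power(effect_size=0.6, nobs1=None, alpha=0.05,
                           power=0.8, ratio=1.0, alternative='two-sided')
print(nobs1)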