Example #1
def test_omni_normtest():
    #tests against R fBasics
    from scipy import stats

    st_pv_R = np.array(
              [[3.994138321207883, -1.129304302161460,  1.648881473704978],
               [0.1357325110375005, 0.2587694866795507, 0.0991719192710234]])

    nt = omni_normtest(x)
    assert_almost_equal(nt, st_pv_R[:, 0], 14)

    st = stats.skewtest(x)
    assert_almost_equal(st, st_pv_R[:, 1], 14)

    kt = stats.kurtosistest(x)
    assert_almost_equal(kt, st_pv_R[:, 2], 11)

    st_pv_R = np.array(
              [[34.523210399523926,  4.429509162503833,  3.860396220444025],
               [3.186985686465249e-08, 9.444780064482572e-06, 1.132033129378485e-04]])

    x2 = x**2
    # TODO: fix precision in these tests with a relative tolerance
    nt = omni_normtest(x2)
    assert_almost_equal(nt, st_pv_R[:, 0], 12)

    st = stats.skewtest(x2)
    assert_almost_equal(st, st_pv_R[:, 1], 12)

    kt = stats.kurtosistest(x2)
    assert_almost_equal(kt, st_pv_R[:, 2], 12)
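
This test relies on a module-level fixture `x` that the excerpt does not show, and the asserted R values apply only to that fixture. A minimal self-contained sketch, assuming any 1-D standard-normal sample stands in for it:

import numpy as np
from scipy import stats
from statsmodels.stats.stattools import omni_normtest

# Assumption: a reproducible random sample replaces the original fixture,
# so the printed values will not match the R reference numbers above.
x = np.random.RandomState(12345).standard_normal(100)

stat, pval = omni_normtest(x)            # omnibus K^2 statistic and p-value
sk_stat, sk_pval = stats.skewtest(x)     # skewness component
kt_stat, kt_pval = stats.kurtosistest(x) # kurtosis component
print(stat, pval, sk_stat, sk_pval, kt_stat, kt_pval)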
Example #2
def test_omni_normtest():
    #tests against R fBasics
    from scipy import stats
    st_pv_R = np.array(
              [[3.994138321207883, -1.129304302161460,  1.648881473704978],
               [0.1357325110375005, 0.2587694866795507, 0.0991719192710234]])

    nt = omni_normtest(x)
    assert_almost_equal(nt, st_pv_R[:,0], 14)

    st = stats.skewtest(x)
    assert_almost_equal(st, st_pv_R[:,1], 14)

    kt = stats.kurtosistest(x)
    assert_almost_equal(kt, st_pv_R[:,2], 11)

    st_pv_R = np.array(
              [[34.523210399523926,  4.429509162503833,  3.860396220444025],
               [3.186985686465249e-08, 9.444780064482572e-06, 1.132033129378485e-04]])

    x2 = x**2
    # TODO: fix precision in these tests with a relative tolerance
    nt = omni_normtest(x2)
    assert_almost_equal(nt, st_pv_R[:,0], 12)

    st = stats.skewtest(x2)
    assert_almost_equal(st, st_pv_R[:,1], 12)

    kt = stats.kurtosistest(x2)
    assert_almost_equal(kt, st_pv_R[:,2], 12)
Example #3
def test_omni_normtest_axis(reset_randomstate):
    # test the axis argument of omni_normtest
    x = np.random.randn(25, 3)
    nt1 = omni_normtest(x)
    nt2 = omni_normtest(x, axis=0)
    nt3 = omni_normtest(x.T, axis=1)
    assert_almost_equal(nt2, nt1, decimal=13)
    assert_almost_equal(nt3, nt1, decimal=13)
Example #4
def test_omni_normtest_axis():
    # test the axis argument of omni_normtest
    x = np.random.randn(25, 3)
    nt1 = omni_normtest(x)
    nt2 = omni_normtest(x, axis=0)
    nt3 = omni_normtest(x.T, axis=1)
    assert_almost_equal(nt2, nt1, decimal=13)
    assert_almost_equal(nt3, nt1, decimal=13)
Example #5
    def homoscedasticity(self, all=False):
        # sns.scatterplot(data=self.df, x=self.indep[0], y=self.dep)
        # plt.show()
        # lev, p_lev = scipy.stats.levene(*self.dfs)  # p > 0.05 supports homoscedasticity
        self.tests.loc['omnibus'] = [*omni_normtest(self.residuals)]
        self.tests.loc['normaltest'] = [
            *scipy.stats.normaltest(self.residuals)
        ]
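
A standalone sketch of the pattern used above, with a plain DataFrame standing in for `self.tests` and random draws for `self.residuals` (the class context is not shown in the excerpt):

import numpy as np
import pandas as pd
import scipy.stats
from statsmodels.stats.stattools import omni_normtest

residuals = np.random.RandomState(0).standard_normal(200)  # stand-in residuals

# Each test returns a (statistic, p-value) pair that unpacks into one row.
tests = pd.DataFrame(columns=['statistic', 'p-value'])
tests.loc['omnibus'] = [*omni_normtest(residuals)]
tests.loc['normaltest'] = [*scipy.stats.normaltest(residuals)]
print(tests)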
Example #6
def regression_scores(timeseries, time_window_size, time_lag, reg, cv, scoring, timeseriesZ=None):
    """Compute regression scores for a given set of 3 timeseries
    according to the variable causality structures.
    """
    global causality_structures
    if scoring == 'residual_tests':
        features_regression = np.zeros([len(causality_structures),7])
    else:
        features_regression = np.zeros([len(causality_structures),2]) #added 2 dimensions to compute r2 and mse
    for j, (cs_train, cs_test) in enumerate(causality_structures):
        ts_train = timeseries[:,cs_train]
        if timeseriesZ is not None:
            ts_train = np.hstack([ts_train, timeseriesZ])
        
        if time_lag is None:
            time_lag=time_window_size
        
        ts_test = timeseries[:,cs_test]
        tmp_score = np.zeros([time_window_size,2]) #added 2 dimensions to compute r2 and mse
        residuals = np.zeros(timeseries.shape[0]-time_window_size)
        for i_reg in range(time_window_size):
            idx_example = np.arange(i_reg, timeseries.shape[0]-time_lag, time_window_size)
            X = np.zeros((idx_example.size, time_window_size, ts_train.shape[1]))
            for k in range(time_window_size):
                X[:,k] = ts_train[idx_example+k]
            
            X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
            y = ts_test[idx_example + time_lag]
            if scoring == 'residual_tests':
                y_pred_i_reg = np.zeros(y.size)
                kfold = KFold(n=y.size, n_folds=cv)  # pre-0.18 scikit-learn API
                for train, test in kfold:
                    reg.fit(X[train], y[train])
                    y_pred_i_reg[test] = reg.predict(X[test])
                
                residuals[idx_example] = y - y_pred_i_reg #residuals
            else:
                tmp_predict = cross_val_predict(reg, X, y, cv=cv)
                tmp_score[i_reg, 0] = r2_score(y, tmp_predict)
                tmp_score[i_reg, 1] = mean_squared_error(y, tmp_predict)
                #tmp_score[i_reg] = cross_val_score(reg, X, y, cv=cv, scoring=scoring).mean()
        
        if scoring == 'residual_tests':
            features_regression[j,0] = durbin_watson(residuals)
            features_regression[j,[1,2]] = omni_normtest(residuals) 
            features_regression[j,3:] = jarque_bera(residuals)
        else:
            features_regression[j] = tmp_score.mean(0)

    return features_regression
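
`KFold(n=y.size, n_folds=cv)` is the pre-0.18 scikit-learn API, in which the splitter object is iterated directly. A sketch of the same out-of-fold prediction loop with the current API (`sklearn.model_selection.KFold`), using stand-ins for `X`, `y`, and `reg`:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

rng = np.random.RandomState(0)
X = rng.standard_normal((60, 3))                   # stand-in feature matrix
y = X @ np.array([0.5, -1.0, 2.0]) + 0.1 * rng.standard_normal(60)
reg = LinearRegression()

y_pred = np.zeros(y.size)
for train, test in KFold(n_splits=5).split(X):     # n_folds= became n_splits=
    reg.fit(X[train], y[train])
    y_pred[test] = reg.predict(X[test])
residuals = y - y_pred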
Example #7
def process_linreg(x, y, metrics_dict, suffix):
    x = sm.add_constant(x)

    results = sm.OLS(y, x).fit()

    residuals = results.resid

    jb, jbpv, skew, kurtosis = jarque_bera(results.wresid)
    omni, omnipv = omni_normtest(results.wresid)

    res_mean = np.mean(residuals)
    res_std = np.std(residuals)

    _, normality_p_value_shapiro = shapiro(residuals)
    _, normality_p_value_ks_wo_params = kstest(residuals, 'norm')
    _, normality_p_value_ks_with_params = kstest(residuals, 'norm',
                                                 (res_mean, res_std))
    _, normality_p_value_dagostino = normaltest(residuals)

    metrics_dict['mean' + suffix].append(np.mean(y))
    metrics_dict['R2' + suffix].append(results.rsquared)
    metrics_dict['R2_adj' + suffix].append(results.rsquared_adj)
    metrics_dict['f_stat' + suffix].append(results.fvalue)
    metrics_dict['prob(f_stat)' + suffix].append(results.f_pvalue)
    metrics_dict['log_likelihood' + suffix].append(results.llf)
    metrics_dict['AIC' + suffix].append(results.aic)
    metrics_dict['BIC' + suffix].append(results.bic)
    metrics_dict['omnibus' + suffix].append(omni)
    metrics_dict['prob(omnibus)' + suffix].append(omnipv)
    metrics_dict['skew' + suffix].append(skew)
    metrics_dict['kurtosis' + suffix].append(kurtosis)
    metrics_dict['durbin_watson' + suffix].append(durbin_watson(
        results.wresid))
    metrics_dict['jarque_bera' + suffix].append(jb)
    metrics_dict['prob(jarque_bera)' + suffix].append(jbpv)
    metrics_dict['cond_no' + suffix].append(results.condition_number)
    metrics_dict['normality_p_value_shapiro' +
                 suffix].append(normality_p_value_shapiro)
    metrics_dict['normality_p_value_ks_wo_params' +
                 suffix].append(normality_p_value_ks_wo_params)
    metrics_dict['normality_p_value_ks_with_params' +
                 suffix].append(normality_p_value_ks_with_params)
    metrics_dict['normality_p_value_dagostino' +
                 suffix].append(normality_p_value_dagostino)
    metrics_dict['intercept' + suffix].append(results.params[0])
    metrics_dict['slope' + suffix].append(results.params[1])
    metrics_dict['intercept_std' + suffix].append(results.bse[0])
    metrics_dict['slope_std' + suffix].append(results.bse[1])
    metrics_dict['intercept_p_value' + suffix].append(results.pvalues[0])
    metrics_dict['slope_p_value' + suffix].append(results.pvalues[1])
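
`process_linreg` appends to whatever lists `metrics_dict` already holds, so a `defaultdict(list)` is a convenient container. A hypothetical driver, assuming the names used inside the function (`sm`, `jarque_bera`, `omni_normtest`, `shapiro`, `kstest`, `normaltest`, `durbin_watson`) are already imported in the module:

from collections import defaultdict
import numpy as np

rng = np.random.RandomState(42)
age = rng.uniform(20, 80, size=100)             # hypothetical predictor
outcome = 0.3 * age + rng.standard_normal(100)  # hypothetical response

metrics_dict = defaultdict(list)
process_linreg(age, outcome, metrics_dict, suffix='_age')
print(metrics_dict['omnibus_age'], metrics_dict['prob(omnibus)_age'])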
Example #8
    def __init__(self, model) -> None:  # model: a fitted regression results instance
        self.title = model.model.__class__.__name__ + ' ' + "Regression Results"

        #top-left
        self.Dep_Variable = None
        self.Model = None
        self.Method = ['Least Squares']
        self.Date = None
        self.Time = None
        self.No_Observations = None
        self.DfResiduals = None
        self.DfModel = None

        #top-right
        self.R_squared = ["%#8.3f" % model.rsquared]
        self.Adj_R_squared = ["%#8.3f" % model.rsquared_adj]
        self.F_statistic = ["%#8.4g" % model.fvalue]
        self.Prob_F_statistic = ["%#6.3g" % model.f_pvalue]
        self.Log_Likelihood = None
        self.AIC = ["%#8.4g" % model.aic]
        self.BIC = ["%#8.4g" % model.bic]

        from statsmodels.stats.stattools import (jarque_bera, omni_normtest,
                                                 durbin_watson)
        jb, jbpv, skew, kurtosis = jarque_bera(model.wresid)
        omni, omnipv = omni_normtest(model.wresid)
        eigvals = model.eigenvals
        condno = model.condition_number
        #diagn_left
        self.Omnibus = ["%#6.3f" % omni]
        self.Prob_Omnibus = ["%#6.3f" % omnipv]
        self.Skew = ["%#6.3f" % skew]
        self.Kurtosis = ["%#6.3f" % kurtosis]

        #diagn_right
        self.Durbin_Watson = ["%#8.3f" % durbin_watson(model.wresid)]
        self.JarqueBera_JB = ["%#8.3f" % jb]
        self.Prob_JB = ["%#8.3g" % jbpv]
        self.Cond_No = ["%#8.3g" % condno]
Example #9
def linear_new(types, input_file):
    np.random.seed(9876789)

    df = pd.read_csv(input_file, index_col=False)
    print(df)
    print(df.columns[:-1])
    feature = df.columns[:-1]
    s1 = ' + '.join(feature)
    s2 = df.columns[-1]
    s = s2 + " ~ " + s1

    if types == "ols":
        results = smf.ols(s, data=df).fit(use_t=True)
    elif types == "gls":
        results = smf.gls(s, data=df).fit(use_t=True)
    elif types == "glsar":
        results = smf.glsar(s, data=df).fit(use_t=True)
    elif types == "wls":
        results = smf.wls(s, data=df).fit(use_t=True)
    else:
        print("No this type!!!")
        exit(0)

    print(
        "**********************************************************************************\n"
    )
    alpha = 0.05
    print(results.summary())

    data_t = {
        "coef": results.params,
        "std err": results.bse,
        "t": results.tvalues,
        "P>|t|": results.pvalues,
        "[" + str(alpha / 2.0): results.conf_int(alpha)[0],
        str(1 - alpha / 2.0) + "]": results.conf_int(alpha)[1]
    }

    sdata_df = pd.DataFrame(data_t)
    print(sdata_df)
    sdata_df.to_csv("out/data1.csv")

    from statsmodels.stats.stattools import (jarque_bera, omni_normtest,
                                             durbin_watson)

    jb, jbpv, skew, kurtosis = jarque_bera(results.wresid)
    omni, omnipv = omni_normtest(results.wresid)

    title = [
        "Model", "R-squared", "Adj. R-squared", "F-statistic",
        "Prob (F-statistic)", "Log-Likelihood", "AIC", "BIC", "Omnibus",
        "Prob(Omnibus)", "Skew", "Kurtosis", "Durbin-Watson",
        "Jarque-Bera (JB)", "Prob(JB)", "Cond. No."
    ]

    value = [
        results.model.__class__.__name__, results.rsquared,
        results.rsquared_adj, results.fvalue, results.f_pvalue, results.llf,
        results.aic, results.bic, omni, omnipv, skew, kurtosis,
        durbin_watson(results.wresid), jb, jbpv, results.diagn['condno']
    ]

    datadf = {"title": np.array(title), "value": np.array(value)}

    select_df = pd.DataFrame(datadf)
    print(select_df)
    select_df.to_csv("out/data2.csv")

    # Plot a 1D or a 3D figure
    predicted = results.predict(df)
    import matplotlib.pyplot as plt
    if len(feature) == 1:
        x = np.array(df[feature]).reshape(-1, 1)
        y = np.array(df[s2]).reshape(-1, 1)
        plt.figure(facecolor='white', figsize=(10, 5))
        plt.scatter(x, y, marker='x')
        plt.plot(x, predicted, c='r')

        title = 'Linear Fit in One Dimension'
        # label the x and y axes
        plt.xlabel(feature[0])
        plt.ylabel(s2)
        plt.title(title)
        plt.grid()
        plt.savefig("out/plot_out.png", format='png')

    elif len(feature) == 2:
        from mpl_toolkits.mplot3d import Axes3D
        ax1 = plt.axes(projection='3d')

        x = np.array(df[feature[0]]).reshape(-1, 1)
        y = np.array(df[feature[1]]).reshape(-1, 1)
        z = np.array(df[s2]).reshape(-1, 1)
        ax1.scatter3D(x, y, z, cmap='Blues')  # scatter plot
        ax1.plot3D(x, y, predicted, 'gray')  # line through the predictions
        ax1.set_xlabel(feature[0])
        ax1.set_ylabel(feature[1])
        ax1.set_zlabel(s2)
        plt.savefig("out/plot_out.png", format='png')
    else:
        print("The number of feature is big than 2 ,no plot!")

    return
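
A hypothetical invocation: the function expects a CSV whose last column is the dependent variable and writes its tables into an existing `out/` directory, so the sketch creates both first (it also assumes `smf` and the other names used by `linear_new` are imported in the module):

import os
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'x1': rng.standard_normal(50),
                   'x2': rng.standard_normal(50)})
df['y'] = 1.5 * df['x1'] - 0.5 * df['x2'] + 0.1 * rng.standard_normal(50)

os.makedirs('out', exist_ok=True)   # data1.csv / data2.csv land here
df.to_csv('demo.csv', index=False)
linear_new('ols', 'demo.csv')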
Example #10
    def summary(self, yname=None, xname=None, title=None, alpha=.05):
        """Summarize the Regression Results

        Parameters
        ----------
        yname : string, optional
            Default is `y`
        xname : list of strings, optional
            Default is `var_##`, where ## ranges over the p regressors
        title : string, optional
            Title for the top table. If not None, then this replaces the
            default title
        alpha : float
            significance level for the confidence intervals

        Returns
        -------
        smry : Summary instance
            this holds the summary tables and text, which can be printed or
            converted to various output formats.

        See Also
        --------
        statsmodels.iolib.summary.Summary : class to hold summary
            results

        """

        #TODO: import where we need it (for now), add as cached attributes
        from statsmodels.stats.stattools import (jarque_bera,
                omni_normtest, durbin_watson)
        jb, jbpv, skew, kurtosis = jarque_bera(self.wresid)
        omni, omnipv = omni_normtest(self.wresid)

        eigvals = self.eigenvals
        condno = self.condition_number

        self.diagn = dict(jb=jb, jbpv=jbpv, skew=skew, kurtosis=kurtosis,
                          omni=omni, omnipv=omnipv, condno=condno,
                          mineigval=eigvals[0])

        top_left = [('Dep. Variable:', None),
                    ('Model:', None),
                    ('Method:', ['Least Squares']),
                    ('Date:', None),
                    ('Time:', None)
                    ]

        top_right = [('Pseudo R-squared:', ["%#8.4g" % self.prsquared]),
                     ('Bandwidth:', ["%#8.4g" % self.bandwidth]),
                     ('Sparsity:', ["%#8.4g" % self.sparsity]),
                     ('No. Observations:', None),
                     ('Df Residuals:', None), #[self.df_resid]), #TODO: spelling
                     ('Df Model:', None) #[self.df_model])
                    ]

        diagn_left = [('Omnibus:', ["%#6.3f" % omni]),
                      ('Prob(Omnibus):', ["%#6.3f" % omnipv]),
                      ('Skew:', ["%#6.3f" % skew]),
                      ('Kurtosis:', ["%#6.3f" % kurtosis])
                      ]

        diagn_right = [('Durbin-Watson:', ["%#8.3f" % durbin_watson(self.wresid)]),
                       ('Jarque-Bera (JB):', ["%#8.3f" % jb]),
                       ('Prob(JB):', ["%#8.3g" % jbpv]),
                       ('Cond. No.', ["%#8.3g" % condno])
                       ]


        if title is None:
            title = self.model.__class__.__name__ + ' ' + "Regression Results"

        #create summary table instance
        from statsmodels.iolib.summary import Summary
        smry = Summary()
        smry.add_table_2cols(self, gleft=top_left, gright=top_right,
                          yname=yname, xname=xname, title=title)
        smry.add_table_params(self, yname=yname, xname=xname, alpha=alpha,
                             use_t=True)

        # smry.add_table_2cols(self, gleft=diagn_left, gright=diagn_right,
        #                      yname=yname, xname=xname, title="")

        #add warnings/notes, added to text format only
        etext = []
        if eigvals[-1] < 1e-10:
            wstr = "The smallest eigenvalue is %6.3g. This might indicate "
            wstr += "that there are\n"
            wstr += "strong multicollinearity problems or that the design "
            wstr += "matrix is singular."
            wstr = wstr % eigvals[-1]
            etext.append(wstr)
        elif condno > 1000:  #TODO: what is recommended
            wstr = "The condition number is large, %6.3g. This might "
            wstr += "indicate that there are\n"
            wstr += "strong multicollinearity or other numerical "
            wstr += "problems."
            wstr = wstr % condno
            etext.append(wstr)

        if etext:
            smry.add_extra_txt(etext)

        return smry
Example #11
    def summary(self, yname=None, xname=None, title=None, alpha=.05):
        """Summarize the Regression Results

        Parameters
        ----------
        yname : string, optional
            Default is `y`
        xname : list of strings, optional
            Default is `var_##`, where ## ranges over the p regressors
        title : string, optional
            Title for the top table. If not None, then this replaces the
            default title
        alpha : float
            significance level for the confidence intervals

        Returns
        -------
        smry : Summary instance
            this holds the summary tables and text, which can be printed or
            converted to various output formats.

        See Also
        --------
        statsmodels.iolib.summary.Summary : class to hold summary
            results

        """

        #TODO: import where we need it (for now), add as cached attributes
        from statsmodels.stats.stattools import (jarque_bera, omni_normtest,
                                                 durbin_watson)
        jb, jbpv, skew, kurtosis = jarque_bera(self.wresid)
        omni, omnipv = omni_normtest(self.wresid)

        eigvals = self.eigenvals
        condno = self.condition_number

        self.diagn = dict(jb=jb,
                          jbpv=jbpv,
                          skew=skew,
                          kurtosis=kurtosis,
                          omni=omni,
                          omnipv=omnipv,
                          condno=condno,
                          mineigval=eigvals[0])

        top_left = [('Dep. Variable:', None), ('Model:', None),
                    ('Method:', ['Least Squares']), ('Date:', None),
                    ('Time:', None)]

        top_right = [
            ('Pseudo R-squared:', ["%#8.4g" % self.prsquared]),
            ('Bandwidth:', ["%#8.4g" % self.bandwidth]),
            ('Sparsity:', ["%#8.4g" % self.sparsity]),
            ('No. Observations:', None),
            ('Df Residuals:', None),  #[self.df_resid]), #TODO: spelling
            ('Df Model:', None)  #[self.df_model])
        ]

        diagn_left = [('Omnibus:', ["%#6.3f" % omni]),
                      ('Prob(Omnibus):', ["%#6.3f" % omnipv]),
                      ('Skew:', ["%#6.3f" % skew]),
                      ('Kurtosis:', ["%#6.3f" % kurtosis])]

        diagn_right = [('Durbin-Watson:',
                        ["%#8.3f" % durbin_watson(self.wresid)]),
                       ('Jarque-Bera (JB):', ["%#8.3f" % jb]),
                       ('Prob(JB):', ["%#8.3g" % jbpv]),
                       ('Cond. No.', ["%#8.3g" % condno])]

        if title is None:
            title = self.model.__class__.__name__ + ' ' + "Regression Results"

        #create summary table instance
        from statsmodels.iolib.summary import Summary
        smry = Summary()
        smry.add_table_2cols(self,
                             gleft=top_left,
                             gright=top_right,
                             yname=yname,
                             xname=xname,
                             title=title)
        smry.add_table_params(self,
                              yname=yname,
                              xname=xname,
                              alpha=alpha,
                              use_t=True)

        # smry.add_table_2cols(self, gleft=diagn_left, gright=diagn_right,
        #                      yname=yname, xname=xname, title="")

        #add warnings/notes, added to text format only
        etext = []
        if eigvals[-1] < 1e-10:
            wstr = "The smallest eigenvalue is %6.3g. This might indicate "
            wstr += "that there are\n"
            wstr += "strong multicollinearity problems or that the design "
            wstr += "matrix is singular."
            wstr = wstr % eigvals[-1]
            etext.append(wstr)
        elif condno > 1000:  #TODO: what is recommended
            wstr = "The condition number is large, %6.3g. This might "
            wstr += "indicate that there are\n"
            wstr += "strong multicollinearity or other numerical "
            wstr += "problems."
            wstr = wstr % condno
            etext.append(wstr)

        if etext:
            smry.add_extra_txt(etext)

        return smry
Example #12
    def single(self, item, config, configs_child):

        if config.experiment.data in [DataType.betas, DataType.betas_adj, DataType.residuals_common,
                                      DataType.residuals_special]:

            if config.experiment.method == Method.linreg:

                targets = self.get_strategy.get_target(config)
                x = sm.add_constant(targets)
                y = self.get_strategy.get_single_base(config, [item])[0]

                results = sm.OLS(y, x).fit()

                y = results.resid

                jb, jbpv, skew, kurtosis = jarque_bera(results.wresid)
                omni, omnipv = omni_normtest(results.wresid)

                res_mean = np.mean(y)
                res_std = np.std(y)

                _, normality_p_value_shapiro = shapiro(y)
                _, normality_p_value_ks_wo_params = kstest(y, 'norm')
                _, normality_p_value_ks_with_params = kstest(y, 'norm', (res_mean, res_std))
                _, normality_p_value_dagostino = normaltest(y)

                config.metrics['item'].append(item)
                aux = self.get_strategy.get_aux(config, item)
                config.metrics['aux'].append(aux)
                config.metrics['R2'].append(results.rsquared)
                config.metrics['R2_adj'].append(results.rsquared_adj)
                config.metrics['f_stat'].append(results.fvalue)
                config.metrics['prob(f_stat)'].append(results.f_pvalue)
                config.metrics['log_likelihood'].append(results.llf)
                config.metrics['AIC'].append(results.aic)
                config.metrics['BIC'].append(results.bic)
                config.metrics['omnibus'].append(omni)
                config.metrics['prob(omnibus)'].append(omnipv)
                config.metrics['skew'].append(skew)
                config.metrics['kurtosis'].append(kurtosis)
                config.metrics['durbin_watson'].append(durbin_watson(results.wresid))
                config.metrics['jarque_bera'].append(jb)
                config.metrics['prob(jarque_bera)'].append(jbpv)
                config.metrics['cond_no'].append(results.condition_number)
                config.metrics['normality_p_value_shapiro'].append(normality_p_value_shapiro)
                config.metrics['normality_p_value_ks_wo_params'].append(normality_p_value_ks_wo_params)
                config.metrics['normality_p_value_ks_with_params'].append(normality_p_value_ks_with_params)
                config.metrics['normality_p_value_dagostino'].append(normality_p_value_dagostino)
                config.metrics['intercept'].append(results.params[0])
                config.metrics['slope'].append(results.params[1])
                config.metrics['intercept_std'].append(results.bse[0])
                config.metrics['slope_std'].append(results.bse[1])
                config.metrics['intercept_p_value'].append(results.pvalues[0])
                config.metrics['slope_p_value'].append(results.pvalues[1])

            elif config.experiment.method == Method.cluster:

                x = self.get_strategy.get_target(config)
                x_normed = normalize_to_0_1(x)
                y = self.get_strategy.get_single_base(config, [item])[0]
                y_normed = normalize_to_0_1(y)

                min_samples = max(1, int(config.experiment.method_params['min_samples_percentage'] * len(x) / 100.0))

                X = np.array([x_normed, y_normed]).T
                db = DBSCAN(eps=config.experiment.method_params['eps'], min_samples=min_samples).fit(X)
                core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
                core_samples_mask[db.core_sample_indices_] = True
                labels = db.labels_
                number_of_clusters = len(set(labels)) - (1 if -1 in labels else 0)
                number_of_noise_points = list(labels).count(-1)

                config.metrics['item'].append(item)
                config.metrics['aux'].append(self.get_strategy.get_aux(config, item))
                config.metrics['number_of_clusters'].append(number_of_clusters)
                config.metrics['number_of_noise_points'].append(number_of_noise_points)

            elif config.experiment.method == Method.polygon:

                metrics_keys = get_method_metrics_keys(config)
                for config_child in configs_child:
                    item_id = config_child.advanced_dict[item]
                    for key in config_child.advanced_data:
                        if key not in metrics_keys:
                            advanced_data = config_child.advanced_data[key][item_id]
                            suffix = str(config_child.attributes.observables)
                            if suffix != '' and suffix not in key:
                                key += '_' + suffix
                            config.metrics[key].append(advanced_data)
                            metrics_keys.append(key)

                if config.experiment.method_params['method'] == Method.linreg:

                    polygons_region = []
                    polygons_slope = []
                    polygons_region_min = []
                    max_abs_slope = 0.0
                    is_inside = False

                    mins = [min(self.get_strategy.get_target(config_child)) for config_child in configs_child]
                    maxs = [max(self.get_strategy.get_target(config_child)) for config_child in configs_child]
                    border_l = max(mins)
                    border_r = min(maxs)
                    if border_l > border_r:
                        raise ValueError('Polygons borders are not consistent')

                    for config_child in configs_child:
                        targets = self.get_strategy.get_target(config_child)
                        item_id = config_child.advanced_dict[item]

                        intercept = config_child.advanced_data['intercept'][item_id]
                        slope = config_child.advanced_data['slope'][item_id]
                        intercept_std = config_child.advanced_data['intercept_std'][item_id]
                        slope_std = config_child.advanced_data['slope_std'][item_id]

                        pr = PolygonRoutines(
                            x=targets,
                            y=[],
                            params={
                                'intercept': intercept,
                                'slope': slope,
                                'intercept_std': intercept_std,
                                'slope_std': slope_std
                            },
                            method=config_child.experiment.method
                        )
                        points_region = pr.get_border_points()

                        points_slope = [
                            geometry.Point(slope - 3.0 * slope_std, 0.0),
                            geometry.Point(slope + 3.0 * slope_std, 0.0),
                            geometry.Point(slope + 3.0 * slope_std, 1.0),
                            geometry.Point(slope - 3.0 * slope_std, 1.0),
                        ]

                        max_abs_slope = max(max_abs_slope, abs(slope))

                        pr_min = PolygonRoutines(
                            x=[border_l, border_r],
                            y=[],
                            params={
                                'intercept': intercept,
                                'slope': slope,
                                'intercept_std': intercept_std,
                                'slope_std': slope_std
                            },
                            method=config_child.experiment.method
                        )
                        points_region_min = pr_min.get_border_points()

                        polygon = geometry.Polygon([[point.x, point.y] for point in points_region])
                        polygons_region.append(polygon)

                        polygon = geometry.Polygon([[point.x, point.y] for point in points_slope])
                        polygons_slope.append(polygon)

                        polygon = geometry.Polygon([[point.x, point.y] for point in points_region_min])
                        polygons_region_min.append(polygon)

                    intersection = polygons_region[0]
                    union = polygons_region[0]
                    for polygon in polygons_region[1::]:
                        intersection = intersection.intersection(polygon)
                        union = union.union(polygon)
                    area_intersection_rel = intersection.area / union.area

                    union = polygons_region_min[0]
                    for polygon in polygons_region_min[1::]:
                        union = union.union(polygon)
                    for polygon in polygons_region_min:
                        if union.area == polygon.area:
                            is_inside = True

                    intersection = polygons_slope[0]
                    union = polygons_slope[0]
                    for polygon in polygons_slope[1::]:
                        intersection = intersection.intersection(polygon)
                        union = union.union(polygon)
                    slope_intersection_rel = intersection.area / union.area

                    config.metrics['item'].append(item)
                    aux = self.get_strategy.get_aux(config, item)
                    config.metrics['aux'].append(aux)
                    config.metrics['area_intersection_rel'].append(area_intersection_rel)
                    config.metrics['slope_intersection_rel'].append(slope_intersection_rel)
                    config.metrics['max_abs_slope'].append(max_abs_slope)
                    config.metrics['is_inside'].append(is_inside)

                elif config.experiment.method_params['method'] == Method.variance:

                    polygons_region_box = []

                    for config_child in configs_child:

                        targets = self.get_strategy.get_target(config_child)
                        data = self.get_strategy.get_single_base(config_child, [item])
                        targets = np.squeeze(np.asarray(targets))
                        data = np.squeeze(np.asarray(data))

                        exog = sm.add_constant(targets)
                        endog = data
                        results = sm.OLS(endog, exog).fit()
                        residuals = results.resid

                        semi_window = config_child.experiment.method_params['semi_window']
                        box_b = config_child.experiment.method_params['box_b']
                        box_t = config_child.experiment.method_params['box_t']

                        box_xs, box_bs, box_ms, box_ts = process_box(targets, residuals, semi_window, box_b, box_t)
                        points_box = []
                        for p_id in range(0, len(box_xs)):
                            points_box.append(geometry.Point(
                                box_xs[p_id],
                                box_ts[p_id]
                            ))
                        for p_id in range(len(box_xs) - 1, -1, -1):
                            points_box.append(geometry.Point(
                                box_xs[p_id],
                                box_bs[p_id]
                            ))
                        polygon = geometry.Polygon([[point.x, point.y] for point in points_box])
                        polygons_region_box.append(polygon)

                    intersection_box = polygons_region_box[0]
                    union_box = polygons_region_box[0]
                    for polygon in polygons_region_box[1::]:
                        intersection_box = intersection_box.intersection(polygon)
                        union_box = union_box.union(polygon)
                    area_intersection_rel_box = intersection_box.area / union_box.area

                    config.metrics['item'].append(item)
                    aux = self.get_strategy.get_aux(config, item)
                    config.metrics['aux'].append(aux)
                    config.metrics['area_intersection_rel_box'].append(area_intersection_rel_box)

            elif config.experiment.method == Method.z_test_linreg:

                slopes = []
                slopes_std = []
                num_subs = []

                metrics_keys = get_method_metrics_keys(config)

                for config_child in configs_child:

                    item_id = config_child.advanced_dict[item]

                    for key in config_child.advanced_data:
                        if key not in metrics_keys:
                            advanced_data = config_child.advanced_data[key][item_id]
                            suffix = str(config_child.attributes.observables)
                            if suffix != '' and suffix not in key:
                                key += '_' + suffix
                            config.metrics[key].append(advanced_data)
                            metrics_keys.append(key)

                    slopes.append(config_child.advanced_data['slope'][item_id])
                    slopes_std.append(config_child.advanced_data['slope_std'][item_id])
                    num_subs.append(len(config_child.attributes_dict['age']))

                std_errors = [slopes_std[i] / np.sqrt(num_subs[i]) for i in range(0, len(slopes_std))]
                z_value = (slopes[0] - slopes[1]) / np.sqrt(sum([std_error * std_error for std_error in std_errors]))
                p_value = norm.sf(abs(z_value)) * 2.0

                config.metrics['item'].append(item)
                aux = self.get_strategy.get_aux(config, item)
                config.metrics['aux'].append(aux)
                config.metrics['z_value'].append(z_value)
                config.metrics['p_value'].append(p_value)
                config.metrics['abs_z_value'].append(np.absolute(z_value))

            elif config.experiment.method == Method.aggregator:

                metrics_keys = get_method_metrics_keys(config)

                for config_child in configs_child:

                    item_id = config_child.advanced_dict[item]

                    for key in config_child.advanced_data:
                        if key not in metrics_keys:
                            advanced_data = config_child.advanced_data[key][item_id]
                            suffix = str(config_child.attributes.observables)
                            if suffix != '' and suffix not in key:
                                key += '_' + suffix
                            config.metrics[key].append(advanced_data)
                            metrics_keys.append(key)

                config.metrics['item'].append(item)
                aux = self.get_strategy.get_aux(config, item)
                config.metrics['aux'].append(aux)

            elif config.experiment.method == Method.variance:

                targets = self.get_strategy.get_target(config)
                data = self.get_strategy.get_single_base(config, [item])
                targets = np.squeeze(np.asarray(targets))
                data = np.squeeze(np.asarray(data))

                semi_window = config.experiment.method_params['semi_window']
                box_b = config.experiment.method_params['box_b']
                box_t = config.experiment.method_params['box_t']

                xs, bs, ms, ts = process_box(targets, data, semi_window, box_b, box_t)
                variance_processing(xs, bs, config.metrics, 'box_b')
                variance_processing(xs, ms, config.metrics, 'box_m')
                variance_processing(xs, ts, config.metrics, 'box_t')

                R2 = np.min([config.metrics['box_b_best_R2'][-1], config.metrics['box_t_best_R2'][-1]])

                config.metrics['best_R2'].append(R2)

                config.metrics['item'].append(item)
                aux = self.get_strategy.get_aux(config, item)
                config.metrics['aux'].append(aux)
Example #13
    def check_error_term_normality(self) -> bool:
        """
        Checks if the distribution of the error term is normal by:
        - Shapiro-Wilk's normality test,
        - Jarque-Bera's normality test,
        - Omnibus' normality test,
        - Kolmogorov-Smirnov's normality test,
        - Q-Q plot.
        If:
         - silent_mode = True, method returns:
                                              a) True (which means that the assumption is
                                                 fulfilled) if the percentage of statistical tests
                                                 for which the assumption is fulfilled is higher
                                                 than or equal to set min_fulfill_ratio
                                              b) False (which means that the assumption is not
                                                 fulfilled) if the percentage of statistical tests
                                                 for which the assumption is fulfilled is lower
                                                 than set min_fulfill_ratio
         - silent_mode = False, method returns True/False as above and shows additional statistics,
         descriptions which are helpful in assessing the fulfilment of assumption
        """

        sw = stats.shapiro(self.residuals)
        jb = stats.jarque_bera(self.residuals)
        om = omni_normtest(self.residuals)
        ks = stats.kstest(self.residuals, "norm")
        normality_tests_names = [
            "Shapiro-Wilk", "Jarque-Bera", "Omnibus", "Kolmogorov-Smirnov"
        ]
        # Each result is a (statistic, p-value) pair; stats.anderson is omitted
        # because it returns critical values rather than a p-value.
        normality_tests = [sw, jb, om, ks]
        tests = zip(normality_tests_names, normality_tests)

        if not self.silent_mode:
            print(
                Color.BOLD +
                "Assumption 7. The error term is normally distributed." +
                Color.END, "\n")
            print("This assumption affects on: \n", "- interpretation \n")

            print(
                "REMARK: For datasets with sufficiently large sample size, the normality of "
                "errors distribution comes from Central Limit Theorem.\n")

            print(
                "OLS does not require that the error term follows a normal distribution to "
                "produce unbiased estimates with the minimum variance. However, satisfying this "
                "assumption allows you to perform statistical hypothesis testing and generate "
                "reliable confidence intervals and prediction intervals. \n")

            print(
                "Statistical tests for checking normality of the error term distribution: \n"
            )

            true_counts = 0

            for test in tests:
                print(
                    Color.BOLD + f"{test[0]}: " + Color.END +
                    f"test statistic: {test[1][0]:.4f}, p-value: {test[1][1]}")

                true_counts = true_counts + test_hypothesis(
                    self.alpha,
                    p_value=test[1][1],
                    null_hypothesis="the error term is normally distributed")

            true_ratio = true_counts / len(normality_tests)

            check_fulfill_ratio(true_fulfill_ratio=true_ratio,
                                min_fulfill_ratio=self.min_fulfill_ratio)

            print("Q-Q (quantile-quantile) plot: \n")
            print(
                "HINT: If error term's distribution is similar to normal distribution, the points "
                "in the Q–Q plot will approximately lie on the line y = x")
            sm.qqplot(self.residuals, line='s')
            plt.show()
        else:
            true_counts = 0

            for test in tests:
                true_counts = true_counts + test_hypothesis(
                    self.alpha, p_value=test[1][1], print_outcome=False)

            true_ratio = true_counts / len(normality_tests)

        return check_fulfill_ratio(true_fulfill_ratio=true_ratio,
                                   min_fulfill_ratio=self.min_fulfill_ratio,
                                   print_outcome=False)
Example #14
np.savetxt('phenotypic_age_log10_deci.txt', phenotypic_age, fmt='%.2f')
np.savetxt('mortality_score_1_year_log10_deci.txt',
           mortality_score_1_year,
           fmt='%.2f')
np.savetxt('mortality_score_2_year_log10_deci.txt',
           mortality_score_2_year,
           fmt='%.2f')

x = sm.add_constant(data_dict['Age'])

results = sm.OLS(phenotypic_age, x).fit()

residuals = results.resid

jb, jbpv, skew, kurtosis = jarque_bera(results.wresid)
omni, omnipv = omni_normtest(results.wresid)

res_mean = np.mean(residuals)
res_std = np.std(residuals)

_, normality_p_value_shapiro = shapiro(residuals)
_, normality_p_value_dagostino = normaltest(residuals)

metrics_dict = {}
metrics_dict['R2'] = results.rsquared
metrics_dict['R2_adj'] = results.rsquared_adj
metrics_dict['f_stat'] = results.fvalue
metrics_dict['prob(f_stat)'] = results.f_pvalue
metrics_dict['log_likelihood'] = results.llf
metrics_dict['AIC'] = results.aic
metrics_dict['BIC'] = results.bic
Example #15
$\text{H}_0$: the data are normally distributed

$\text{H}_A$: $\text{H}_0$ does not hold

Like the Jarque-Bera (JB) test, the judgement of normality is based on skewness and kurtosis.

---
The test is included as part of a `statsmodels` subpackage.
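
The two samples used below are created earlier in the notebook; a minimal sketch, assuming `data_norm` holds standard-normal draws and `data_uniform` holds uniform draws:

import numpy as np
from statsmodels.stats.stattools import omni_normtest

rng = np.random.default_rng(123)
data_norm = rng.normal(size=1000)      # should look normal to the test
data_uniform = rng.uniform(size=1000)  # should be rejected by the test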

Let's try it with `data_norm`.

<Returns>
* test statistic
* $p$-value

omni_normtest(data_norm)

The $p$-value is high, so $\text{H}_0$ cannot be rejected even at the 10% significance level.

---
Next, let's try `data_uniform`.

omni_normtest(data_uniform)

The $p$-value is very small, so $\text{H}_0$ can be rejected even at the 1% significance level.

---
Let's test the results of the two regression analyses carried out above.

omni_normtest(res_wage.resid)