def get_stat_overview(y):
    """Return a Series of descriptive statistics and Jarque-Bera test results for y."""
    jb_stat, jb_p = jarque_bera(y)
    return pd.Series(
        data=[
            y.mean(),
            y.median(),
            y.max(),
            y.min(),
            y.std(),
            y.skew(),
            y.kurtosis(),
            jb_stat,
            jb_p,
        ],
        index=[
            "Mean",
            "Median",
            "Max",
            "Min",
            "Std",
            "Skewness",
            "Kurtosis",
            "Jarque_Bera",
            "Jarque_Bera_p",
        ],
    )
Example #2
def jb(x, test=True):
    '''
    Return the Jarque-Bera statistic (test=True) or its p-value (test=False).
    The lower the statistic, the closer the sample is to a normal distribution.
    '''
    np.random.seed(12345678)
    if test:
        return stats.jarque_bera(x)[0]
    return stats.jarque_bera(x)[1]
def diagnostic(x, y, sig_lv=sig_lv, cor=correlation):
    rslt = {}
    p_x = [ss.jarque_bera(x.iloc[:, i]) for i in range(len(x.columns))]
    n_test_x = ['Reject_H0' if p_x[i][1] < sig_lv else 'Not_Reject_H0' for i in range(len(p_x))]
    p_y = [ss.jarque_bera(y.iloc[:, i]) for i in range(len(y.columns))]
    n_test_y = ['Reject_H0' if p_y[i][1] < sig_lv else 'Not_Reject_H0' for i in range(len(p_y))]
    corre = x.corr()
    # flag regressor pairs whose absolute correlation exceeds 0.5 (possible multicollinearity)
    rslt['multi'] = [
        {x.columns[i] + ' ' + x.columns[j]: str(np.absolute(corre.iloc[i, j]) > 0.5)}
        for i in range(len(x.columns) - 1)
        for j in range(i + 1, len(x.columns))
    ]
    # Jarque-Bera normality test for every column of x and y
    rslt['JB_Test'] = (
        [{x.columns[i]: {'p': p_x[i][1], 'rst': n_test_x[i]}} for i in range(len(x.columns))]
        + [{y.columns[i]: {'p': p_y[i][1], 'rst': n_test_y[i]}} for i in range(len(y.columns))]
    )
    return rslt
    def norm_cal(self, x):
        '''Calculate the normality of a single variable x.

        Parameters:
        ----------
        x : numpy.ndarray

        Returns:
        -------
        x_res : dict
            'Statistic': statistic value calculated by the test
            'Pvalue': p-value calculated by the test
            'Critical': critical value if Anderson-Darling is used
            'Test': name of the test used
            'Sample Size': sample size of the variable
            'Result': bool, True if normality is not rejected at the 5% level,
                False otherwise

        Notes:
        -----
        The more conservative cutoffs of 3500 and 50 are chosen based on the
        following test conventions:
        Jarque-Bera requires 2000+ samples;
        Shapiro-Wilk is accurate under 5000 samples;
        and the common definition of a small sample size is 30.'''

        x_res = {}
        if len(x) >= 3500:  # Use Jarque-Bera for samples larger than 3500
            stat, pvalue = ss.jarque_bera(x)
            x_res['Statistic'] = stat
            x_res['Pvalue'] = pvalue
            x_res['Test'] = 'Jarque Bera Test'
            x_res['Sample Size'] = x.shape
        elif len(x) >= 50:  # Use Shapiro-Wilk for samples in [50, 3500)
            stat, pvalue = ss.shapiro(x)
            x_res['Statistic'] = stat
            x_res['Pvalue'] = pvalue
            x_res['Test'] = 'Shapiro-Wilk Test'
            x_res['Sample Size'] = x.shape
        else:  # Use Anderson-Darling for samples smaller than 50
            ad_result = ss.anderson(x)
            x_res['Statistic'] = ad_result.statistic
            x_res['Critical'] = ad_result.critical_values[2]  # critical value at the 5% level
            x_res['Test'] = 'Anderson-Darling Test'
            x_res['Sample Size'] = x.shape

        if x_res['Test'] != 'Anderson-Darling Test':
            if x_res['Pvalue'] < .05:  # Fixed significance level
                x_res['Result'] = False
            else:
                x_res['Result'] = True
        else:  # Anderson-Darling result has to be handled specially
            if x_res['Critical'] < x_res['Statistic']:
                x_res['Result'] = False
            else:
                x_res['Result'] = True

        return x_res
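A minimal sketch of the Anderson-Darling branch above, assuming ss is scipy.stats (matching the calls in the method): for small samples the decision compares the statistic with the 5% critical value instead of a p-value.

import numpy as np
import scipy.stats as ss

rng = np.random.default_rng(0)
small_sample = rng.normal(size=40)            # fewer than 50 observations -> Anderson-Darling branch

ad_result = ss.anderson(small_sample, dist='norm')
statistic = ad_result.statistic               # scalar A-D statistic
critical_5pct = ad_result.critical_values[2]  # critical value at the 5% significance level
print('normality rejected:', statistic > critical_5pct)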
Example #5
def compute_jarque_bera(arr):
    """
    H_0 : the distribution is normal (at the 99% confidence level)
    H_1 : the distribution is not normal (at the 99% confidence level)

    - checks whether a distribution has skewness and kurtosis matching those of a normal distribution
    - the statistic is non-negative: the farther from zero, the more the distribution deviates from normality
    """
    value, p_value = jarque_bera(arr)
    print("The Jarque-Bera test statistic value is", value,
          "with p-value", p_value)
Example #6
 def verificar_distribuicao_normal(self, arr, p_value=0.05):
     """
     Check whether a distribution is normal using the Jarque-Bera test.

     Parameters:
         arr: list or array
         p_value: float, significance level (default 0.05)
     Returns:
         (p, normal): the test p-value and a bool that is True when the
         normality hypothesis is not rejected (p >= significance level)
     """
     p = sct.jarque_bera(arr)[1]
     return (p, bool(p >= p_value))
def test_calcula_pvalue():
    # We cannot claim that FRECUENCIAS is normal
    jb_frec = calcula_jarque_bera(FRECUENCIAS)
    print("Pvalue - Frecuencias: ", calcula_pvalue(jb_frec, valores_chi2_2, pvalues_chi2_2))
    # We cannot claim that PESOS is normal
    jb_peso = calcula_jarque_bera(PESOS)
    print("Pvalue - Pesos: ", calcula_pvalue(jb_peso, valores_chi2_2, pvalues_chi2_2))
    # We can claim that SINTETICA is normal at the 0.95 confidence level
    jb_sint = calcula_jarque_bera(SINTETICA)
    print("Pvalue - Sintética: ", calcula_pvalue(jb_sint, valores_chi2_2, pvalues_chi2_2))
    # Jarque-Bera statistic and p-value computed with the SciPy implementation
    print("JarqueBera/Pvalue (Scipy) - Frecuencias: ", jarque_bera(FRECUENCIAS))
    print("JarqueBera/Pvalue (Scipy) - Pesos: ", jarque_bera(PESOS))
    print("JarqueBera/Pvalue (Scipy) - Sintética: ", jarque_bera(SINTETICA))
Example #8
def jb_calculation(symbolIdx, filesIdx):
    # add location variables; these I still need to figure out how to abstract away
    print(symbols[symbolIdx])
    procsdSymbolFolder = os.path.join(elements, symbols[symbolIdx])
    print(procsdSymbolFolder)
    files = sorted(os.listdir(procsdSymbolFolder))

    fileLocation = os.path.join(procsdSymbolFolder, files[filesIdx])
    print(fileLocation)

    # load the pickle once and pick the various bar dictionaries
    bar_data = open_pickle_filepath(fileLocation)
    volume_bar_dict = bar_data[bars[0]]
    calendar_bar_dict = bar_data[bars[1]]
    usd_volume_bar_dict = bar_data[bars[2]]
    tick_bar_dict = bar_data[bars[3]]
    # get the dataframes

    volume_bar_df = volume_bar_dict[list(volume_bar_dict.keys())[0]]
    calendar_bar_df = calendar_bar_dict[list(calendar_bar_dict.keys())[0]]
    usd_volume_df = usd_volume_bar_dict[list(usd_volume_bar_dict.keys())[0]]
    tick_bar_df = tick_bar_dict[list(tick_bar_dict.keys())[0]]
    # returns

    vb_ret = returns(volume_bar_df.micro_price_close).replace(
        [np.inf, -np.inf], 0)  # volume
    tb_ret = returns(tick_bar_df.micro_price_close).replace([np.inf, -np.inf],
                                                            0)  # tick
    usdvb_ret = returns(usd_volume_df.micro_price_close).dropna().replace(
        [np.inf, -np.inf], 0)  # usd volume
    cb_ret = returns(calendar_bar_df.micro_price_close).dropna().replace(
        [np.inf, -np.inf], 0)  # calendar
    # calculating JB statistic
    jb_value_tick, _ = jarque_bera(tb_ret)
    jb_value_vol, _ = jarque_bera(vb_ret)
    jb_value_dollar, _ = jarque_bera(usdvb_ret)
    jb_value_calendar, _ = jarque_bera(cb_ret)

    jb_test_df = pd.DataFrame(data={
        'jarque_bera_results':
        [jb_value_tick, jb_value_vol, jb_value_dollar, jb_value_calendar]
    },
                              index=['tick', 'vol', 'dollar', 'calendar'])
    pickle_out_returns = os.path.join(
        experimentsLocation, "".join(
            (str(symbols[symbolIdx]), "_" + str(filesIdx) + "_jb_stats.pkl")))
    pickle.dump(jb_test_df,
                open(pickle_out_returns, 'wb'),
                protocol=pickle.HIGHEST_PROTOCOL)

    print("produced and saved JB stats for :", symbols[symbolIdx], filesIdx)
Example #9
 def decompose_stl(self, keyword, robust=False):
     """
     Decomposition by STL LOESS
     :param robust: robust estimation orr not
     :param keyword: keyword to be used
     """
     ts = self.time_series(keyword)
     ts_bc = self.time_series_box_cox(keyword)
     decomp_add = seasonal.STL(ts, robust=robust).fit()
     decomp_mult = seasonal.STL(ts_bc, robust=robust).fit()
     if stats.jarque_bera(decomp_add.resid).pvalue > stats.jarque_bera(decomp_mult.resid).pvalue:
         self.decomposition = decomp_add
     else:
         self.decomposition = decomp_mult
     return self.decomposition
Example #10
def q2():
    # Return the result of question 2 here.
    alpha = 0.05
    amostra = get_sample(athletes, 'height', 3000)

    print('Result: ', sct.jarque_bera(amostra))

    p = sct.jarque_bera(amostra)[1]

    return bool(p > alpha)
def normal(x):
    print('Shapiro-Wilk p =', stats.shapiro(x)[1])
    print('Jarque-Bera p =', stats.jarque_bera(x)[1])
    print('QQ plot')
    qqplot(x, line='s')
    pyplot.show()
    return 0
def JBtest(x):
    # sample size n
    n = x.size
    x_ = x - x.mean()
    """
    M2: second central moment
    skew (skewness) = third central moment divided by M2^1.5
    krut (kurtosis) = fourth central moment divided by M2^2
    """
    M2 = np.mean(x_**2)

    skew = np.mean(x_**3) / M2**1.5

    krut = np.mean(x_**4) / M2**2
    """
    Compute the JB statistic and set up the hypothesis test
    """
    JB_s = n * (skew**2 / 6 + (krut - 3)**2 / 24)
    JB_p = 1 - stats.chi2.cdf(JB_s, df=2)

    print("Skewness:", stats.skew(x), skew)

    print("Kurtosis:", stats.kurtosis(x) + 3, krut)

    print("JB test:", stats.jarque_bera(x))
    res = pd.DataFrame(
        [['skew', 'krut', 'Statistic', 'Sig'], [skew, krut, JB_s, JB_p]],
        index=["Normality test", 'JB test'])
    return res
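A brief usage sketch, assuming numpy as np, pandas as pd, and scipy.stats as stats are imported as the snippet's calls suggest: the hand-computed statistic in the returned frame should match the stats.jarque_bera value printed inside the function.

import numpy as np

rng = np.random.default_rng(1)
print(JBtest(rng.standard_t(df=5, size=10_000)))  # heavy tails: large JB statistic, Sig near 0
print(JBtest(rng.normal(size=10_000)))            # close to normal: small statistic, Sig well above 0.05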
Example #13
def q2():

    # Just like sct.shapiro(), sct.jarque_bera() returns a tuple with two values (test statistic, p-value)
    p_value = sct.jarque_bera(height_sample)[1]
    alpha = 0.05

    return bool(p_value >= alpha)
Example #14
def q2():
    """
        #H0: A amostra tem distribuição normal
        Se p-valor < alpha, rejeita-se H0
        Se p-valor > alpha, não é possível rejeitar H0, a distribuição é normal
    """
    return sct.jarque_bera(sample_height)[1] > 0.05
Example #15
def q2():
    # Return the result of question 2 here.
    # Jarque-Bera test
    jarque = sct.jarque_bera(altura)
    # Compare the p-value with the 5% significance level
    return bool(jarque[1] > 0.05)
Example #16
def main():
    # read the data
    data = pd.read_csv("CC GENERAL.csv")
    data.loc[data['MINIMUM_PAYMENTS'].isnull(), 'MINIMUM_PAYMENTS'] = data['MINIMUM_PAYMENTS'].median()
    data.loc[data['CREDIT_LIMIT'].isnull(), 'CREDIT_LIMIT'] = data['CREDIT_LIMIT'].median()
    data = data.drop(columns=['CUST_ID'])

    names = data.columns.tolist()

    # normalize the data
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data)
    data_scaled = pd.DataFrame(data_scaled, columns=names)

    # apply Jarque-Bera Test
    print("Jarque-Bera Test:")
    for i in range(len(names)):
        X = data_scaled[names[i]]
        jb_value, p_value = jarque_bera(X)
        print("{} for {} feature, test value is {} and p-value is {}".format(i+1, names[i], jb_value, p_value))

    print("\n")

    # apply Anderson Test
    print("Anderson Test:")
    for i in range(len(names)):
        X = data_scaled[names[i]]
        a = anderson(X, dist='norm')
        print("for {} feature, test value is {}".format(names[i], a))
    print("\n")
Example #17
 def __analysis_index(self):
     index = self.get_env().query_data(Index_Data).get_data_serise()
     index_name = list(index.columns)
     index_name.remove(COM_DATE)
     index[index_name] = index[index_name].pct_change()/100
     index[index_name] = np.log(index[index_name]+1)
     index = index.set_index(COM_DATE)
     index.index = pd.to_datetime(index.index)
     res = pd.DataFrame(columns = ['mean','std','skew','kurt','jarque-Bera','adf','lm'])
     for index_name_ in index_name:
         fig, ax = plt.subplots()
         ax.plot(index[index_name_].dropna(), label=index_name_)
         ax.set_xlabel('Time')
         ax.set_ylabel('Log return')
         ax.set_title(index_name_ + ' return series')
         ax.legend()
         plt.savefig(os.path.join(RESULTS, index_name_+'.png'))
         plt.close()
         fig, ax = plt.subplots()
         ax.hist(index[index_name_].dropna(),bins =25)
         ax.set_xlabel('Return range')
         ax.set_ylabel('Frequency')
         ax.set_title(index_name_ + ' return histogram')
         plt.savefig(os.path.join(RESULTS, index_name_+'bar.png'))
         plt.close()
         res.loc[index_name_] = [
             np.nanmean(index[index_name_].dropna()),
             np.nanstd(index[index_name_].dropna()),
             index[index_name_].dropna().skew(),
             index[index_name_].dropna().kurt(),
             stats.jarque_bera(index[index_name_].dropna())[0],
             adfuller(index[index_name_].dropna())[4]['5%'],
             q_stat(acf(index[index_name_].dropna())[1:13],len(index[index_name_].dropna()))[1][-1]
         ]
     res.to_csv(os.path.join(RESULTS,'index_info.csv'))
Example #18
def get_rf(frequency='daily', descriptives=False):
    # Give the location of the file
    script_path = os.getcwd()
    os.chdir(script_path)

    # Assign spreadsheet filename to `file`
    file = './data/RF_' + frequency + '.csv'

    # Load spreadsheet into dataframe
    df = pd.read_csv(file, header=0, index_col=0)
    df.index = pd.to_datetime(df.index)
    rf = df.filter(items=['rf'])
    mktrf = df.filter(items=['mktrf'])

    # Compute descriptive statistics if desired
    if descriptives:
        print(rf.min())
        print(rf.max())
        print(rf.mean())
        print(rf.var())
        print(rf.skew())
        print(rf.kurtosis())
        print(stats.jarque_bera(rf))

    return mktrf, rf
Example #19
def q2():
    alpha = 0.05
    a = get_sample(athletes, 'height', n=3000)
    if (sct.jarque_bera(sct.zscore(a))[1] <= alpha):
        return False
    else:
        return True
Example #20
def normalityCheckJB(sampleList):
    jbTest = stats.jarque_bera(sampleList)

    if jbTest[1] > 0.05:
        return "consistent with a normal distribution"
    else:
        return "not consistent with a normal distribution"
Example #21
def normality_of_residuals_test(model):
    '''
    Function for drawing the normal QQ-plot of the residuals and running 4 statistical tests to
    investigate the normality of the residuals.

    Arg:
    * model - fitted OLS model from statsmodels
    '''
    sm.ProbPlot(model.resid).qqplot(line='s')
    plt.title('Q-Q plot')

    jb = stats.jarque_bera(model.resid)
    sw = stats.shapiro(model.resid)
    ad = stats.anderson(model.resid, dist='norm')
    ks = stats.kstest(model.resid, 'norm')

    print(f'Jarque-Bera test ---- statistic: {jb[0]:.4f}, p-value: {jb[1]}')
    print(
        f'Shapiro-Wilk test ---- statistic: {sw[0]:.4f}, p-value: {sw[1]:.4f}')
    print(
        f'Kolmogorov-Smirnov test ---- statistic: {ks.statistic:.4f}, p-value: {ks.pvalue:.4f}'
    )
    print(
        f'Anderson-Darling test ---- statistic: {ad.statistic:.4f}, 5% critical value: {ad.critical_values[2]:.4f}'
    )
    print(
        'If the returned AD statistic is larger than the critical value, then for the 5% significance level, the null hypothesis that the data come from the Normal distribution should be rejected. '
    )
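A minimal usage sketch, assuming the aliases the snippet relies on (sm for statsmodels.api, stats for scipy.stats, plt for matplotlib.pyplot) are already imported: fit a small OLS model and pass it in.

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = sm.add_constant(rng.normal(size=(300, 2)))
y = X @ np.array([1.0, 0.5, -0.3]) + rng.normal(scale=0.2, size=300)

model = sm.OLS(y, X).fit()
normality_of_residuals_test(model)  # QQ-plot plus Jarque-Bera, Shapiro-Wilk, KS and AD printouts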
def q2():
    sample = get_sample(athletes, 'height', n=3000)
    stat, p = sct.jarque_bera(sample)
    alpha = 0.05
    if p > alpha:
        return True
    return False
Example #23
def q2():
    # Return the result of question 2 here.
    df = get_sample(athletes, 'height', n=3000)

    (jb_value, pvalue) = sct.jarque_bera(df)

    return bool(pvalue >= 0.05)
Example #24
def q2():
    height_sample = get_sample(athletes, 'height', 3000)

    if (sct.jarque_bera(height_sample)[1] > 0.05):
        return True
    else:
        return False
Example #25
def normality_of_residuals_test(model):
    '''
    Function for drawing the normal QQ-plot of the residuals and running 4 statistical tests to 
    investigate the normality of residuals.
    
    Arg:
    * model - fitted OLS model from statsmodels
    '''

    sm.ProbPlot(model.resid).qqplot(line='s')
    plt.title('Q-Q Plot')

    jb = stats.jarque_bera(model.resid)
    sw = stats.shapiro(model.resid)
    ad = stats.anderson(model.resid, dist='norm')
    ks = stats.kstest(model.resid, 'norm')

    print(f'Jarque_Bera test ---- statistic: {jb[0]:.4f}, p-value: {jb[1]}')
    print(
        f'Shapiro_Wilk test ---- statistic: {sw[0]:.4f}, p-value: {sw[1]:.4f}')
    print(
        f'Kolmogorov_Smirnov test ---- statistic: {ks.statistic:.4f}, p-value: {ks.pvalue:.4f}'
    )
    print(
        f'Anderson_Darling test ---- statistic: {ad.statistic:.4f}, 5% critical value: {ad.critical_values[2]:.4f}'
    )
def check_lr_assumptions(df, data_fe):
    """
    prints multiple statistical tests and returns a dataframe containing residuals
    
    arguments
    ---------
    df: dataframe of truth and prediction columns labeled "truth" and "pred"
    data_fe: prepared features for prediction
    
    return
    ------
    dataframe
    
    """
    
    df['residuals'] = df['pred'] - df['truth']
    
    print("mean of residuals:", df['residuals'].mean())
    print("variance of residuals:", df['residuals'].var())
    print("skewness of residuals:", stats.skew(df.residuals))
    print("kurtosis of residuals:", stats.kurtosis(df.residuals))
    print("kurtosis test of residuals:", stats.kurtosistest(df.residuals))
    print("normal test of residuals (scipy stats):", stats.normaltest(df.residuals))
    print("Jarque Bera test for normality of residuals:", stats.jarque_bera(df.residuals))
    print("Breusch Pagan test for heteroscedasticity:", het_breuschpagan(df.residuals, data_fe))

    return df
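A hypothetical usage sketch, under the assumptions that stats is scipy.stats and het_breuschpagan comes from statsmodels.stats.diagnostic (neither import is shown above); the toy data and the "truth"/"pred" column names follow the docstring.

import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from statsmodels.stats.diagnostic import het_breuschpagan

rng = np.random.default_rng(0)
data_fe = sm.add_constant(pd.DataFrame({"x1": rng.normal(size=200)}))  # features incl. intercept
truth = 2.0 + 3.0 * data_fe["x1"] + rng.normal(size=200)

model = sm.OLS(truth, data_fe).fit()
df = pd.DataFrame({"truth": truth, "pred": model.predict(data_fe)})

residual_df = check_lr_assumptions(df, data_fe)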
Example #27
def normality_tests(data_values):
    """

    :param data_values: values of returns in our case
    :return: print out a series of outcomes of whether the data fits a normal distribution or not!
    """

    stat, p = shapiro(data_values)
    print('stat = %.3f, p = %.3f\n ' % (stat, p))
    if p > 0.05:
        print('prob gaussian')
    else:
        print('non gaussian')

    stat_nt, p_nt = normaltest(data_values)

    print('stat = %.3f, p = %.3f\n ' % (stat_nt, p_nt))

    stat_jb, p_jb = jarque_bera(data_values)

    print('stat = %.3f, p = %.3f\n ' % (stat_jb, p_jb))

    if p_jb > 0.05:
        print('prob gaussian')
    else:
        print('non gaussian')
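A short usage sketch, assuming shapiro, normaltest, and jarque_bera are imported from scipy.stats as the calls above suggest.

import numpy as np

returns = np.random.default_rng(0).normal(loc=0.0, scale=0.01, size=1_000)
normality_tests(returns)  # the Shapiro and Jarque-Bera branches should both print 'prob gaussian'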
Example #28
def q2():
    # Return the result of question 2 here.
    sample_height = get_sample(athletes, 'height', n=3000)
    JB_and_pvalue_height = sct.jarque_bera(sample_height)
    pvalue_height = JB_and_pvalue_height[1]
    alpha = 0.05
    return bool(pvalue_height > alpha)
Example #29
def q2():

    height_athletes = get_sample(athletes, 'height', 3000)

    jarque_p = sct.jarque_bera(height_athletes)[1]
    alpha = 0.05

    return bool(jarque_p > alpha)
Example #30
File: Stats.py  Project: parcap/nrp
 def jb_test(self, r: pd.Series, mode="stat"):
     self.check_instance(r, "pd.Series", "jb_test")
     r = r[~pd.isnull(r)]
     try:
         stat, p = sps.jarque_bera(r)
     except Exception:
         stat, p = (np.nan, np.nan)
     return stat if mode == "stat" else p