def test_omni_normtest():
    # Reference values computed with R's fBasics::dagoTest.
    from scipy import stats

    expected = np.array(
        [[3.994138321207883, -1.129304302161460, 1.648881473704978],
         [0.1357325110375005, 0.2587694866795507, 0.0991719192710234]])

    assert_almost_equal(omni_normtest(x), expected[:, 0], 14)
    assert_almost_equal(stats.skewtest(x), expected[:, 1], 14)
    assert_almost_equal(stats.kurtosistest(x), expected[:, 2], 11)

    expected = np.array(
        [[34.523210399523926, 4.429509162503833, 3.860396220444025],
         [3.186985686465249e-08, 9.444780064482572e-06,
          1.132033129378485e-04]])

    squared = x**2
    # TODO: fix precision in these test with relative tolerance
    assert_almost_equal(omni_normtest(squared), expected[:, 0], 12)
    assert_almost_equal(stats.skewtest(squared), expected[:, 1], 12)
    assert_almost_equal(stats.kurtosistest(squared), expected[:, 2], 12)
def test_omni_normtest():
    # tests against R fBasics
    from scipy import stats

    st_pv_R = np.array(
        [[3.994138321207883, -1.129304302161460, 1.648881473704978],
         [0.1357325110375005, 0.2587694866795507, 0.0991719192710234]])
    for func, col, decimal in ((omni_normtest, 0, 14),
                               (stats.skewtest, 1, 14),
                               (stats.kurtosistest, 2, 11)):
        assert_almost_equal(func(x), st_pv_R[:, col], decimal)

    st_pv_R = np.array(
        [[34.523210399523926, 4.429509162503833, 3.860396220444025],
         [3.186985686465249e-08, 9.444780064482572e-06,
          1.132033129378485e-04]])
    x2 = x**2
    # TODO: fix precision in these test with relative tolerance
    for func, col in ((omni_normtest, 0),
                      (stats.skewtest, 1),
                      (stats.kurtosistest, 2)):
        assert_almost_equal(func(x2), st_pv_R[:, col], 12)
def pd_reducer_ratios(pt, nan_policy='raise', **kwargs):
    '''
    Takes a pandas dataframe with s_date, price & research columns

    Top tip: use the _rs_to_ptbl(recordset) to convert your query (with the
    fields s_date, s_type and s_val) into a valid pandas pivot table :)

    input = [
        {'date':'2017-01-01', 'x':12.1, 'y':22  },
        {'date':'2017-01-02', 'x':13.7, 'y':32.2},
        {'date':'2017-01-03', 'x':11.7, 'y':12.8},
    ]

    Returns a dict.
    '''
    # axis==1 is column and axis==0 is row for all pandas operations requiring
    fn = "pandas_reducer_ratios"
    reducer_suite = __name__.split('.')[-1]
    try:
        from scipy import stats
        import pandas as pd  # noqa: F401 -- imported to verify availability
    except ImportError:
        # FIX: was a bare `except:` and the '{}' placeholder was never
        # filled -- catch only ImportError and format the function name in.
        raise ImportError('{} needs pandas and scipy'.format(fn))

    # Enforce our default in case it gets out of hand.
    if nan_policy not in nan_policies:
        raise AttributeError(
            'nan_policy {} not accepted - try omit, raise or propagate'.format(
                nan_policy))

    # FIX: run each scipy test once and reuse the (statistic, pvalue) pair
    # instead of recomputing every test twice.
    prx_kurt = stats.kurtosistest(pt.price, nan_policy=nan_policy)
    prx_skew = stats.skewtest(pt.price, nan_policy=nan_policy)
    rsch_kurt = stats.kurtosistest(pt.research, nan_policy=nan_policy)
    rsch_skew = stats.skewtest(pt.research, nan_policy=nan_policy)

    output = {}
    output['prx_mean'] = pt.price.mean()
    output['prx_kurtosis_st'] = prx_kurt[0]
    output['prx_kurtosis_pv'] = prx_kurt[1]
    output['prx_skewtest_st'] = prx_skew[0]
    output['prx_skewtest_pv'] = prx_skew[1]
    output['prx_corr'] = pt.price.corr(pt.research)
    output['rsch_mean'] = pt.research.mean()
    output['rsch_kurtosis_st'] = rsch_kurt[0]
    output['rsch_kurtosis_pv'] = rsch_kurt[1]
    output['rsch_skewtest_st'] = rsch_skew[0]
    output['rsch_skewtest_pv'] = rsch_skew[1]
    output['rsch_corr'] = pt.research.corr(pt.price)
    # Ratio of the two kurtosis test statistics (price vs research).
    output['kurtosis_ratios'] = (
        float(output['prx_kurtosis_st']) / output['rsch_kurtosis_st'])
    return output
def quick_perf_st(x, y, freq, rf):
    ''' Comprehensive Performance Analysis after running run_strategy() class method'''
    sx = x.dropna()
    sy = y.dropna()

    def _column(series, bench):
        # One full statistics column; `bench` only feeds the information ratio.
        return [profit_loss(series), pl_CAGR(series), an_vol(series, freq),
                positive_per(series), s.skew(series), s.kurtosis(series),
                s.kurtosistest(series)[1], an_down_vol(series, freq),
                series.min(), sharpe(series, freq, rf),
                sortino(series, freq, rf), info_ratio(series, bench, freq),
                max_draw(series), worst3_draw_avg(series),
                max_dd_duration(series)]

    table = {
        'Statistics': ['P&L', 'CAGR', 'Anual_Vol', '%_Positive', 'Skew',
                       'Kurtosis', 'Kurtosis PV', 'Downside_Vol', 'Worst',
                       'Sharpe_Ratio', 'Sortino_Ratio', 'Information_Ratio',
                       'Max_Drawdown', 'Worst_3_drawdown_avg',
                       'Max_DD_Duration'],
        'Strategy': _column(sx, sy),
        # Benchmark's information ratio is measured against itself.
        'Benchmark': _column(sy, sy),
    }
    result = pd.DataFrame(table)
    result.set_index('Statistics', inplace=True)
    return result.round(4)
def get_stats(a):
    """Computes mean, D_T or D_R, and standard error for a list.

    Also reports skew/kurtosis and their normality-test statistics
    (NaNs are omitted).  Prints a formatted summary and returns the dict.
    """
    a = np.asarray(a)
    n = a.shape[-1]
    keepdims = a.ndim > 1
    M = np.nanmean(a, -1, keepdims=keepdims)
    # Sample variance (ddof=1) along the last axis.
    variance = np.nanvar(a, -1, keepdims=keepdims, ddof=1)
    SE = np.sqrt(variance) / sqrt(n - 1)
    SK = skew(a, -1, nan_policy='omit')
    KU = kurtosis(a, -1, nan_policy='omit')
    SK_t = skewtest(a, -1, nan_policy='omit')
    KU_t = kurtosistest(a, -1, nan_policy='omit')
    if keepdims:
        SK = SK[..., None]
        KU = KU[..., None]
    else:
        SK = float(SK)
        KU = float(KU)
    stat = {
        'mean': M,
        'var': variance,
        'std': SE,
        'skew': SK,
        'skew_test': float(SK_t.statistic),
        'kurt': KU,
        'kurt_test': float(KU_t.statistic)
    }
    # FIX: Python 2 `print` statement replaced with a py3 print() call.
    print('\n'.join(['{:>10}: {: .4f}'.format(k, v) for k, v in stat.items()]))
    return stat
def test_maskedarray_input(self): # Add some masked values, test result doesn't change x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2 xm = np.ma.array(np.r_[np.inf, x, 10], mask=np.r_[True, [False] * x.size, True]) assert_allclose(mstats.normaltest(xm), stats.normaltest(x)) assert_allclose(mstats.skewtest(xm), stats.skewtest(x)) assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
def normality_check(feature_group, output_path):
    """Run skew, kurtosis and omnibus normality tests on the group's scores.

    Returns False when the group is empty, otherwise a 3-tuple of booleans
    (skew ok, kurtosis ok, normaltest ok) at the 0.05 significance level.
    `output_path` is currently unused but kept for interface compatibility.
    """
    if feature_group.isEmpty():
        return False
    # FIX: fetch the scores once instead of three times; dropped the unused
    # `normal_flag` local; converted the py2 `print` statement to print().
    scores = feature_group.get_scores()
    sk_test = stats.skewtest(scores)
    kr_test = stats.kurtosistest(scores)
    normaltest = stats.normaltest(scores)
    temp = '''
    Normality Test P-Values
    ------------------------------------
    Kurtosis   | {0}
    Skewness   | {1}
    NormalTest | {2}
    '''
    result = temp.format(kr_test[1], sk_test[1], normaltest[1])
    print(result)
    tests = (sk_test[1] > 0.05, kr_test[1] > 0.05, normaltest[1] > 0.05)
    return tests
def print_market_information(benchmark):
    """Print distribution statistics for the benchmark's daily log returns."""
    r = benchmark['returns']
    sep = "---------------------------------------------"
    print("RETURN BENCHMARK STATISTICS")
    print(sep)
    print("Mean of Daily Log Returns %9.6f" % np.mean(r))
    print("Std of Daily Log Returns %9.6f" % np.std(r))
    print("Mean of Annua. Log Returns %9.6f" % (np.mean(r) * 252))
    print("Std of Annua. Log Returns %9.6f" % (np.std(r) * math.sqrt(252)))
    print(sep)
    print("Skew of Sample Log Returns %9.6f" % scs.skew(r))
    print("Skew Normal Test p-value %9.6f" % scs.skewtest(r)[1])
    print(sep)
    print("Kurt of Sample Log Returns %9.6f" % scs.kurtosis(r))
    print("Kurt Normal Test p-value %9.6f" % scs.kurtosistest(r)[1])
    print(sep)
    print("Normal Test p-value %9.6f" % scs.normaltest(r)[1])
    print(sep)
    print("Anderson Normality Test: ")
    print(stats.anderson(r))
    return
def plot_indices_returns_distribution(self):
    """Plot a 4x2 grid of index-return histograms with fitted normal overlays.

    Each panel shows the return histogram, the fitted N(mu, sigma) density,
    and a title carrying mu, sigma and the kurtosis-test p-value.
    """
    fig, axes = plt.subplots(nrows=4, ncols=2, sharex=True, sharey=False,
                             figsize=(12, 12))
    for i in range(4):
        for j in range(2):
            ts = self.__indices[2 * i + j].get_index_returns()
            # Statistics calculations
            ts = ts.dropna()
            mu, std = norm.fit(ts)
            kurtosis = stats.kurtosistest(ts).pvalue
            axes[i, j].axis('off')
            axes[i, j].set_title(self.__indices_names[2 * i + j])
            # FIX: `normed` was removed in Matplotlib 3.x -> use `density`.
            axes[i, j].hist(ts.dropna(), bins=30, density=True, color='red')
            xmin, xmax = axes[i, j].get_xlim()
            x = np.linspace(xmin, xmax, 100)
            p = norm.pdf(x, mu, std)
            # FIX: alpha must be a float; the string '0.7' raises in
            # current Matplotlib.
            axes[i, j].fill_between(x, 0, p, color='grey', alpha=0.7)
            axes[i, j].plot(x, p, 'k', linewidth=2)
            title = "%s, mu=%.2f, sigma=%.2f, kurt_pv=%.2f" % (
                self.__indices_names[2 * i + j], mu, std, kurtosis)
            axes[i, j].set_title(title)
    plt.suptitle("Distribution of indices returns ")
def get_stats(a):
    """Computes mean, D_T or D_R, and standard error for a list.

    NaNs are omitted from the moment statistics; the summary is printed
    and the statistics dict returned.
    """
    a = np.asarray(a)
    n = a.shape[-1]
    keepdims = a.ndim > 1
    M = np.nanmean(a, -1, keepdims=keepdims)
    # Sample (ddof=1) variance along the last axis.
    variance = np.nanvar(a, -1, keepdims=keepdims, ddof=1)
    SE = np.sqrt(variance)/sqrt(n - 1)
    SK = skew(a, -1, nan_policy='omit')
    KU = kurtosis(a, -1, nan_policy='omit')
    SK_t = skewtest(a, -1, nan_policy='omit')
    KU_t = kurtosistest(a, -1, nan_policy='omit')
    if keepdims:
        SK = SK[..., None]
        KU = KU[..., None]
    else:
        SK = float(SK)
        KU = float(KU)
    stat = {'mean': M, 'var': variance, 'std': SE,
            'skew': SK, 'skew_test': float(SK_t.statistic),
            'kurt': KU, 'kurt_test': float(KU_t.statistic)}
    # FIX: Python 2 `print` statement converted to a py3 print() call.
    print('\n'.join(['{:>10}: {: .4f}'.format(k, v) for k, v in stat.items()]))
    return stat
def normality_check(feature_group, output_path):
    """Skew / kurtosis / omnibus normality tests for a feature group.

    Returns False for an empty group, else a 3-tuple of booleans
    (skew ok, kurtosis ok, normaltest ok) at alpha = 0.05.
    `output_path` is unused but retained for interface compatibility.
    """
    if feature_group.isEmpty():
        return False
    # FIX: fetch scores once (was three calls), removed the dead
    # `normal_flag` local, and replaced the py2 `print` statement.
    scores = feature_group.get_scores()
    sk_test = stats.skewtest(scores)
    kr_test = stats.kurtosistest(scores)
    normaltest = stats.normaltest(scores)
    temp = '''
    Normality Test P-Values
    ------------------------------------
    Kurtosis   | {0}
    Skewness   | {1}
    NormalTest | {2}
    '''
    result = temp.format(kr_test[1], sk_test[1], normaltest[1])
    print(result)
    tests = (sk_test[1] > 0.05, kr_test[1] > 0.05, normaltest[1] > 0.05)
    return tests
def check_lr_assumptions(df, data_fe):
    """
    prints multiple statistical tests and returns a dataframe containing residuals

    arguments
    ---------
    df: dataframe of truth and prediction columns labeled "truth" and "pred"
    data_fe: prepared features for prediction

    return
    ------
    dataframe
    """
    df['residuals'] = df['pred'] - df['truth']
    resid = df.residuals
    print("mean of residuals:", df['residuals'].mean())
    print("variance of residuals:", df['residuals'].var())
    print("skewness of residuals:", stats.skew(resid))
    print("kurtosis of residuals:", stats.kurtosis(resid))
    print("kurtosis test of residuals:", stats.kurtosistest(resid))
    print("normal test of residuals (scipy stats):", stats.normaltest(resid))
    print("Jarque Bera test for normality of residuals:",
          stats.jarque_bera(resid))
    print("Breusch Pagan test for heteroscedasticity:",
          het_breuschpagan(resid, data_fe))
    return df
def print_statistics(data):
    """Print return-sample statistics plus realized volatility/variance."""
    r = data['returns']
    sep = "---------------------------------------------"
    print("RETURN SAMPLE STATISTICS")
    print(sep)
    print("Mean of Daily Log Returns %9.6f" % np.mean(r))
    print("Std of Daily Log Returns %9.6f" % np.std(r))
    print("Mean of Annua. Log Returns %9.6f" % (np.mean(r) * 252))
    print("Std of Annua. Log Returns %9.6f" % (np.std(r) * math.sqrt(252)))
    print(sep)
    print("Skew of Sample Log Returns %9.6f" % scs.skew(r))
    print("Skew Normal Test p-value %9.6f" % scs.skewtest(r)[1])
    print(sep)
    print("Kurt of Sample Log Returns %9.6f" % scs.kurtosis(r))
    print("Kurt Normal Test p-value %9.6f" % scs.kurtosistest(r)[1])
    print("Normal Test p-value %9.6f" % scs.normaltest(r)[1])
    print(sep)
    print("Realized Volatility %9.6f" % data['rea_vol'].iloc[-1])
    print("Realized Variance %9.6f" % data['rea_var'].iloc[-1])
def ica_experiment(X, name, dims, max_iter=5000, tol=1e-04):
    """Run ICA on specified dataset and saves mean kurtosis results as CSV file.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (list(int)): List of component number values.

    """
    ica = FastICA(random_state=0, max_iter=max_iter, tol=tol)
    kurt, loss = [], []
    X = StandardScaler().fit_transform(X)
    for n_comp in dims:
        print(n_comp)
        ica.set_params(n_components=n_comp)
        transformed = ica.fit_transform(X)
        df = pd.DataFrame(transformed)
        df = df.kurt(axis=0)
        # Mean kurtosis-test statistic over all recovered components.
        kurt.append(kurtosistest(transformed).statistic.mean())
        reconstructed = ica.inverse_transform(transformed)
        loss.append(((X - reconstructed) ** 2).mean())
    res = pd.DataFrame({"kurtosis": kurt, "loss": loss})

    # save results as CSV
    resdir = 'results/ICA'
    resfile = get_abspath('{}_kurtosis.csv'.format(name), resdir)
    res.to_csv(resfile, index_label='n')
def test_normalitytests():
    # numbers verified with R: dagoTest in package fBasics
    expected = {'normal': (3.92371918, 0.14059673),
                'skew': (1.98078826, 0.04761502),
                'kurt': (-0.01403734, 0.98880019)}
    sample = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2
    yield assert_array_almost_equal, stats.normaltest(sample), expected['normal']
    yield assert_array_almost_equal, stats.skewtest(sample), expected['skew']
    yield assert_array_almost_equal, stats.kurtosistest(sample), expected['kurt']
def doStatTests(X, labels, ml):
    """Relabel clusters as noise (-1) when too small or significantly skewed.

    Fits a 1-component PCA on the ``ml`` cluster and projects every cluster
    onto it; clusters with fewer than 9 points or a skew-test p-value above
    0.01 are marked -1.

    NOTE(review): ``labelsOut = labels`` aliases the argument, so the
    caller's ``labels`` array is modified in place -- confirm intended.
    """
    pca = PCA(n_components=1)
    pca.fit(X[labels == ml])
    XpcaML = pca.transform(X[labels == ml])
    labelsOut = labels
    normXpcaML = (XpcaML - n.mean(XpcaML)) / n.std(XpcaML)  # kept for parity; unused
    for cluster in n.unique(labels):
        if len(X[labels == cluster]) == 0:
            continue
        proj = pca.transform(X[labels == cluster])
        proj = (proj - n.mean(proj)) / n.std(proj)
        if len(proj) < 9:
            # Too few points for a meaningful test: mark as noise.
            labelsOut[labels == cluster] = -1
            continue
        if cluster == ml:
            continue
        if skewtest(proj)[1] > 0.01:  # or kurtosistest(proj)[1] > 1.
            labelsOut[labels == cluster] = -1
    return labelsOut
def test_maskedarray_input(self): # Add some masked values, test result doesn't change x = np.array((-2,-1,0,1,2,3)*4)**2 xm = np.ma.array(np.r_[np.inf, x, 10], mask=np.r_[True, [False] * x.size, True]) assert_allclose(mstats.normaltest(xm), stats.normaltest(x)) assert_allclose(mstats.skewtest(xm), stats.skewtest(x)) assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
def create_scipy_features(base_features, sentinel):
    r"""Calculate the skew, kurtosis, and other statistical features
    for each row.

    Parameters
    ----------
    base_features : numpy array
        The feature dataframe.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    sp_features : numpy array
        The calculated SciPy features.
    sp_fnames : list
        The SciPy feature names.

    """
    logger.info("Creating SciPy Features")

    # Generate scipy features
    logger.info("SciPy Feature: geometric mean")
    row_gmean = sps.gmean(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis")
    row_kurtosis = sps.kurtosis(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis test")
    row_ktest, pvalue = sps.kurtosistest(base_features, axis=1)
    logger.info("SciPy Feature: normal test")
    row_normal, pvalue = sps.normaltest(base_features, axis=1)
    logger.info("SciPy Feature: skew")
    row_skew = sps.skew(base_features, axis=1)
    logger.info("SciPy Feature: skew test")
    row_stest, pvalue = sps.skewtest(base_features, axis=1)
    logger.info("SciPy Feature: variation")
    row_var = sps.variation(base_features, axis=1)
    logger.info("SciPy Feature: signal-to-noise ratio")
    # FIX: scipy.stats.signaltonoise was removed in SciPy 1.0; compute the
    # same quantity (row mean divided by population std) directly.
    row_stn = np.mean(base_features, axis=1) / np.std(base_features, axis=1)
    logger.info("SciPy Feature: standard error of mean")
    row_sem = sps.sem(base_features, axis=1)

    sp_features = np.column_stack(
        (row_gmean, row_kurtosis, row_ktest, row_normal, row_skew,
         row_stest, row_var, row_stn, row_sem))
    sp_features = impute_values(sp_features, 'float64', sentinel)
    sp_features = StandardScaler().fit_transform(sp_features)

    # Return new SciPy features
    logger.info("SciPy Feature Count : %d", sp_features.shape[1])
    sp_fnames = ['sp_geometric_mean', 'sp_kurtosis', 'sp_kurtosis_test',
                 'sp_normal_test', 'sp_skew', 'sp_skew_test',
                 'sp_variation', 'sp_signal_to_noise',
                 'sp_standard_error_of_mean']
    return sp_features, sp_fnames
def test_normalitytests():
    # numbers verified with R: dagoTest in package fBasics
    sample = np.array((-2, -1, 0, 1, 2, 3) * 4)**2
    for test_fn, want in ((stats.normaltest, (3.92371918, 0.14059673)),
                          (stats.skewtest, (1.98078826, 0.04761502)),
                          (stats.kurtosistest, (-0.01403734, 0.98880019))):
        yield assert_array_almost_equal, test_fn(sample), want
def doStatTests(X, labels, ml):
    """Mark clusters as noise (-1) when tiny or significantly skewed.

    A 1-component PCA fitted on cluster ``ml`` provides the projection
    axis; every cluster's standardized projection is then skew-tested.

    NOTE(review): the output array aliases ``labels`` and therefore
    mutates the caller's array in place -- confirm this is intended.
    """
    pca = PCA(n_components=1)
    pca.fit(X[labels == ml])
    XpcaML = pca.transform(X[labels == ml])
    labelsOut = labels
    normXpcaML = (XpcaML - n.mean(XpcaML)) / n.std(XpcaML)  # unused; kept for parity
    for lbl in n.unique(labels):
        members = X[labels == lbl]
        if len(members) == 0:
            continue
        projected = pca.transform(members)
        projected = (projected - n.mean(projected)) / n.std(projected)
        if len(projected) < 9:
            labelsOut[labels == lbl] = -1  # too small to test
            continue
        if lbl == ml:
            continue
        if skewtest(projected)[1] > 0.01:  # or kurtosistest(projected)[1] > 1.
            labelsOut[labels == lbl] = -1
    return labelsOut
def normality_stats(arr):
    """Log skewness, kurtosis and omnibus-normality statistics for ``arr``.

    Works element-wise per column when ``arr`` is a DataFrame, scalar-wise
    for a Series/iterable.  Output goes to ``logging.info`` when running
    under IPython (per ABuEnv), otherwise to stdout.

    :param arr: pd.DataFrame or pd.Series or Iterable
    """
    log_func = logging.info if ABuEnv.g_is_ipython else print
    log_func('array skew = {}'.format(scs.skew(arr)))
    log_func('array skew p-value = {}'.format(scs.skewtest(arr)[1]))
    log_func('array kurt = {}'.format(scs.kurtosis(arr)))
    log_func('array kurt p-value = {}'.format(scs.kurtosistest(arr)[1]))
    log_func('array norm = {}'.format(scs.normaltest(arr)))
    log_func('array norm p-value = {}'.format(scs.normaltest(arr)[1]))
def BasicSummary1(series):
    """Return a 5-element Series of shape statistics, rounded to 6 decimals.

    Order: skew, skew-test p-value, kurtosis, kurtosis-test p-value,
    coefficient of variation.

    FIX: removed the unused `series_len` local.
    """
    basiclist = [
        stats.skew(series),
        stats.skewtest(series)[1],
        stats.kurtosis(series),
        stats.kurtosistest(series)[1],
        stats.variation(series)
    ]
    return np.round(pd.Series(basiclist), decimals=6)
def noise(fname, x0=100, y0=100, maxrad=30):
    """Plot background-noise histograms of an image annulus and print stats.

    FIX: converted Python 2 print statements to print() calls for
    consistency with the Python 3 code elsewhere in this file; removed
    commented-out debug prints.
    """
    from astroML.plotting import hist
    hdulist = pf.open(fname)
    im = hdulist[0].data
    xi, yi = np.indices(im.shape)
    # Radial distance of every pixel from the target centre (x0, y0).
    R = np.sqrt((yi - int(y0))**2. + (xi - int(x0))**2.)
    phot_a = np.zeros(maxrad + 1)
    phot_a[0] = 0
    # Annulus between maxrad and maxrad+20 is treated as pure background.
    bmasked = im * ((R > maxrad) * (R < maxrad + 20.))
    bdata = bmasked.flatten()
    plt.subplot(3, 1, 1)
    hist(bdata[bdata != 0.], bins='blocks')
    plt.xlabel('Flux')
    plt.ylabel('(Bayesian Blocks)')
    plt.title('Noise')
    plt.subplot(3, 1, 2)
    hist(bdata[bdata != 0.], bins=50)
    plt.xlabel('Flux')
    plt.ylabel('(50 bins)')
    plt.subplot(3, 1, 3)
    hist(bdata[bdata != 0.], bins='knuth')
    plt.xlabel('Flux')
    plt.ylabel('(Knuth\'s Rule)')
    plt.show()
    A2, crit, sig = anderson(bdata[bdata != 0.], dist='norm')
    print('A-D Statistic:', A2)
    print(' CVs \t Sig.')
    print(np.vstack((crit, sig)).T)
    normality = normaltest(bdata[bdata != 0.])
    print('Normality:', normality)
    skewness = skewtest(bdata[bdata != 0.])
    print('Skewness:', skewness)
    kurtosis = kurtosistest(bdata[bdata != 0.])
    print('Kurtosis:', kurtosis)
    print('Mean:', np.mean(bdata[bdata != 0.]))
    print('Median:', np.median(bdata[bdata != 0.]))
def trainingset_preprocessing(Data, MinMaxInfo, print_info=False):
    '''
    This function prepares the training set for pre-processing.

    Input:
    1) Data: pandas DataFrame with all covariates ready for preprocessing.
    2) MinMaxInfo: dictionary with {'covariate name': {'min': [] or value, 'max': [] or value}}
    3) print_info: boolean - whether to show basic info about training set (True) or not (False).
    '''
    # Work on a copy so the caller's frame is never mutated.
    Data_local = Data.copy()

    ## Datasets from input data
    Columns_features = [
        'age', 'sex', 'WBC/uL', 'Mono/uL', 'Linfo/uL', 'T CD4 %', 'T CD4/uL',
        'T CD8 %', 'T CD8/uL', 'CD4/CD8', 'NK %', 'NK/uL', 'B CD19 %',
        '% T CD4 HLADR POS', '% T CD8 HLADR POS', 'T NK-like %',
        'LRTE % dei CD4', 'Mono DR %', 'MONO DR IFI']
    # Excluded features: 'T CD3 %', 'T CD3/uL', 'T CD3/HLADR %',
    # 'T CD3 HLA DR/uL', 'B CD19/uL', 'LRTE/uL', 'T CD8 HLADR %', 'T CD4 HLADR %'
    Columns_target = ['death', 'OS_days']
    Columns_dates = ['hospitalization_date', 'death_date', 'birth_date']

    Data_X = Data_local.loc[:, Columns_features].astype(float)
    Data_Y = Data_local.loc[:, Columns_target].astype(float)
    Data_dates = Data_local.loc[:, Columns_dates].astype(float)
    Data_ID = Data_local.loc[:, ['ID']]
    Data_Age = Data_local.loc[:, ['age']]

    ## Apply x->log(1+x) where kurtosis is above threshold
    kurtosis_threshold = 6
    skew_threshold = -1.5
    X_kurtosis = kurtosistest(Data_X.values, axis=0,
                              nan_policy='omit').statistic
    X_skew = Data_X.skew(axis=0)
    Features_LogProcessing = {}
    for idx, feature in enumerate(Columns_features):
        Features_LogProcessing[feature] = {'Reflection': False, 'Log': False}
        # Heavily peaked features (except the binary 'sex') get compressed.
        if X_kurtosis[idx] > kurtosis_threshold and feature != 'sex':
            Features_LogProcessing[feature]['Log'] = True
            # Strong left skew: reflect about the feature's max before log.
            if X_skew[feature] < skew_threshold:
                Features_LogProcessing[feature]['Reflection'] = True
                max_val = MinMaxInfo[feature]['max']
                Data_X.loc[:, feature] = max_val - Data_X.loc[:, feature].values
            Data_X.loc[:, feature] = np.log(1 + Data_X.loc[:, feature].values)

    ## Return preprocessed datasets
    return Data_X, Data_Y, Data_ID, Data_Age, Features_LogProcessing
# ---- # ---- # ---- # ---- # ---- # ---- # ---- # ---- #
def normality_tests(arr):
    '''
    Tests for normality distribution of given data set.

    Parameters
    array: ndarray
        object to generate on
    '''
    checks = [
        ("Skew of data set %14.3f", scs.skew(arr)),
        ("Skew test p-value %14.3f", scs.skewtest(arr)[1]),
        ("Kurt of data set %14.3f", scs.kurtosis(arr)),
        ("Kurt test p-value %14.3f", scs.kurtosistest(arr)[1]),
        ("Norm test p-value %14.3f", scs.normaltest(arr)[1]),
    ]
    for fmt, value in checks:
        print(fmt % value)
def histo(self, s, x, y, bins=20):
    # Histogram of one column of self.data, annotated with the skewness
    # and kurtosis test z-scores at plot coordinates (x, y).
    column = self.data[s]
    skewnes, sk = stats.skewtest(column)
    kurtosis, ku = stats.kurtosistest(column)
    sns.set(style="darkgrid")
    pc = sns.distplot(column, kde=True, bins=bins)
    plt.text(x=x, y=y,
             s='skewnes=%.2f\nkurtosis=%.2f' % (skewnes, kurtosis))
    name = 'the Histograme of {:s}'.format(s.capitalize())
    plt.suptitle(name)
    return pc
def test_vs_nonmasked(self): x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2 assert_array_almost_equal(mstats.normaltest(x), stats.normaltest(x)) assert_array_almost_equal(mstats.skewtest(x), stats.skewtest(x)) assert_array_almost_equal(mstats.kurtosistest(x), stats.kurtosistest(x)) funcs = [stats.normaltest, stats.skewtest, stats.kurtosistest] mfuncs = [mstats.normaltest, mstats.skewtest, mstats.kurtosistest] x = [1, 2, 3, 4] for func, mfunc in zip(funcs, mfuncs): assert_raises(ValueError, func, x) assert_raises(ValueError, mfunc, x)
def normality_test(arr):
    '''
    Robust normality test based on skewness, kurtosis, and normality

    :param arr: obj to generate statistics on
    '''
    # NOTE: the "sata" typo below is preserved from the original output.
    lines = (
        ("Skew of data set %14.3f", scs.skew(arr)),
        ("Skew test p-value %14.3f", scs.skewtest(arr)[1]),
        ("Kurt of sata set %14.3f", scs.kurtosis(arr)),
        ("Kurt test p-value %14.3f", scs.kurtosistest(arr)[1]),
        ("Norm test p-value %14.3f", scs.normaltest(arr)[1]),
    )
    for fmt, value in lines:
        print(fmt % value)
def print_stock_statistics(data):
    '''Print return-sample statistics, normality tests and sizing formulas.

    NOTE(review): apart from the realized vol/var lines this function reads
    the module-level globals ``returns``, ``rf``, ``n`` and ``dollar_vol``
    rather than ``data`` -- confirm whether ``data['returns']`` was intended.
    NOTE(review): the ``//`` (floor-division) operators in the Sharpe and
    Kelly formulas look suspicious -- confirm plain division was not meant.
    '''
    sep = "---------------------------------------------"
    print("RETURN SAMPLE STATISTICS")
    print(sep)
    print("Mean of Daily Log Returns %9.6f" % np.mean(returns))
    print("Std of Daily Log Returns %9.6f" % np.std(returns))
    print("Mean of Annua. Log Returns %9.6f" % (np.mean(returns) * 252))
    print("Std of Annua. Log Returns %9.6f" % (np.std(returns) * math.sqrt(252)))
    print(sep)
    print("Skew of Sample Log Returns %9.6f" % scs.skew(returns))
    print("Skew Normal Test p-value %9.6f" % scs.skewtest(returns)[1])
    print(sep)
    print("Kurt of Sample Log Returns %9.6f" % scs.kurtosis(returns))
    print("Kurt Normal Test p-value %9.6f" % scs.kurtosistest(returns)[1])
    print(sep)
    print("Normal Test p-value %9.6f" % scs.normaltest(returns)[1])
    print(sep)
    print("Realized Volatility %9.6f" % data['rea_vol'].iloc[-1])
    print("Realized Variance %9.6f" % data['rea_var'].iloc[-1])
    print(sep)
    print("Anderson Normality Test: ")
    print(stats.anderson(returns))
    print(sep)
    print("Shapiro_Wilk Test: ")
    print(stats.shapiro(returns))
    print("Sharpe Ratio of Daily Returns: ")
    print("{0:.8f}".format(np.mean(returns) / np.std(returns)))
    print("Trading Sharpe for Daily: ")
    print("{0:.8f}".format(
        (n * 6.5) * (np.mean(returns) - rf // np.std(returns) * np.sqrt(n * 6.5))))
    print("Sharpe of Annua. Returns w/ days: ")
    print("{0:.8f}".format(
        (252) * (np.mean(returns) - rf // np.std(returns) * np.sqrt(252))))
    print("Sharpe of Annua. Returns w/ days & hours:")
    print("{0:.8f}".format(
        (252 * 6.5) * (np.mean(returns) - rf // np.std(returns) * np.sqrt(252 * 6.5))))
    print(sep)
    print("Amihud Illiquidity %9.6g" %
          np.mean(np.divide(abs(returns), dollar_vol[1:])))
    print(sep)
    print("Kelly Formula: ")
    print("{0:.8f}".format(np.mean(returns) - rf // (np.std(returns))**2))
    print("Compounded Levered Return: ")
    print("{0:.8f}".format(rf + (
        ((252) * (np.mean(returns) - rf / np.std(returns) * np.sqrt(252)))**2) // 2))
    print("Compounded Unlevered Return: ")
    print("{0:.8f}".format(((np.mean(returns)) * 252) -
                           (((np.std(returns)) * np.sqrt(252))**2) // 2))
    return
def normality_test(data):
    """
    Tests for normality distribution of given data set
    (skew, skew-p, kurtosis, kurtosis-p, normality-p)

    data: ndarray
        object to generate statistics on
    """
    rows = (
        ("Skew of data set %14.3f", scs.skew(data)),
        ("Skew test p-value %14.3f", scs.skewtest(data)[1]),
        ("Kurtosis of data set %14.3f", scs.kurtosis(data)),
        ("Kurtosis test p-value %14.3f", scs.kurtosistest(data)[1]),
        ("Normality test p-value %14.3f", scs.normaltest(data)[1]),
    )
    print()
    for fmt, value in rows:
        print(fmt % value)
    print()
def normality_test(array):
    '''
    Normality check for the given data set, combining three statistical
    tests (docstring translated from Chinese):
      - skew test: skewness should be close enough to 0
      - kurtosis test: kurtosis should be close enough to 0
      - omnibus normality test
    FIX: Python 2 print statements converted to py3 print() calls.
    '''
    print('Skew of data set %15.3f' % scs.skew(array))
    print('Skew test p-value %14.3f' % scs.skewtest(array)[1])
    print('Kurt of data set %15.3f' % scs.kurtosis(array))
    print('Kurt test p-value %14.3f' % scs.kurtosistest(array)[1])
    print('Norm test p-value %14.3f' % scs.normaltest(array)[1])
def normality_test(arr):
    '''Tests for normality distribution of givven data set.

    Parameters
    ==========
    array: ndarray
        object to generates statistics on

    FIX: Python 2 print statements converted to py3 print() calls,
    consistent with the rest of the file.
    '''
    print('Skew of data set %14.3f' % scs.skew(arr))
    print('Skew test p value %14.3f' % scs.skewtest(arr)[1])
    print('Kurt of data set %14.3f' % scs.kurtosis(arr))
    print('Kurt test p value %14.3f' % scs.kurtosistest(arr)[1])
    print('Normal test p value %14.3f' % scs.normaltest(arr)[1])
def kurtosisstats(timecourse):
    """Return (kurtosis, z-statistic, p-value) for a timecourse.

    Parameters
    ----------
    timecourse: array
        The timecourse to test
    """
    zstat, pval = kurtosistest(timecourse)
    return kurtosis(timecourse), zstat, pval
def test_vs_nonmasked(self): x = np.array((-2,-1,0,1,2,3)*4)**2 assert_array_almost_equal(mstats.normaltest(x), stats.normaltest(x)) assert_array_almost_equal(mstats.skewtest(x), stats.skewtest(x)) assert_array_almost_equal(mstats.kurtosistest(x), stats.kurtosistest(x)) funcs = [stats.normaltest, stats.skewtest, stats.kurtosistest] mfuncs = [mstats.normaltest, mstats.skewtest, mstats.kurtosistest] x = [1, 2, 3, 4] for func, mfunc in zip(funcs, mfuncs): assert_raises(ValueError, func, x) assert_raises(ValueError, mfunc, x)
def normalityTests(self):
    # Normality diagnostics on the instrument's daily log returns.
    # shift(1) puts a NaN at the head, which dropna() removes.
    log_returns = np.log(self.data.close / self.data.close.shift(1)).dropna()
    print('Skew of data set %14.3f' % scs.skew(log_returns))
    print('Skew test p-value %14.3f' % scs.skewtest(log_returns)[1])
    print('Kurt of data set %14.3f' % scs.kurtosis(log_returns))
    print('Kurt test p-value %14.3f' % scs.kurtosistest(log_returns)[1])
    print('Norm test p-value %14.3f' % scs.normaltest(log_returns)[1])
def normality_test(arr):
    '''Tests for normality distribution of givven data set.

    Parameters
    ==========
    array: ndarray
        object to generates statistics on

    FIX: Python 2 print statements converted to py3 print() calls.
    '''
    print('Skew of data set %14.3f' % scs.skew(arr))
    print('Skew test p value %14.3f' % scs.skewtest(arr)[1])
    print('Kurt of data set %14.3f' % scs.kurtosis(arr))
    print('Kurt test p value %14.3f' % scs.kurtosistest(arr)[1])
    print('Normal test p value %14.3f' % scs.normaltest(arr)[1])
def get_normality(data: pd.DataFrame) -> pd.DataFrame:
    """
    Look at the distribution of returns and generate statistics on the
    relation to the normal curve.

    Runs five tests: kurtosis test, skew test, Jarque-Bera, Shapiro-Wilk
    and a one-sample Kolmogorov-Smirnov test against the normal
    distribution.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe of targeted data

    Returns
    -------
    pd.DataFrame
        Dataframe containing statistics of normality
    """
    results = [
        stats.kurtosistest(data),   # peak height/sharpness vs normal
        stats.skewtest(data),       # asymmetry about the mean
        stats.jarque_bera(data),    # combined skew + kurtosis fit
        stats.shapiro(data),        # null: drawn from a normal
        stats.kstest(data, "norm"), # ECDF vs normal CDF
    ]
    statistics = [r[0] for r in results]
    pvalues = [r[1] for r in results]
    return pd.DataFrame(
        [statistics, pvalues],
        columns=["Kurtosis", "Skewness", "Jarque-Bera", "Shapiro-Wilk",
                 "Kolmogorov-Smirnov"],
        index=["Statistic", "p-value"],
    )
def calculate(self):
    """Recompute every descriptive statistic from ``self.dataset()``.

    Raises ValueError unless ``self.recalc`` has been set to True first;
    clears the flag again once the data has been read.
    """
    if not self.recalc:
        raise ValueError("Please set recalc to True")
    data = np.array(self.dataset() or [])
    self.obs_number = len(data)
    self.recalc = False
    if self.obs_number == 0:
        # nothing to describe -- leave all statistics untouched
        return
    # central tendency and quartiles
    self.avg = np.average(data)
    self.q25, self.median, self.q75 = np.percentile(data, (25, 50, 75))
    mode_result = stats.mode(data)
    if mode_result[1][0] > 1:
        # only record a mode when some value actually repeats
        self.mode = mode_result[0][0]
    self.min, self.max, self.sum = np.min(data), np.max(data), np.sum(data)
    # project-specific measures (extra_stats is a sibling module)
    self.Q = extra_stats.Q(data)
    self.TRI = extra_stats.TRI(data)
    self.MID = extra_stats.MID(data)
    # dispersion
    self.var, self.std = np.var(data), np.std(data)
    self.range = extra_stats.range(data)
    self.MD = extra_stats.MD(data)
    self.MeD = extra_stats.MeD(data)
    self.variation = stats.variation(data)
    self.varQ = extra_stats.varQ(data)
    # shape coefficients need at least two observations
    if self.obs_number > 1:
        self.Sp_pearson = extra_stats.Sp_pearson(data)
        self.H1_yule = extra_stats.H1_yule(data)
        self.H3_kelly = extra_stats.H3_kelly(data)
    else:
        self.Sp_pearson = self.H1_yule = self.H3_kelly = None
    self.kurtosis = stats.kurtosis(data)
    # scipy's kurtosistest is only considered valid for n >= 20 samples
    if self.obs_number >= 20:
        self.kurtosis_test_z_score, self.kurtosis_test_p_value = (
            stats.kurtosistest(data))
    else:
        self.kurtosis_test_z_score, self.kurtosis_test_p_value = None, None
def normality_tests(arr):
    '''Print normality diagnostics for *arr*.

    Combines three statistical tests:
    - skewtest: is the sample skewness "normal" (i.e. close enough to 0)?
    - kurtosistest: likewise for the sample kurtosis.
    - normaltest: combines the two tests above into one normality check.
    '''
    report = (
        ("Skew of data set %14.3f", scs.skew(arr)),
        ("Skew test p-value %14.3f", scs.skewtest(arr)[1]),
        ("Kurt of data set %14.3f", scs.kurtosis(arr)),
        ("Kurt test p-value %14.3f", scs.kurtosistest(arr)[1]),
        ("Norm test p-value %14.3f", scs.normaltest(arr)[1]),
    )
    for template, value in report:
        print(template % value)
def print_statistics(data):
    """Print daily/annualized log-return statistics, normality-test
    p-values and the latest realized volatility/variance.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'returns', 'rea_vol' and 'rea_var' columns.
    """
    # Fixed: Python-2 print statements replaced by print() calls.
    # Repeated data['returns'] lookups hoisted into one local.
    rets = data['returns']
    sep = "---------------------------------------------"
    print("RETURN SAMPLE STATISTICS")
    print(sep)
    print("Mean of Daily Log Returns %9.6f" % np.mean(rets))
    print("Std of Daily Log Returns %9.6f" % np.std(rets))
    # annualization assumes 252 trading days per year
    print("Mean of Annua. Log Returns %9.6f" % (np.mean(rets) * 252))
    print("Std of Annua. Log Returns %9.6f" %
          (np.std(rets) * math.sqrt(252)))
    print(sep)
    print("Skew of Sample Log Returns %9.6f" % scs.skew(rets))
    print("Skew Normal Test p-value %9.6f" % scs.skewtest(rets)[1])
    print(sep)
    print("Kurt of Sample Log Returns %9.6f" % scs.kurtosis(rets))
    print("Kurt Normal Test p-value %9.6f" % scs.kurtosistest(rets)[1])
    print(sep)
    print("Normal Test p-value %9.6f" % scs.normaltest(rets)[1])
    print(sep)
    print("Realized Volatility %9.6f" % data['rea_vol'].iloc[-1])
    print("Realized Variance %9.6f" % data['rea_var'].iloc[-1])
def normality_check(feature_group,group_name): if feature_group.isEmpty(): return False normal_flag = True sk_test = stats.skewtest(feature_group.get_scores()) kr_test = stats.kurtosistest(feature_group.get_scores()) normaltest = stats.normaltest(feature_group.get_scores()) temp = ''' Normality Test P-Values[{}] ------------------------------------ Kurtosis | {} Skewness | {} NormalTest | {} ''' result = temp.format(group_name,kr_test[1],sk_test[1],normaltest[1]) tests = (sk_test[1] > 0.05 ,kr_test[1] > 0.05 ,normaltest[1] > 0.05) return result,tests
# NOTE(review): this fragment relies on x, y, x_avg, y_avg, x_std, y_std
# and opts being defined earlier in the script.
# np.corrcoef returns the *correlation* matrix (diagonal 1.0), despite
# the variable name "cov_matrix".
cov_matrix = np.corrcoef(x.flat, y.flat)
r2_xx = cov_matrix[0, 0]
r2_xy = cov_matrix[0, 1]
r2_yx = cov_matrix[1, 0]
r2_yy = cov_matrix[1, 1]
# Third/fourth-moment statistics over the flattened arrays (axis 0);
# bias=False applies the statistical bias correction.
skew_x = stats.skew(x.flat, 0, bias=False)
skew_y = stats.skew(y.flat, 0, bias=False)
# skewtest/kurtosistest return a (z-statistic, p-value) pair each.
skew_xz, skew_xp = stats.skewtest(x.flat, 0)
skew_yz, skew_yp = stats.skewtest(y.flat, 0)
kurtosis_x = stats.kurtosis(x.flat, 0, bias=False)
kurtosis_y = stats.kurtosis(y.flat, 0, bias=False)
kurtosis_xz, kurtosis_xp = stats.kurtosistest(x.flat, 0)
kurtosis_yz, kurtosis_yp = stats.kurtosistest(y.flat, 0)
# Collect everything for reporting; OrderedDict keeps insertion order.
results = collections.OrderedDict()
results["x_path"] = os.path.basename(opts.inputA)
results["y_path"] = os.path.basename(opts.inputB)
results["x"] = x
results["y"] = y
results["x_avg"] = x_avg
results["y_avg"] = y_avg
results["x_std"] = x_std
results["y_std"] = y_std
#datap[j,i] = data[j,i] - biasp[j,i] return biasf, dataf # ,biasp,datap if __name__ == "__main__": files = ['big0.csv', 'big1.csv', 'big2.csv', 'big3.csv'] [observations, temp] = parse.separate(files) for obs in observations: [biasf, dataf] = removeBias(obs) plt.close('all') for i in range(obs.shape[0]): biasrate = np.array(np.diff(biasf[i, :])) print "\nTest for biasrate is ", stats.kurtosistest(biasrate) print "Test for white noise is ", stats.kurtosistest(dataf[i, :]) print "Test for bias is ", stats.kurtosistest(biasf[i, :]) print "Test for observation is ", stats.kurtosistest(obs[i, :]), "\n" plt.figure(i + 1) plt.clf() plt.subplot2grid((2, 2), (0, 0)) plt.hist(obs[i, :], color='r') plt.subplot2grid((2, 2), (0, 1)) plt.hist(dataf[i, :], color='g') plt.subplot2grid((2, 2), (1, 0))
def kurtosistp(self, x):
    """Return the p-value of the kurtosis normality test for sample *x*."""
    _, pvalue = kurtosistest(x)
    return pvalue
# Functions in the scipy.io package can load and save MATLAB/Octave
# matrices and arrays: loadmat() loads a .mat file, savemat() saves a
# dict of arrays under the given variable names.
a = np.arange(7)
io.savemat("a.mat", {"array": a})
print(u"分析随机数")  # "analyze random numbers"
from scipy import stats
import matplotlib.pyplot as plt
# Fixed: Python-2 print statements replaced by print() calls (same
# space-separated output); everything else is unchanged.
# Draw 900 samples from a standard normal distribution.
generated = stats.norm.rvs(size=900)
# Fit a normal distribution to the sample: returns (mean, std).
print("Mean", "Std", stats.norm.fit(generated))
# Skewness describes the asymmetry of the probability distribution.
print("Skewtest", "pvalue", stats.skewtest(generated))
# Kurtosis describes the steepness of the distribution curve.
print("Kurtosistest", "pvalue", stats.kurtosistest(generated))
# The normality test checks how well the data follows a normal distribution.
print("Normaltest", "pvalue", stats.normaltest(generated))
# Value below which 95% of the sample falls.
print("95 percentile", stats.scoreatpercentile(generated, 95))
# Inverse lookup: percentile rank of the value 1.
print("Percentile at 1", stats.percentileofscore(generated, 1))
plt.hist(generated)
# plt.show()
print(u"比较对数收益率")  # "compare log returns"
# from matplotlib.finance import quotes_historical_yahoo
# from datetime import date
# from statsmodels.stats.stattools import jarque_bera
# def get_close(symbol):
#     today = date.today()
def main():
    """Galaxy-style command-line dispatcher: reads a tabular file, pulls
    the requested sample columns from every row, runs the scipy.stats
    routine named by --test_id and appends its results to the row,
    writing the augmented rows to --outfile.

    NOTE(review): this module is Python-2 only as written -- ``map(...)``
    results are passed directly to scipy (lists in py2, iterators in
    py3) and several comparisons use ``is 0`` identity checks instead of
    ``== 0``.  Some referenced routines (nanmean, signaltonoise,
    threshold, histogram, itemfreq, ...) were removed in modern scipy.
    """
    # --- command-line interface -------------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta",
        action="store_true",
        default=False,
        help="Whether or not to return the internally computed a values.",
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    # NOTE(review): default="False" (a truthy string) looks unintended for
    # a store_true flag -- left as-is to preserve behavior.
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    # --- unpack arguments and select which sample sets are present ---
    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    # sample0/1/2 are presence flags for the three column-selection args
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols != None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols != None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols != None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    # --- per-row dispatch: run the selected test and append results ---
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            # column indices are 1-based on the command line
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        # one-sample descriptive statistics
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        # NOTE(review): duplicate branches -- "nanmedian" and "variation"
        # already appear above, so these two are unreachable.
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        # trimmed statistics: (mf, nf) act as (lower, upper) limits when set
        elif test_id.strip() == "tmean":
            if nf is 0 and mf is 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf is 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf is 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf is 0 and mf is 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf is 0 and mf is 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf is 0 and mf is 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf is 0 and mf is 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf is 0 and mf is 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf is 0 and mf is 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf is 0 and mf is 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf is 0 and mf is 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf is 0 and mf is 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf is 0 and mf is 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            # imbda == 0 means "let boxcox find the optimal lambda"
            if imbda is 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for list in h2:
                cols.append(list)
        # two-sample tests
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        # tests accepting one or two samples, depending on what was given
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        # k-sample tests over the ";"-separated --sample_cols groups
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        # emit the original row plus the appended results, tab-separated
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
df['abc'].hist(bins = 100, figsize = (8, 6)) #%% # QQ-plot # quantile-quantile plot # to verify if this distribution is normal or not import statsmodels.api as sm import matplotlib.pyplot as plt sm.qqplot(df['abc'].dropna(), line = 's') plt.grid(True) plt.xlabel('theoretical quantiles') plt.ylabel('sample quantiles') #%% # skew and kurtosis import scipy.stats as scs data = df['floats'].dropna() # remove missing value print('skew is %f' %scs.skew(data)) print('skew test p-value is %f' %scs.skewtest(data)[1]) print('kurt is %f' %scs.kurtosis(data)) print('kurt test p-value is %f' % scs.kurtosistest(data)[1]) print('normal test p-value is %f' %scs.normaltest(data)[1]) #%%
def normality_test(arr):
    """Print skewness/kurtosis statistics and normality-test p-values
    for the sample *arr* (uses scipy.stats imported as ``scs``)."""
    # Fixed: Python-2 print statements replaced with print() calls;
    # output text is unchanged.
    print("skew %14.3f" % scs.skew(arr))
    print("skew test p-value %14.3f" % scs.skewtest(arr)[1])
    print("Kurt of data set %14.3f" % scs.kurtosis(arr))
    print("Kurt test p-value %14.3f" % scs.kurtosistest(arr)[1])
    print("Norm test p-value %14.3f" % scs.normaltest(arr)[1])
def BasicSummary1(series):
    """Return a Series of basic shape statistics for *series*, rounded
    to six decimals:
    [skew, skewtest p-value, kurtosis, kurtosistest p-value, variation].
    """
    # Fixed: removed the unused local ``series_len``.
    basiclist = [stats.skew(series),
                 stats.skewtest(series)[1],
                 stats.kurtosis(series),
                 stats.kurtosistest(series)[1],
                 stats.variation(series)]
    return np.round(pd.Series(basiclist), decimals=6)
# NOTE(review): depends on mot, MOTIONBINW, core, ceilsigfig and pylab
# plotting names (plot/text/gca) defined elsewhere; the final text() call
# is truncated in this chunk.
mvi2mot = {}
for recname in sorted(mot):
    rec = eval(recname)  # NOTE(review): eval of a recording name -- trusted input only
    mviname = os.path.basename(rec.e0.s.fname)
    framei0, framei1 = rec.e0.d.framei[0], rec.e0.d.framei[-1]
    print('%s: %s, frameis %d:%d' % (recname, mviname, framei0, framei1))
    mvi2mot[(mviname, framei0)] = mot[recname]
allmotion = np.hstack(list(mvi2mot.values()))
allmotion = np.hstack([allmotion, -allmotion]) # make it symmetric around 0
motionbins = np.arange(-300, 300+MOTIONBINW, MOTIONBINW) # deg/s, symmetric around 0
midbins = motionbins[:-1] + MOTIONBINW / 2  # bin centers
motioncount = np.histogram(allmotion, bins=motionbins)[0]
k = kurtosis(allmotion)
# kurtosistest() seems to use the method of Anscombe & Glynn (1983),
# http://biomet.oxfordjournals.org/content/70/1/227
z, p = kurtosistest(allmotion)
pstring = 'p < %g' % ceilsigfig(p)
# normally distributed signal with same std as data, to check that its kurtosis is 0:
#nsamples = 10000000
#normal = scipy.random.normal(0, allmotion.std(), nsamples)
#normalcount = np.histogram(normal, bins=motionbins)[0]
normalcount = core.g(0, allmotion.std(), midbins) # generate normal distrib directly
# normalize to get same probability mass:
normalcount = normalcount / normalcount.sum() * motioncount.sum()
# reference normal curve in grey, observed motion distribution in black
plot(midbins, normalcount, marker=None, ls='-', c='0.7', lw=2)
plot(midbins, motioncount, marker=None, ls='-', c='k', lw=2)
text(0.98, 0.98, 'k = %.1f' % k, # kurtosis
     horizontalalignment='right', verticalalignment='top',
     transform=gca().transAxes, color='k')
text(0.98, 0.90, '%s' % pstring, # p-value of null (normal) hypothesis of kurtosis test
     horizontalalignment='right', verticalalignment='top',
def cluster(data_array, target_array, n_components, perform_nn, max_k, max_features):
    """Run the clustering / dimension-reduction experiment suite.

    Scales the data, evaluates K-Means and EM over k, then applies four
    reduction techniques (PCA, ICA, RP, SVD) before re-clustering, and
    optionally retrains a neural net on each reduced representation.

    NOTE(review): relies on sklearn classes and project helpers
    (clusterK, clusterEM, plot_chart, plot_eigenvalues,
    reduce_and_cluster, find_best_rp, reduce_and_learn_nn,
    cluster_and_learn_nn, build_*) imported elsewhere in this module.
    """
    # normalize numeric values for better performance per docs
    data_array = StandardScaler().fit_transform(data_array, target_array)
    target2 = None
    test_target = None
    # shuffle with a fixed seed so the 70/30 split below is reproducible
    if target_array is not None:
        data2, target2 = shuffle(data_array, target_array, random_state=1)
    else:
        data2 = shuffle(data_array, random_state=1)
    # split training and testing data 70/30
    offset = int(0.7*len(data2))
    train_data = data2[:offset]
    test_data = data2[offset:]
    if target_array is not None:
        train_target = target2[:offset]
        test_target = target2[offset:]
    # baseline clustering quality (silhouette) before any reduction
    best_k_rs, scores = clusterK(train_data, test_data, max_k)
    plot_chart(scores, 'Silhouette Score', 'K Means', 'k')
    best_em_rs, scores = clusterEM(train_data, test_data, max_k)
    plot_chart(scores, 'Silhouette Score', 'EM', 'Components')
    plot_eigenvalues(PCA(), train_data, 'PCA')
    # time PCA fit in milliseconds
    t0 = int(round(time.time() * 1000))
    PCA().fit_transform(train_data)
    t1 = int(round(time.time() * 1000))
    pca_time = t1-t0
    print('PCA time: %d ms' % pca_time)
    reduce_and_cluster(build_pca, train_data, test_data, max_features, n_components, max_k, 'PCA')
    # calculate and plot kurtosis to determine any components that can be dropped
    t0 = int(round(time.time() * 1000))
    ica = FastICA(max_iter=18000, random_state=1)
    s = ica.fit_transform(train_data)
    t1 = int(round(time.time() * 1000))
    ica_time = t1-t0
    print('ICA time: %d ms' % ica_time)
    # kurtosistest over each ICA component: high |z| = far from Gaussian
    z_score, p_score = stats.kurtosistest(s)
    pl.plot(z_score, 'ro')
    pl.title('Kurtosis Z-scores')
    pl.ylabel('Z-score')
    pl.xlabel('Component Index')
    pl.show()
    reduce_and_cluster(build_ica, train_data, test_data, max_features, n_components, max_k, 'ICA')
    # random projection: search for the best component count / seed first
    rp_c, rp_rs = find_best_rp(train_data, test_data, 2, max_features)
    reduce_and_cluster(build_rp, train_data, test_data, max_features, rp_c, max_k, 'RP', rp_rs)
    reduce_and_cluster(build_svd, train_data, test_data, max_features-1, n_components, max_k, 'SVD')
    if perform_nn:
        # retrain the NN on each reduced representation; collect errors
        errs = []
        durs = []
        err, dur = reduce_and_learn_nn(PCA(n_components=n_components), train_data, train_target, test_data, test_target)
        errs.append(err)
        durs.append(dur)
        # ICA had 4 components as the best
        err, dur = reduce_and_learn_nn(FastICA(max_iter=18000, random_state=1, n_components=4), train_data, train_target, test_data, test_target)
        errs.append(err)
        durs.append(dur)
        # num components = 7 for RP
        err, dur = reduce_and_learn_nn(GaussianRandomProjection(n_components=rp_c, random_state=rp_rs), train_data, train_target, test_data, test_target)
        errs.append(err)
        durs.append(dur)
        err, dur = reduce_and_learn_nn(TruncatedSVD(n_components=n_components, algorithm='arpack', random_state=1), train_data, train_target, test_data, test_target)
        errs.append(err)
        durs.append(dur)
        # bar chart comparing MSE across the four reduction techniques
        ind = np.arange(len(errs))
        bar_width = 0.3
        fig, ax = plt.subplots()
        ax.bar(ind, errs, bar_width, color='b')
        ax.set_ylabel('Mean Squared Error')
        ax.set_title('Error After Dimension Reduction')
        ax.set_xticks(ind+.15)
        ax.set_xticklabels(('PCA', 'ICA', 'RP', 'SVD'))
        pl.show()
    cluster_and_learn_nn(train_data, train_target, test_data, test_target)