def summary_diff_dtype(x):
    """Return a short text summary of a pandas Series, chosen by its dtype.

    Reads these names from the enclosing scope: ``max_lev``,
    ``in_cell_next_line``, ``in_cell_next``, ``skew``, ``skewtest``, ``np``
    and ``print_list``.

    * object / bool / category with at most ``max_lev`` distinct values:
      every value with its relative frequency in percent, separated by
      ``'<br>'`` when ``in_cell_next_line`` is truthy, else by ``', '``.
    * float64 / int64: quantiles, mean, std, std/mean and skewness.  A
      ``*`` follows the skewness when ``skewtest`` gives p <= 0.05.  When
      more than eight non-zero values exist and all are positive, the
      skewness of the log-values is appended the same way.
    * datetime: the five quantiles, each cut to its first 16 characters.
    * anything else: an empty string.
    """
    dtype_name = x.dtype.name

    if dtype_name in ['object', 'bool', 'category'] and len(x.unique()) <= max_lev:
        vc = x.value_counts(dropna=False, normalize=True)
        separator = '<br>' if in_cell_next_line else ', '
        # Join rather than append-then-slice: cutting two characters off the
        # end only suits ', ' and would leave '<b' behind for '<br>'.
        return separator.join(
            f'{name} {v*100:>2.0f}%' for name, v in zip(vc.index, vc.values)
        )

    if dtype_name in ['float64', 'int64']:
        present = x[x.notnull()]
        quantiles = x.quantile(q=[0, 0.25, 0.5, 0.75, 1]).values.tolist()
        o = (
            f'quantiles: {quantiles}{in_cell_next} '
            f'mean: {x.mean():.2f} '
            f'std: {x.std():.2f} '
            f'cv: {x.std()/x.mean():.2f}{in_cell_next}'
            f'skew: {skew(present):.2f}'
        )
        if len(present) > 8:  # requirement of skewtest
            if skewtest(present).pvalue <= 0.05:
                o += '*'

        nonzero = x[x != 0]
        # Length first, so min() is never asked about an empty selection.
        if len(nonzero) > 8 and min(nonzero) > 0:  # take log
            o += f'{in_cell_next}log skew: {skew(np.log(x[x > 0])):.2f}'
            p = skewtest(np.log(nonzero)).pvalue
            # A NaN p-value compares False here, so it never earns a star.
            if p <= 0.05:
                o += '*'
        return o

    if 'datetime' in dtype_name:
        qs = x.quantile(q=[0, 0.25, 0.5, 0.75, 1]).values
        return print_list([np.datetime_as_string(q)[0:16] for q in qs],
                          br=in_cell_next)

    return ''
def stats_on_list_of_sizes(in_list1, in_list2):
    """Summarise the skewness of two lists of sequence lengths.

    Runs ``stats.skewtest`` on each list and a two-sided Mann-Whitney U
    test between them.

    Returns a tab-separated string with three fields: the skew-test report
    for ``in_list1``, the skew-test report for ``in_list2`` and the
    Mann-Whitney p-value.
    """
    in_list1_skew = ('normal skewtest in_list1 = %6.3f pvalue = %6.4f' %
                     stats.skewtest(in_list1))
    in_list2_skew = ('normal skewtest in_list2 = %6.3f pvalue = %6.4f' %
                     stats.skewtest(in_list2))
    # Only the p-value is reported; the U statistic itself is not used.
    _, man_p_value = mannwhitneyu(in_list1, in_list2, alternative="two-sided")
    return "\t".join([
        "in_list1_skew: %s" % in_list1_skew,
        "in_list2_skew: %s" % in_list2_skew,
        "Mann_whitney U test P value: %s" % man_p_value,
    ])
def test_omni_normtest():
    """Compare the normality tests with reference values from R's fBasics.

    ``omni_normtest``, ``stats.skewtest`` and ``stats.kurtosistest`` are run
    on the module-level sample ``x`` and then on ``x**2``.  Column ``k`` of
    a reference table is what the k-th of those tests must return.
    """
    # tests against R fBasics
    from scipy import stats

    def check(sample, expected, decimals):
        # decimals: one assert_almost_equal precision per test, same order
        tests = (omni_normtest, stats.skewtest, stats.kurtosistest)
        for column, (test, decimal) in enumerate(zip(tests, decimals)):
            assert_almost_equal(test(sample), expected[:, column], decimal)

    reference_x = np.array(
        [[3.994138321207883, -1.129304302161460, 1.648881473704978],
         [0.1357325110375005, 0.2587694866795507, 0.0991719192710234]])
    check(x, reference_x, (14, 14, 11))

    reference_x2 = np.array(
        [[34.523210399523926, 4.429509162503833, 3.860396220444025],
         [3.186985686465249e-08, 9.444780064482572e-06, 1.132033129378485e-04]])
    # TODO: fix precision in these test with relative tolerance
    check(x**2, reference_x2, (12, 12, 12))
def test_omni_normtest():
    """Check three normality tests against numbers produced by R's fBasics.

    For the module-level sample ``x`` and for ``x**2``, the results of
    ``omni_normtest``, ``stats.skewtest`` and ``stats.kurtosistest`` are
    compared with columns 0, 1 and 2 of the matching reference table.
    """
    # tests against R fBasics
    from scipy import stats

    table_plain = np.array(
        [[3.994138321207883, -1.129304302161460, 1.648881473704978],
         [0.1357325110375005, 0.2587694866795507, 0.0991719192710234]])
    table_squared = np.array(
        [[34.523210399523926, 4.429509162503833, 3.860396220444025],
         [3.186985686465249e-08, 9.444780064482572e-06, 1.132033129378485e-04]])

    tests = (omni_normtest, stats.skewtest, stats.kurtosistest)
    # TODO: fix precision in these test with relative tolerance
    cases = (
        (x, table_plain, (14, 14, 11)),
        (x**2, table_squared, (12, 12, 12)),
    )
    for sample, table, decimals in cases:
        for column, test in enumerate(tests):
            assert_almost_equal(test(sample), table[:, column], decimals[column])
def pd_reducer_ratios(pt, nan_policy='raise', **kwargs):
    '''
    Reduce a price/research table to summary statistics.

    Takes a pandas dataframe with s_date, price & research columns.

    Top tip: use the _rs_to_ptbl(recordset) to convert your query (with the
    fields s_date, s_type and s_val) into a valid pandas pivot table :)

    Parameters:
        pt: object exposing ``price`` and ``research`` columns.
        nan_policy: forwarded to the scipy tests; must be a member of the
            module-level ``nan_policies``.
        **kwargs: accepted and ignored.

    Returns a dict with, for price (``prx_``) and research (``rsch_``): the
    mean, the kurtosis-test and skew-test statistic (``_st``) and p-value
    (``_pv``) and the correlation with the other column; plus
    ``kurtosis_ratios``, the price kurtosis statistic divided by the
    research one.

    Raises ImportError when pandas or scipy is missing and AttributeError
    for an unknown ``nan_policy``.
    '''
    fn = "pandas_reducer_ratios"
    try:
        from scipy import stats
        import pandas as pd  # noqa: F401 -- imported only to fail early
    except ImportError:
        raise ImportError('{} needs pandas and scipy'.format(fn))

    # Enforce our default in case it gets out of hand.
    if nan_policy not in nan_policies:
        raise AttributeError(
            'nan_policy {} not accepted - try omit, raise or propagate'.format(
                nan_policy))

    price, research = pt.price, pt.research
    # Each test is run once; index 0 is the statistic, index 1 the p-value.
    prx_kurtosis = stats.kurtosistest(price, nan_policy=nan_policy)
    prx_skew = stats.skewtest(price, nan_policy=nan_policy)
    rsch_kurtosis = stats.kurtosistest(research, nan_policy=nan_policy)
    rsch_skew = stats.skewtest(research, nan_policy=nan_policy)

    output = {}
    output['prx_mean'] = price.mean()
    output['prx_kurtosis_st'] = prx_kurtosis[0]
    output['prx_kurtosis_pv'] = prx_kurtosis[1]
    output['prx_skewtest_st'] = prx_skew[0]
    output['prx_skewtest_pv'] = prx_skew[1]
    output['prx_corr'] = price.corr(research)
    output['rsch_mean'] = research.mean()
    output['rsch_kurtosis_st'] = rsch_kurtosis[0]
    output['rsch_kurtosis_pv'] = rsch_kurtosis[1]
    output['rsch_skewtest_st'] = rsch_skew[0]
    output['rsch_skewtest_pv'] = rsch_skew[1]
    output['rsch_corr'] = research.corr(price)
    output['kurtosis_ratios'] = (
        float(output['prx_kurtosis_st']) / output['rsch_kurtosis_st'])
    return output
def apply_log(sf):
    """Log-transform, in place, every column whose skew test is significant.

    A column is transformed when the z-statistic of ``stats.skewtest`` lies
    outside +/-1.96.  When to apply log:
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3591587/

    Returns the same frame that was passed in.
    """
    for column in list(sf.columns):
        z_score = stats.skewtest(sf[column].tolist()).statistic
        # abs() of a NaN statistic compares False, leaving the column alone.
        if abs(z_score) > 1.96:
            sf[column] = sf[column].apply(numpy.log)
    return sf
def compute_alpha(self, column):
    """Find the best alpha attainable with the given parameters.

    ``column`` is shifted by ``-column.min()`` plus successive multiples of
    ``self.incr`` and log-transformed.  Stepping goes on while the
    ``skewtest`` p-value keeps rising and no more than ``self.max_iter``
    steps were taken.  The shift of the last accepted step is returned when
    its p-value beats that of the untransformed column, otherwise ``None``.
    """
    base_shift = -1 * column.min()

    def shifted_pvalue(step):
        return skewtest(np.log(column + base_shift + step * self.incr)).pvalue

    previous = 0.0
    current = shifted_pvalue(1)
    step = 1
    while step <= self.max_iter and current > previous:
        step += 1
        previous, current = current, shifted_pvalue(step)

    if previous > skewtest(column).pvalue:
        return base_shift + (step - 1) * self.incr
    return None
def get_stats(a):
    """Computes mean, D_T or D_R, and standard error for a list.

    All reductions run along the last axis and ignore NaNs.  The summary is
    printed, one ``name: value`` line per entry, and returned as a dict
    with the keys ``mean``, ``var``, ``std`` (holding the standard error),
    ``skew``, ``skew_test``, ``kurt`` and ``kurt_test``.
    """
    a = np.asarray(a)
    n = a.shape[-1]
    keepdims = a.ndim > 1
    M = np.nanmean(a, -1, keepdims=keepdims)
    variance = np.nanvar(a, -1, keepdims=keepdims, ddof=1)
    # NOTE(review): variance already uses ddof=1, so dividing by
    # sqrt(n - 1) instead of sqrt(n) looks like a leftover of a ddof=0
    # formula -- confirm before relying on 'std'.
    SE = np.sqrt(variance) / sqrt(n - 1)
    SK = skew(a, -1, nan_policy='omit')
    KU = kurtosis(a, -1, nan_policy='omit')
    SK_t = skewtest(a, -1, nan_policy='omit')
    KU_t = kurtosistest(a, -1, nan_policy='omit')
    if keepdims:
        # give skew/kurtosis the trailing axis the keepdims results have
        SK = SK[..., None]
        KU = KU[..., None]
    else:
        SK = float(SK)
        KU = float(KU)
    stat = {
        'mean': M,
        'var': variance,
        'std': SE,
        'skew': SK,
        'skew_test': float(SK_t.statistic),
        'kurt': KU,
        'kurt_test': float(KU_t.statistic)
    }
    # print() call: the Python 2 print statement does not parse on Python 3
    print('\n'.join(['{:>10}: {: .4f}'.format(k, v) for k, v in stat.items()]))
    return stat
def test_maskedarray_input(self): # Add some masked values, test result doesn't change x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2 xm = np.ma.array(np.r_[np.inf, x, 10], mask=np.r_[True, [False] * x.size, True]) assert_allclose(mstats.normaltest(xm), stats.normaltest(x)) assert_allclose(mstats.skewtest(xm), stats.skewtest(x)) assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
def normality_check(feature_group, output_path):
    """Run three normality tests on a feature group's scores.

    The p-values of ``stats.kurtosistest``, ``stats.skewtest`` and
    ``stats.normaltest`` are printed as a small table.

    Parameters:
        feature_group: object providing ``isEmpty()`` and ``get_scores()``.
        output_path: not used by this function.

    Returns ``False`` for an empty group, otherwise the 3-tuple
    ``(skewness, kurtosis, normaltest)`` of booleans, each true when that
    test's p-value exceeds 0.05.
    """
    if feature_group.isEmpty():
        return False

    scores = feature_group.get_scores()
    sk_test = stats.skewtest(scores)
    kr_test = stats.kurtosistest(scores)
    normaltest = stats.normaltest(scores)

    temp = '''
    Normality Test P-Values
    ------------------------------------
    Kurtosis | {0}
    Skewness | {1}
    NormalTest | {2}
    '''
    result = temp.format(kr_test[1], sk_test[1], normaltest[1])
    # print() call: the Python 2 print statement does not parse on Python 3
    print(result)

    tests = (sk_test[1] > 0.05, kr_test[1] > 0.05, normaltest[1] > 0.05)
    return tests
def stats_on_list_of_sizes(vals):
    """Format the outcome of ``stats.skewtest`` on ``vals`` as one string.

    The string carries the test statistic followed by its p-value.
    """
    statistic_and_pvalue = stats.skewtest(vals)
    template = 'normal skewtest vals = %6.3f pvalue = %6.4f'
    return template % statistic_and_pvalue
def create_bins(df, attribute):
    """Define the bins used to categorise a numerical variable.

    Parameters:
        df: the dataframe that contains the variable to be processed
        attribute: the name of the variable

    ---------------------------------------------------------------------
    The procedure has two steps.  First the skewness of the attribute is
    tested on the samples that are not plausible outliers, i.e. those lying
    within 1.5 * IQR of the quartiles (the span a box plot's whiskers can
    cover).  Skewness indicates how symmetric a distribution is: a positive
    value means the longer tail is on the right, a negative value that it
    is on the left.  The skew test checks whether the skew differs
    significantly from 0:
        H0: the skew of the distribution the data are drawn from equals
            that of the normal distribution (0).
    ---------------------------------------------------------------------
    The result of the test selects how the bins are created.  When the skew
    is significantly different from zero (p < 0.05) the Doane estimator,
    which takes the skew of the data into account, is used; otherwise the
    Auto estimator.  Both are described in
    https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram.html

    Returns a list of ``(left_edge, right_edge)`` tuples computed on all
    values of the attribute, outliers included.
    ---------------------------------------------------------------------"""
    values = df[attribute]

    # Outlier fences computed directly, so no matplotlib figure is drawn
    # on (and then closed) behind the caller's back.
    q1, q3 = np.percentile(values, [25, 75])
    reach = 1.5 * (q3 - q1)
    # Both fences apply; checking only the lower one let high outliers
    # decide the outcome of the skew test.
    inliers = values[(values >= q1 - reach) & (values <= q3 + reach)]

    # Perform the statistical test
    skew_pvalue = skewtest(inliers)[1]

    # Doane when the skew is significantly different from zero, else auto
    method = 'doane' if skew_pvalue < 0.05 else 'auto'
    bins = np.histogram(values, bins=method)[1]
    return list(zip(bins[:-1], bins[1:]))
def normality_check(feature_group, output_path):
    """Run three normality tests on a feature group's scores.

    The p-values of ``stats.kurtosistest``, ``stats.skewtest`` and
    ``stats.normaltest`` are printed as a small table.

    Parameters:
        feature_group: object providing ``isEmpty()`` and ``get_scores()``.
        output_path: not used by this function.

    Returns ``False`` for an empty group, otherwise the 3-tuple
    ``(skewness, kurtosis, normaltest)`` of booleans, each true when that
    test's p-value exceeds 0.05.
    """
    if feature_group.isEmpty():
        return False

    scores = feature_group.get_scores()
    sk_test = stats.skewtest(scores)
    kr_test = stats.kurtosistest(scores)
    normaltest = stats.normaltest(scores)

    temp = '''
    Normality Test P-Values
    ------------------------------------
    Kurtosis | {0}
    Skewness | {1}
    NormalTest | {2}
    '''
    result = temp.format(kr_test[1], sk_test[1], normaltest[1])
    # print() call: the Python 2 print statement does not parse on Python 3
    print(result)

    tests = (sk_test[1] > 0.05, kr_test[1] > 0.05, normaltest[1] > 0.05)
    return tests
def skewness(_: pathlib.Path, data: pd.DataFrame):
    """Per-column sample skewness together with the skew-test outcome.

    The first argument is ignored.  The returned DataFrame has the columns
    ``skew``, ``statistic`` and ``pvalue``.
    """
    test_result = skewtest(data)
    columns = {"skew": skew(data)}
    columns["statistic"] = test_result[0]
    columns["pvalue"] = test_result[1]
    return pd.DataFrame(columns)
def print_market_information(benchmark):
    """Print descriptive statistics and normality tests of benchmark returns.

    Reads ``benchmark['returns']`` and prints mean and standard deviation
    (as given and scaled by 252 / sqrt(252)), skewness, kurtosis, the
    p-values of the skew, kurtosis and omnibus normality tests and the
    Anderson test result.  Returns ``None``.
    """
    returns = benchmark['returns']
    rule = "---------------------------------------------"

    print("RETURN BENCHMARK STATISTICS")
    print(rule)
    print("Mean of Daily Log Returns %9.6f" % np.mean(returns))
    print("Std of Daily Log Returns %9.6f" % np.std(returns))
    print("Mean of Annua. Log Returns %9.6f" % (np.mean(returns) * 252))
    print("Std of Annua. Log Returns %9.6f" % (np.std(returns) * math.sqrt(252)))
    print(rule)
    print("Skew of Sample Log Returns %9.6f" % scs.skew(returns))
    print("Skew Normal Test p-value %9.6f" % scs.skewtest(returns)[1])
    print(rule)
    print("Kurt of Sample Log Returns %9.6f" % scs.kurtosis(returns))
    print("Kurt Normal Test p-value %9.6f" % scs.kurtosistest(returns)[1])
    print(rule)
    print("Normal Test p-value %9.6f" % scs.normaltest(returns)[1])
    print(rule)
    print("Anderson Normality Test: ")
    print(stats.anderson(returns))
    return
def plot_feature_density(self, feat_names = None, kind="density", bins = 30):
    """
    Plot the density of feature values, one subplot per feature.

    feat_names: features of interest, by default all numerical features
    kind: {"density", "hist"}
    bins: number of histogram bins, used when kind is not "density"

    Each subplot is titled with the z-score and p-value of the skew test
    on the feature's non-missing values.  A feature whose test or plot
    raises is skipped and leaves its subplot untouched.
    """
    ## numerical features
    feat_names = self.find_numerical_features() if feat_names is None else np.asarray(feat_names)
    df = self.data.loc[:, feat_names]
    nrows, ncols = int(math.ceil(feat_names.shape[0] / 3.)), 3
    fig, axes = plt.subplots(nrows = nrows, ncols = ncols, figsize = (ncols * 6, nrows * 4))
    fig.subplots_adjust(wspace = 0.25, hspace = 0.5)
    axes = axes.ravel()
    for ax, f in zip(axes, feat_names):
        try:
            values = df[f].dropna()
            zscore, pvalue = stats.skewtest(values)
            # '==' compares the text; 'is' only compared object identity
            if kind == 'density':
                values.plot(kind = kind, ax = ax, rot = 90)
            else:
                ax.hist(values, bins = bins)
            ax.set_title("zscore=%.2g, pvalue=%.2g" % (zscore, pvalue))
            ax.set_xlabel(f)
        except Exception:
            # Best effort per feature; KeyboardInterrupt and SystemExit
            # are no longer swallowed along the way.
            continue
def sampleIsNormal(self, sample):
    """Decide whether every numeric column of ``sample`` may be treated as normal.

    Returns ``False`` for an empty sample and as soon as one numeric column
    (after optional outlier removal) is shorter than
    ``Typedef.normalUnsafe``, fails the skew test or exceeds the
    unimodality threshold; ``True`` otherwise.
    """
    if not sample:
        return False

    for col in range(len(self.columnNames)):
        if not (self.colTypes[col] == Typedef.numeric):
            continue

        values = np.array([float(entry[col]) for entry in sample])
        if self.removeOutliersCheck:
            values = self.removeOutliers(values)

        size = len(values)
        if size < Typedef.normalUnsafe:
            return False

        # If the sample size is greater than 40, by Central Limit Theorem we can
        # assume the sample to follow normal distribution.
        if size > Typedef.normalSafe:
            continue

        if size > Typedef.normalUnsafe:
            # Test for skewness.
            if st.skewtest(values)[1] < Typedef.skewThreshold:
                return False

            # Test for unimodality:
            skew = st.skew(values)
            kurtosis = st.kurtosis(values)
            if (np.square(skew) - kurtosis) > Typedef.unimodalityThreshold:
                return False

    return True
def __init__(self, gene_subset=None):
    """Load the expression matrix and the cancer-type annotation.

    Reads ``self.GEXP_FILE`` (tab separated, first column as index) and,
    when ``gene_subset`` is given, keeps only the rows whose index is in
    it.  Per-row median, standard deviation and skew-test statistic are
    stored sorted/indexed by row.  ``self.CANCER_TYPE_FILE`` supplies the
    cancer type per sample; expression columns missing from it are
    labelled "Normal".  Finally a colour is assigned to every cancer type.
    """
    self.gexp = pd.read_csv(self.GEXP_FILE, index_col=0, sep="\t")
    if gene_subset is not None:
        self.gexp = self.gexp[self.gexp.index.isin(gene_subset)]

    self.gexp_genes = self.gexp.median(1).sort_values(ascending=False)
    self.gexp_genes_std = self.gexp.std(1).sort_values(ascending=False)
    self.gexp_genes_skew = pd.Series(skewtest(self.gexp.T)[0],
                                     index=self.gexp.index)

    self.cancer_type = pd.read_csv(self.CANCER_TYPE_FILE, sep="\t",
                                   header=None, index_col=0)[1]
    unlabelled = pd.Series({
        x: "Normal" for x in self.gexp.columns if x not in self.cancer_type
    })
    # Series.append is gone in pandas 2.0; concat works on old and new.
    self.cancer_type = pd.concat([self.cancer_type, unlabelled])

    colors = (sns.color_palette("tab20c").as_hex() +
              sns.color_palette("tab20b").as_hex())
    self.cancer_type_palette = dict(
        zip(natsorted(self.cancer_type.value_counts().index), colors))
def print_statistics(data):
    """Print descriptive statistics and normality tests of a return sample.

    Reads ``data['returns']`` for mean, standard deviation (as given and
    scaled by 252 / sqrt(252)), skewness, kurtosis and the three normality
    test p-values, then prints the last entries of ``data['rea_vol']`` and
    ``data['rea_var']``.
    """
    returns = data['returns']
    rule = "---------------------------------------------"

    print("RETURN SAMPLE STATISTICS")
    print(rule)
    print("Mean of Daily Log Returns %9.6f" % np.mean(returns))
    print("Std of Daily Log Returns %9.6f" % np.std(returns))
    print("Mean of Annua. Log Returns %9.6f" % (np.mean(returns) * 252))
    annual_std = np.std(returns) * math.sqrt(252)
    print("Std of Annua. Log Returns %9.6f" % annual_std)
    print(rule)
    print("Skew of Sample Log Returns %9.6f" % scs.skew(returns))
    print("Skew Normal Test p-value %9.6f" % scs.skewtest(returns)[1])
    print(rule)
    print("Kurt of Sample Log Returns %9.6f" % scs.kurtosis(returns))
    kurt_pvalue = scs.kurtosistest(returns)[1]
    print("Kurt Normal Test p-value %9.6f" % kurt_pvalue)
    normal_pvalue = scs.normaltest(returns)[1]
    print("Normal Test p-value %9.6f" % normal_pvalue)
    print(rule)
    print("Realized Volatility %9.6f" % data['rea_vol'].iloc[-1])
    print("Realized Variance %9.6f" % data['rea_var'].iloc[-1])
def test_skewtest_2D_notmasked(self): # a normal ndarray is passed to the masked function x = np.random.random((20,2))*20. r = stats.skewtest(x) rm = stats.mstats.skewtest(x) assert_equal(r[0][0],rm[0][0]) assert_equal(r[0][1],rm[0][1])
def longTail(direction):
    """
    Examine the shape of this set of samples to see if it matches a
    gaussian with a long tail to one direction, indicated by the parameter.
    +1 is a long, older tail and -1 is a long, younger tail

    With fewer than 8 samples a SimResult saying so is returned without
    running the skew test.
    """
    tail = 'older' if direction > 0 else 'younger'
    name = "long tail of " + tail + ' samples'

    if len(samples.sampleList) < 8:
        too_few = confidence.Confidence(confidence.Applic.df,
                                        confidence.Validity.plaus)
        return SimResult(too_few, name,
                         'not enough samples to check for tail',
                         'minimum of 8 samples needed')

    ages = [sample[__getAge()] for sample in samples.sampleList]
    res = stats.skewtest(ages)
    qual = __getQuality(res[1]/2)
    conf = __getConfidence((-1.5, -1, 0, 1, 1.5), res[0], qual)
    plot = __getPlot('id', __getAge())

    if direction < 0:
        conf = -conf

    strength = 'weak' if qual < confidence.Validity.sound else 'strong'
    side = ' younger' if res[0] < 0 else 'n older'
    # should be a plot of my samples and a gaussian
    return SimResult(conf, name,
                     strength + ' evidence of a' + side + ' tail found',
                     plot)
def get_stats(a):
    """Computes mean, D_T or D_R, and standard error for a list.

    All reductions run along the last axis and ignore NaNs.  The summary is
    printed, one ``name: value`` line per entry, and returned as a dict
    with the keys ``mean``, ``var``, ``std`` (holding the standard error),
    ``skew``, ``skew_test``, ``kurt`` and ``kurt_test``.
    """
    a = np.asarray(a)
    n = a.shape[-1]
    keepdims = a.ndim > 1
    M = np.nanmean(a, -1, keepdims=keepdims)
    variance = np.nanvar(a, -1, keepdims=keepdims, ddof=1)
    # NOTE(review): variance already uses ddof=1, so dividing by
    # sqrt(n - 1) instead of sqrt(n) looks like a leftover of a ddof=0
    # formula -- confirm before relying on 'std'.
    SE = np.sqrt(variance)/sqrt(n - 1)
    SK = skew(a, -1, nan_policy='omit')
    KU = kurtosis(a, -1, nan_policy='omit')
    SK_t = skewtest(a, -1, nan_policy='omit')
    KU_t = kurtosistest(a, -1, nan_policy='omit')
    if keepdims:
        # give skew/kurtosis the trailing axis the keepdims results have
        SK = SK[..., None]
        KU = KU[..., None]
    else:
        SK = float(SK)
        KU = float(KU)
    stat = {'mean': M, 'var': variance, 'std': SE,
            'skew': SK, 'skew_test': float(SK_t.statistic),
            'kurt': KU, 'kurt_test': float(KU_t.statistic)}
    # print() call: the Python 2 print statement does not parse on Python 3
    print('\n'.join(['{:>10}: {: .4f}'.format(k, v) for k, v in stat.items()]))
    return stat
def test_for_side_skewness(df_side, lvl=3, hue='value'):
    """Skew-test the left-minus-right difference of each measure per level.

    For every level ``lvl_0 .. lvl_{lvl-1}`` and every measure, the data
    returned by ``get_data_group_by_player_mean`` is reduced to
    ``left - right`` and passed to ``stats.skewtest``.

    Returns a DataFrame indexed by (level, measure) with the columns
    ``statistic`` and ``p-value``.
    """
    from scipy import stats

    measures = [
        "start_cm", "rel_pt", "amp_max_cop", 'amp_max_pel', 'amp_max_c7',
        "vel_max_cop", 'vel_max_pel', 'vel_max_c7', "overshoot", "dcm",
        "dtml", "rcm"
    ]
    levels = ["lvl_{}".format(i) for i in range(0, lvl)]
    result_columns = ['statistic', 'p-value']

    blank = np.full((len(levels) * len(measures), len(result_columns)), np.nan)
    df = pd.DataFrame(blank,
                      index=pd.MultiIndex.from_product([levels, measures]),
                      columns=result_columns)

    for level_no, level_name in enumerate(levels):
        for measure in measures:
            v_df = get_data_group_by_player_mean(df_side, level_no, measure,
                                                 hue=hue)
            difference = v_df['left'] - v_df['right']
            df.loc[(level_name, measure)] = stats.skewtest(difference)
    return df
def is_skewed_numerical_feature(self, feature_name):
    """Tell whether a numerical feature is significantly right-skewed.

    True only when the feature is numerical, the skew-test statistic on
    its non-missing values reaches ``self.params["SKEWNESS_THR"]`` and the
    p-value is at most 0.01.
    """
    if not self.is_numerical_feature(feature_name):
        return False
    z_score, p_value = stats.skewtest(self.data[feature_name].dropna())
    return bool(z_score >= self.params["SKEWNESS_THR"] and p_value <= 0.01)
def test_skewtest(self): # this test is for 1D data for n in self.get_n(): if n > 8: x, y, xm, ym = self.generate_xy_sample(n) r = stats.skewtest(x) rm = stats.mstats.skewtest(xm) assert_allclose(r[0], rm[0], rtol=1e-15)
def test_maskedarray_input(self): # Add some masked values, test result doesn't change x = np.array((-2,-1,0,1,2,3)*4)**2 xm = np.ma.array(np.r_[np.inf, x, 10], mask=np.r_[True, [False] * x.size, True]) assert_allclose(mstats.normaltest(xm), stats.normaltest(x)) assert_allclose(mstats.skewtest(xm), stats.skewtest(x)) assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
def doStatTests(X, labels, ml):
    """Relabel clusters as -1 based on a skew test of their 1-D PCA projection.

    A one-component PCA is fitted on the rows of ``X`` labelled ``ml``.
    Every cluster is projected with it and standardised; a cluster is set
    to -1 when it has fewer than 9 members or, unless it is ``ml`` itself,
    when its skew-test p-value exceeds 0.01.

    Returns a new label array; ``labels`` itself is left unmodified.
    """
    pca = PCA(n_components=1)
    pca.fit(X[labels == ml])

    # Work on a copy: plain assignment would alias the caller's array and
    # rewrite it while its values are still used to build the masks.
    labelsOut = labels.copy()

    for i in n.unique(labels):
        members = X[labels == i]
        if len(members) == 0:
            continue

        Xpca = pca.transform(members)
        Xpca = (Xpca - n.mean(Xpca)) / n.std(Xpca)

        if len(Xpca) < 9:
            labelsOut[labels == i] = -1
            continue
        if i == ml:
            continue
        if skewtest(Xpca)[1] > 0.01:
            labelsOut[labels == i] = -1
    return labelsOut
def test_normalitytests():
    """Check normaltest, skewtest and kurtosistest against values from R.

    Written with direct assertions: a generator-style (``yield``) test is
    not executed by current pytest, so its checks would silently not run.
    """
    # numbers verified with R: dagoTest in package fBasics
    st_normal, st_skew, st_kurt = (3.92371918, 1.98078826, -0.01403734)
    pv_normal, pv_skew, pv_kurt = (0.14059673, 0.04761502, 0.98880019)
    x = np.array((-2, -1, 0, 1, 2, 3) * 4)**2
    assert_array_almost_equal(stats.normaltest(x), (st_normal, pv_normal))
    assert_array_almost_equal(stats.skewtest(x), (st_skew, pv_skew))
    assert_array_almost_equal(stats.kurtosistest(x), (st_kurt, pv_kurt))
def test_skewtest(self): # this test is for 1D data for n in self.get_n(): if n > 8: x, y, xm, ym = self.generate_xy_sample(n) r = stats.skewtest(x) rm = stats.mstats.skewtest(xm) assert_equal(r[0], rm[0])
def create_scipy_features(base_features, sentinel):
    r"""Calculate the skew, kurtosis, and other statistical features for each row.

    Parameters
    ----------
    base_features : numpy array
        The feature dataframe.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    sp_features : numpy array
        The calculated SciPy features, imputed and standard-scaled.
    sp_fnames : list
        The SciPy feature names, in column order.

    """

    logger.info("Creating SciPy Features")

    # Generate scipy features; only the statistics of the tests are kept.

    logger.info("SciPy Feature: geometric mean")
    row_gmean = sps.gmean(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis")
    row_kurtosis = sps.kurtosis(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis test")
    row_ktest, _ = sps.kurtosistest(base_features, axis=1)
    logger.info("SciPy Feature: normal test")
    row_normal, _ = sps.normaltest(base_features, axis=1)
    logger.info("SciPy Feature: skew")
    row_skew = sps.skew(base_features, axis=1)
    logger.info("SciPy Feature: skew test")
    row_stest, _ = sps.skewtest(base_features, axis=1)
    logger.info("SciPy Feature: variation")
    row_var = sps.variation(base_features, axis=1)
    logger.info("SciPy Feature: signal-to-noise ratio")
    # scipy.stats.signaltonoise no longer exists; this is the formula it
    # implemented: mean / std (ddof=0), with 0 where the std is 0.
    values = np.asanyarray(base_features)
    row_mean = values.mean(axis=1)
    row_sd = values.std(axis=1, ddof=0)
    row_stn = np.where(row_sd == 0, 0, row_mean / row_sd)
    logger.info("SciPy Feature: standard error of mean")
    row_sem = sps.sem(base_features, axis=1)

    sp_features = np.column_stack((row_gmean, row_kurtosis, row_ktest,
                                   row_normal, row_skew, row_stest,
                                   row_var, row_stn, row_sem))
    sp_features = impute_values(sp_features, 'float64', sentinel)
    sp_features = StandardScaler().fit_transform(sp_features)

    # Return new SciPy features

    logger.info("SciPy Feature Count : %d", sp_features.shape[1])
    sp_fnames = ['sp_geometric_mean',
                 'sp_kurtosis',
                 'sp_kurtosis_test',
                 'sp_normal_test',
                 'sp_skew',
                 'sp_skew_test',
                 'sp_variation',
                 'sp_signal_to_noise',
                 'sp_standard_error_of_mean']
    return sp_features, sp_fnames
def skewness(data: pd.DataFrame, dropna: bool = False):
    """
    Return the skewness of each continuous variable

    Parameters
    ----------
    data: pd.DataFrame
        The DataFrame to be described
    dropna: bool
        If True, NA values are omitted when calculating skew
        (``nan_policy="omit"``).  Otherwise the NA values propagate.

    Returns
    -------
    result: pd.DataFrame
        DataFrame listing three values for each continuous variable and NA
        for others: skew, zscore, and pvalue.
        The test null hypothesis is that the skewness of the samples
        population is the same as the corresponding normal distribution.
        The pvalue is the two-sided pvalue for the hypothesis test

    Examples
    --------
    >>> import clarite
    >>> clarite.describe.skewness(df)
        Variable         type      skew    zscore        pvalue
    0      pdias  categorical       NaN       NaN           NaN
    1  longindex  categorical       NaN       NaN           NaN
    2    durflow   continuous  2.754286  8.183515  2.756827e-16
    3     height   continuous  0.583514  2.735605  6.226567e-03
    4    begflow   continuous -0.316648 -1.549449  1.212738e-01
    """
    # Get continuous variables
    dtypes = _get_dtypes(data)
    continuous_idx = dtypes[dtypes == "continuous"].index

    # Format result df, starting with NA
    result = pd.DataFrame(
        data=None,
        index=dtypes.index,
        columns=["type", "skew", "zscore", "pvalue"],
        dtype=float,
    )
    result["type"] = dtypes

    # Calculate skew and statistical test
    nan_policy = "omit" if dropna else "propagate"
    if len(continuous_idx) > 0:
        continuous = data[continuous_idx]
        # Write into the continuous rows only: the computed array has one
        # entry per continuous variable, not one per row of the result.
        result.loc[continuous_idx, "skew"] = stats.skew(
            continuous, nan_policy=nan_policy)
        zscore, pvalue = stats.skewtest(continuous, nan_policy=nan_policy)
        result.loc[continuous_idx, "zscore"] = zscore
        result.loc[continuous_idx, "pvalue"] = pvalue

    # Format
    result.index.name = "Variable"
    result = result.reset_index()
    return result
def test_normalitytests():
    """Check normaltest, skewtest and kurtosistest against values from R.

    Written with direct assertions: a generator-style (``yield``) test is
    not executed by current pytest, so its checks would silently not run.
    """
    # numbers verified with R: dagoTest in package fBasics
    st_normal, st_skew, st_kurt = (3.92371918, 1.98078826, -0.01403734)
    pv_normal, pv_skew, pv_kurt = (0.14059673, 0.04761502, 0.98880019)
    x = np.array((-2, -1, 0, 1, 2, 3) * 4)**2
    assert_array_almost_equal(stats.normaltest(x), (st_normal, pv_normal))
    assert_array_almost_equal(stats.skewtest(x), (st_skew, pv_skew))
    assert_array_almost_equal(stats.kurtosistest(x), (st_kurt, pv_kurt))
def doStatTests(X, labels, ml):
    """Relabel clusters as -1 based on a skew test of their 1-D PCA projection.

    A one-component PCA is fitted on the rows of ``X`` labelled ``ml``.
    Every cluster is projected with it and standardised; a cluster is set
    to -1 when it has fewer than 9 members or, unless it is ``ml`` itself,
    when its skew-test p-value exceeds 0.01.

    Returns a new label array; ``labels`` itself is left unmodified.
    """
    pca = PCA(n_components=1)
    pca.fit(X[labels == ml])

    # Work on a copy: plain assignment would alias the caller's array and
    # rewrite it while its values are still used to build the masks.
    labelsOut = labels.copy()

    for i in n.unique(labels):
        members = X[labels == i]
        if len(members) == 0:
            continue

        Xpca = pca.transform(members)
        Xpca = (Xpca - n.mean(Xpca)) / n.std(Xpca)

        if len(Xpca) < 9:
            labelsOut[labels == i] = -1
            continue
        if i == ml:
            continue
        if skewtest(Xpca)[1] > 0.01:
            labelsOut[labels == i] = -1
    return labelsOut
def normality_stats(arr):
    """
    Report skewness, kurtosis and a normality test, with their p-values.

    Output goes through ``logging.info`` when ``ABuEnv.g_is_ipython`` is
    truthy and through ``print`` otherwise.

    eg:
        input:
            2014-07-25    223.57
            2014-07-28    224.82
            2014-07-29    225.01
                           ...
            2016-07-22    222.27
            2016-07-25    230.01
            2016-07-26    225.93

        output:
            array skew = -0.282635248604699
            array skew p-value = 0.009884539532576725
            array kurt = 0.009313464006726946
            array kurt p-value = 0.8403947352953821
            array norm = NormaltestResult(statistic=6.6961445106692237, pvalue=0.035152053009441256)
            array norm p-value = 0.035152053009441256

        input:
                          tsla    bidu   noah    sfun    goog    vips   aapl
            2014-07-25  223.57  226.50  15.32  12.110  589.02  21.349  97.67
            2014-07-28  224.82  225.80  16.13  12.450  590.60  21.548  99.02
            2014-07-29  225.01  220.00  16.75  12.220  585.61  21.190  98.38
            ...            ...     ...    ...     ...     ...     ...    ...
            2016-07-22  222.27  160.88  25.50   4.850  742.74  13.510  98.66
            2016-07-25  230.01  160.25  25.57   4.790  739.77  13.390  97.34
            2016-07-26  225.93  163.09  24.75   4.945  740.92  13.655  97.76

        output:
            array skew = [-0.2826 -0.2544  0.1456  1.0322  0.2095  0.095   0.1719]
            array skew p-value = [ 0.0099  0.0198  0.1779  0.      0.0539  0.3781  0.1124]
            array kurt = [ 0.0093 -0.8414 -0.4205  0.4802 -1.547  -0.9203 -1.2104]
            array kurt p-value = [ 0.8404  0.      0.0201  0.0461  1.      0.      0.    ]
            array norm = NormaltestResult(statistic=array([ 6.6961, 52.85 , 7.2163, 69.0119, 3.7161, 69.3468, 347.229 ]), pvalue=array([ 0.0352, 0. , 0.0271, 0. , 0.156 , 0. , 0. ]))
            array norm p-value = [ 0.0352  0.      0.0271  0.      0.156   0.      0.    ]

    :param arr: pd.DataFrame or pd.Series or Iterable
    """
    log_func = logging.info if ABuEnv.g_is_ipython else print

    # (message template, value producer); each value is computed right
    # before its line is emitted
    report = (
        ('array skew = {}', lambda: scs.skew(arr)),
        ('array skew p-value = {}', lambda: scs.skewtest(arr)[1]),
        ('array kurt = {}', lambda: scs.kurtosis(arr)),
        ('array kurt p-value = {}', lambda: scs.kurtosistest(arr)[1]),
        ('array norm = {}', lambda: scs.normaltest(arr)),
        ('array norm p-value = {}', lambda: scs.normaltest(arr)[1]),
    )
    for template, produce in report:
        log_func(template.format(produce()))
def BasicSummary1(series):
    """Summarise the shape of ``series`` in five numbers.

    Returns a pandas Series, rounded to 6 decimals, holding in order:
    skewness, skew-test p-value, kurtosis, kurtosis-test p-value and the
    coefficient of variation (``stats.variation``).
    """
    basiclist = [
        stats.skew(series),
        stats.skewtest(series)[1],
        stats.kurtosis(series),
        stats.kurtosistest(series)[1],
        stats.variation(series),
    ]
    return np.round(pd.Series(basiclist), decimals=6)
def view_residuals():
    """Plot and test the residual files found in the directory ``sys.argv[2]``.

    * ``residuals.txt``: one distribution plot per column (``VR.png``,
      ``mu_b.png``, ``mu_l.png``), then Shapiro, skew and normality test
      results printed for each column.
    * ``theta_errs.txt``: distribution plot of ``dT`` (``theta_err.png``).
    * ``uni_profile.txt``: line plot of ``Sigma`` over ``R_0``
      (``profile.png``).
    """
    PATH = sys.argv[2]

    pt = pd.read_csv(PATH + '/residuals.txt', delimiter=" ",
                     names=['V_R', 'mu_b', 'mu_l'])
    figures = (('V_R', "VR.png"), ('mu_b', "mu_b.png"), ('mu_l', "mu_l.png"))
    for column, png in figures:
        sns.distplot(pt[column], kde=True, rug=True, bins=100)
        plt.savefig(png)
        plt.clf()

    for column, _ in figures:
        for test_name in ('shapiro', 'skewtest', 'normaltest'):
            print(getattr(stat, test_name)(pt[column]))

    pt = pd.read_csv(PATH + '/theta_errs.txt', delimiter=" ",
                     names=['R', 'dT'])
    sns.distplot(pt['dT'], kde=True, rug=True, bins=100)
    plt.savefig("theta_err.png")
    plt.clf()

    pt = pd.read_csv(PATH + '/uni_profile.txt', delimiter=" ",
                     names=['R_0', 'Sigma'])
    sns.lineplot(x="R_0", y="Sigma", data=pt, markers=True, linewidth=1.5,
                 palette="tab10")
    plt.savefig("profile.png")
def test_skewtest_2D_WithMask(self): nx = 2 for n in self.get_n(): if n > 8: x, y, xm, ym = self.generate_xy_sample2D(n, nx) r = stats.skewtest(x) rm = stats.mstats.skewtest(xm) assert_equal(r[0][0],rm[0][0]) assert_equal(r[0][1],rm[0][1])
def test_skewtest_2D_WithMask(self): nx = 2 for n in self.get_n(): if n > 8: x, y, xm, ym = self.generate_xy_sample2D(n, nx) r = stats.skewtest(x) rm = stats.mstats.skewtest(xm) assert_equal(r[0][0], rm[0][0]) assert_equal(r[0][1], rm[0][1])
def noise(fname, x0 = 100, y0 = 100, maxrad = 30):
    """Plot and test the pixel values in a ring around (x0, y0).

    The image is read from the first HDU of ``fname``.  Pixels whose
    distance R from (x0, y0) satisfies ``maxrad < R < maxrad + 20`` and
    whose value is non-zero are histogrammed three ways (Bayesian blocks,
    50 bins, Knuth's rule); then the Anderson-Darling, normality, skewness
    and kurtosis test results and the mean and median are printed.
    """
    from astroML.plotting import hist

    hdulist = pf.open(fname)
    im = hdulist[0].data

    xi, yi = np.indices(im.shape)
    R = np.sqrt( (yi - int(y0))**2. + (xi - int(x0))**2. )
    # multiplying by the boolean ring zeroes everything outside it
    bmasked = im * ((R > maxrad) * (R < maxrad + 20.))
    bdata = bmasked.flatten()
    # selected once and reused by every plot and test below
    ring = bdata[bdata != 0.]

    plt.subplot(3, 1, 1)
    hist(ring, bins = 'blocks')
    plt.xlabel('Flux')
    plt.ylabel('(Bayesian Blocks)')
    plt.title('Noise')

    plt.subplot(3, 1, 2)
    hist(ring, bins = 50)
    plt.xlabel('Flux')
    plt.ylabel('(50 bins)')

    plt.subplot(3, 1, 3)
    hist(ring, bins = 'knuth')
    plt.xlabel('Flux')
    plt.ylabel('(Knuth\'s Rule)')
    plt.show()

    # print() calls: the Python 2 print statement does not parse on Python 3
    A2, crit, sig = anderson(ring, dist = 'norm')
    print('A-D Statistic:', A2)
    print(' CVs \t Sig.')
    print(np.vstack((crit, sig)).T)

    normality = normaltest(ring)
    print('Normality:', normality)

    skewness = skewtest(ring)
    print('Skewness:', skewness)

    kurtosis = kurtosistest(ring)
    print('Kurtosis:', kurtosis)

    print('Mean:', np.mean(ring))
    print('Median:', np.median(ring))
def normality_tests(arr):
    '''
    Print skewness, kurtosis and the p-values of three normality tests.

    Parameters
        arr: ndarray
            object to generate the statistics on
    '''
    # (line template, value producer); each value is computed right before
    # its line is printed
    rows = (
        ("Skew of data set %14.3f", lambda: scs.skew(arr)),
        ("Skew test p-value %14.3f", lambda: scs.skewtest(arr)[1]),
        ("Kurt of data set %14.3f", lambda: scs.kurtosis(arr)),
        ("Kurt test p-value %14.3f", lambda: scs.kurtosistest(arr)[1]),
        ("Norm test p-value %14.3f", lambda: scs.normaltest(arr)[1]),
    )
    for template, produce in rows:
        print(template % produce())
def apply_log(sf):
    """Log-transform, in place, every column whose skew test is significant.

    Exact zeros in the "index" column, when that column exists, are first
    replaced by 0.000001 -- presumably so a later log of that column does
    not meet log(0).  A column is then transformed when the z-statistic of
    ``stats.skewtest`` lies outside +/-1.96, and its name is printed.
    When to apply log:
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3591587/

    Returns the same frame that was passed in.
    """
    # Done once, before any column is tested, rather than on every pass
    # through the loop.
    if "index" in sf.columns:
        sf["index"] = sf["index"].replace(0.000000, 0.000001)

    for column in list(sf.columns):
        z_score = stats.skewtest(sf[column].tolist()).statistic
        if abs(z_score) > 1.96:
            sf[column] = sf[column].apply(numpy.log)
            print("applied log to ", column)
    return sf
def find_features_skewed(df, skew_thr, feat_names = None):
    """
    Return the names of the features whose skew-test statistic reaches skew_thr.

    Use plot_features_density to inspect the skewness of the features and
    pick a threshold.

    df: DataFrame
    skew_thr: features whose ``stats.skewtest`` statistic is >= this value
        are returned
    feat_names: subset of features of interest; when None, the result of
        ``find_numerical_features(df)`` is used
    """
    # default to all numerical features
    if feat_names is None:
        feat_names = find_numerical_features(df)

    selected = []
    for name in feat_names:
        statistic = stats.skewtest(df[name])[0]
        if statistic >= skew_thr:
            selected.append(name)
    return np.asarray(selected)
def work(self, input_items, output_items):
    """Copy the input to the output and flag blocks that fail a skew test.

    The first input buffer is reshaped to ``self.N`` samples and
    ``stats.skewtest`` is run on its imaginary and real parts.  When both
    p-values are below 0.05 a message is printed, ``self.ctr`` is
    incremented and printed, and the samples are saved in ``self.buf``.
    The input is always copied unchanged to the first output buffer.

    Presumably the ``work`` method of a GNU Radio block (it calls
    ``self.nitems_read``) -- confirm against the enclosing class.

    Changes from the original: the unused ``stats.norm.rvs`` draw of 1024
    random numbers per call was removed (it consumed the global NumPy random
    state for nothing), and the Python 2-only ``print self.ctr`` statement
    became a ``print()`` call.

    Returns
    -------
    int : number of items written, ``len(output_items[0])``.
    """
    in0 = input_items[0]
    out = output_items[0]

    x = in0.reshape(self.N)

    # Only the p-values are used; D1/D2 are the test statistics.
    [D1, p1] = stats.skewtest(x.imag)
    [D2, p2] = stats.skewtest(x.real)

    if p1 < 0.05 and p2 < 0.05:
        print ('Not Gaussian, p is ', p1, " ",p2 , 'at sample', self.nitems_read(0))
        self.ctr = self.ctr + 1
        print(self.ctr)
        # NOTE(review): the source had lost its indentation; the buffer copy
        # is taken to belong to this branch -- confirm against the original.
        self.buf = np.copy(x)

    out[:] = in0
    return len(output_items[0])
def histo(self, s, x, y, bins=20):
    """Draw the histogram (with KDE) of the single variable ``self.data[s]``.

    The plot is annotated at ``(x, y)`` with the statistics returned by
    ``stats.skewtest`` and ``stats.kurtosistest`` and titled with the
    capitalized variable name.  Returns the object produced by
    ``sns.distplot``.
    """
    column = self.data[s]
    # Only the test statistics are displayed; the p-values are discarded.
    skew_stat, _skew_p = stats.skewtest(column)
    kurt_stat, _kurt_p = stats.kurtosistest(column)

    sns.set(style="darkgrid")
    axes = sns.distplot(column, kde=True, bins=bins)

    annotation = 'skewnes=%.2f\nkurtosis=%.2f' % (skew_stat, kurt_stat)
    plt.text(x=x, y=y, s=annotation)
    plt.suptitle('the Histograme of {:s}'.format(s.capitalize()))
    return axes
def test_vs_nonmasked(self): x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2 assert_array_almost_equal(mstats.normaltest(x), stats.normaltest(x)) assert_array_almost_equal(mstats.skewtest(x), stats.skewtest(x)) assert_array_almost_equal(mstats.kurtosistest(x), stats.kurtosistest(x)) funcs = [stats.normaltest, stats.skewtest, stats.kurtosistest] mfuncs = [mstats.normaltest, mstats.skewtest, mstats.kurtosistest] x = [1, 2, 3, 4] for func, mfunc in zip(funcs, mfuncs): assert_raises(ValueError, func, x) assert_raises(ValueError, mfunc, x)
def normality_test(arr):
    '''
    Print skewness, kurtosis and normality diagnostics for a data set.

    Five lines are printed: the sample skewness, the skew-test p-value, the
    sample kurtosis, the kurtosis-test p-value and the p-value of
    ``scs.normaltest``.

    :param arr: obj to generate statistics on
    '''
    print("Skew of data set %14.3f" % scs.skew(arr))
    print("Skew test p-value %14.3f" % scs.skewtest(arr)[1])
    # Label fixed: the original printed "Kurt of sata set".
    print("Kurt of data set %14.3f" % scs.kurtosis(arr))
    print("Kurt test p-value %14.3f" % scs.kurtosistest(arr)[1])
    print("Norm test p-value %14.3f" % scs.normaltest(arr)[1])
def normality_test(arr):
    '''Tests for normality distribution of a given data set.

    Prints the sample skewness, the skew-test p value, the sample kurtosis,
    the kurtosis-test p value and the p value of ``scs.normaltest``.

    Parameters
    ==========
    arr: ndarray
        object to generate statistics on
    '''
    # print() with one pre-formatted string emits the same text on Python 2
    # and Python 3; the original print statements were Python 2 only.
    print('Skew of data set %14.3f' % scs.skew(arr))
    print('Skew test p value %14.3f' % scs.skewtest(arr)[1])
    print('Kurt of data set %14.3f' % scs.kurtosis(arr))
    print('Kurt test p value %14.3f' % scs.kurtosistest(arr)[1])
    print('Normal test p value %14.3f' % scs.normaltest(arr)[1])
def stats_on_list_of_sizes(db_lens, assemb_lens):
    """Run summary statistics on two lists of sequence lengths.

    Returns a tab separated string with, in order: the skew test of
    assemb_lens, the skew test of db_lens, the independent t-test between
    the two lists, and the two-sided Mann-Whitney U statistic and p-value.
    """
    assemb_skewtest = stats.skewtest(assemb_lens)
    as_skew = 'normal skewtest assemb_lens = %6.3f pvalue = %6.4f' % assemb_skewtest

    db_skewtest = stats.skewtest(db_lens)
    db_skew = 'normal skewtest db_lens = %6.3f pvalue = %6.4f' % db_skewtest

    ttest_result = stats.ttest_ind(db_lens, assemb_lens)
    ttest = 't-statistic = %6.3f pvalue = %6.4f' % ttest_result

    u_value, u_pvalue = mannwhitneyu(db_lens, assemb_lens,
                                     alternative="two-sided")

    fields = (as_skew, db_skew, ttest, str(u_value), str(u_pvalue))
    return "\t".join(fields)
def find_skewed_features(self):
    """
    Return the numerical features that are considered skewed.

    A feature from ``self.find_numerical_features()`` is selected when,
    after dropping missing values, ``scipy.stats.skewtest`` gives a
    statistic >= ``self.skewness_thr`` with a p-value <= 0.01.

    If the skew test itself fails, a rough fallback is used instead: the
    feature is selected when max_value / min_value >= ``self.skewness_thr``
    (not accurate).

    Returns an ndarray of feature names.
    """
    skewed_feats = []
    for f in self.find_numerical_features():
        xs = self.data[f].dropna()
        try:
            skewness, pvalue = stats.skewtest(xs)
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
            # and SystemExit.  The best-effort fallback itself is unchanged.
            if xs.max() * 1. / xs.min() >= self.skewness_thr:
                skewed_feats.append(f)
        else:
            if skewness >= self.skewness_thr and pvalue <= 0.01:
                skewed_feats.append(f)
    return np.asarray(skewed_feats)
def findBestCluster(self, clusters):
    """Pick the first cluster that passes the acceptance checks.

    Clusters with fewer than 8 points are skipped (``stats.skewtest`` needs
    at least 8 observations).  A cluster is accepted when

    * the norm of its per-column skew-test statistics is below 4.5,
    * its mean in column 1 is above ``self.h / 2``, and
    * its mean lies closer than ``self.globalMinDist`` to the mean of
      ``self.prevCluster`` (a NaN distance is treated as 0).

    On success ``self.globalMinDist`` is reduced by 50 (never below 25),
    ``self.notDetected`` is reset, ``self.prevCluster`` is updated and the
    cluster is returned.  Otherwise ``self.globalMinDist`` grows by 10,
    ``self.notDetected`` is incremented and ``[]`` is returned.

    Fix: the original compared against the undefined name ``minscore``
    (NameError as soon as a cluster with 8+ points was examined); it now
    uses ``minScore``.
    """
    best_index = 0  # was ``id``, which shadowed the builtin
    minScore = 4.5
    found = False
    minDistance = self.globalMinDist
    mindiffScore = np.inf
    prevAvg = np.average(self.prevCluster, axis=0)

    for i, clust in enumerate(clusters):
        if len(clust) < 8:
            continue
        score = np.linalg.norm(stats.skewtest(clust)[0])
        var = np.var(clust, axis=0)
        avg = np.average(clust, axis=0)
        dist = np.linalg.norm(prevAvg - avg)
        if np.isnan(dist):
            dist = 0
        diffScore = math.fabs(var[0] - var[1])
        if (score < minScore and diffScore < mindiffScore
                and avg[1] > self.h/2 and dist < minDistance):
            # NOTE(review): the source had lost its indentation; the loop is
            # taken to stop at the first accepted cluster -- confirm.
            found = True
            best_index = i
            break

    if not found:
        self.globalMinDist += 10
        self.notDetected += 1
        return []

    self.globalMinDist -= 50
    if self.globalMinDist < 25:
        self.globalMinDist = 25
    self.notDetected = 0
    self.prevCluster = clusters[best_index]
    return clusters[best_index]
def print_statistics(data):
    """Print summary statistics of the log returns in ``data``.

    Reads ``data['returns']`` (mean, standard deviation, their values scaled
    by 252 and sqrt(252), skewness, kurtosis and the p-values of the skew,
    kurtosis and normality tests) and the last entries of
    ``data['rea_vol']`` and ``data['rea_var']``.

    The original used Python 2 print statements; every line is now a
    ``print()`` call with one pre-formatted string, which emits the same
    text on Python 2 and Python 3.
    """
    returns = data['returns']
    rule = "---------------------------------------------"

    print("RETURN SAMPLE STATISTICS")
    print(rule)
    print("Mean of Daily Log Returns %9.6f" % np.mean(returns))
    print("Std of Daily Log Returns %9.6f" % np.std(returns))
    print("Mean of Annua. Log Returns %9.6f" % (np.mean(returns) * 252))
    print("Std of Annua. Log Returns %9.6f" %
          (np.std(returns) * math.sqrt(252)))
    print(rule)
    print("Skew of Sample Log Returns %9.6f" % scs.skew(returns))
    print("Skew Normal Test p-value %9.6f" % scs.skewtest(returns)[1])
    print(rule)
    print("Kurt of Sample Log Returns %9.6f" % scs.kurtosis(returns))
    print("Kurt Normal Test p-value %9.6f" %
          scs.kurtosistest(returns)[1])
    print(rule)
    print("Normal Test p-value %9.6f" %
          scs.normaltest(returns)[1])
    print(rule)
    print("Realized Volatility %9.6f" % data['rea_vol'].iloc[-1])
    print("Realized Variance %9.6f" % data['rea_var'].iloc[-1])
def normality_check(feature_group,group_name):
    """Run skewness, kurtosis and normality tests on a feature group.

    Parameters
    ----------
    feature_group : object providing ``isEmpty()`` and ``get_scores()``
    group_name : label inserted into the report header

    Returns
    -------
    ``False`` when ``feature_group.isEmpty()`` is true.  Otherwise a tuple
    ``(result, tests)``: ``result`` is the formatted report of the three
    p-values and ``tests`` holds, in the order skewness, kurtosis,
    normaltest, whether each p-value exceeds 0.05.  Callers must handle
    both return shapes.
    """
    if feature_group.isEmpty():
        return False

    # Fetch the scores once; the original called get_scores() three times
    # and carried an unused ``normal_flag`` local.
    scores = feature_group.get_scores()
    sk_test = stats.skewtest(scores)
    kr_test = stats.kurtosistest(scores)
    norm_test = stats.normaltest(scores)

    # NOTE(review): the line breaks and indentation of this template were
    # lost in the reviewed copy and are reconstructed here -- confirm the
    # layout against the original file.
    temp = '''
    Normality Test P-Values[{}]
    ------------------------------------
    Kurtosis | {}
    Skewness | {}
    NormalTest | {}
    '''
    result = temp.format(group_name,kr_test[1],sk_test[1],norm_test[1])
    tests = (sk_test[1] > 0.05 ,kr_test[1] > 0.05 ,norm_test[1] > 0.05)
    return result,tests
def plot_features_density(df, feat_names = None, plot_type="density", bins = 30):
    """
    Plot the distribution of feature values, one subplot per feature.

    Each subplot is titled with the z-score and p-value of
    ``stats.skewtest`` for that feature.

    df: DataFrame
    feat_names: features of interest; when None, the result of
        ``find_numerical_features(df)`` is used
    plot_type: {"density", "hist"}; any value other than "density" draws a
        histogram
    bins: number of histogram bins (histogram mode only)

    Fixes relative to the original: the row count uses floor division (true
    division gave a float on Python 3), ``plot_type`` is compared with
    ``==`` instead of ``is``, ``len()`` replaces ``.shape[0]`` so that plain
    lists of names work, and the bare ``except:`` is narrowed to
    ``Exception``.
    """
    ## numerical features
    feat_names = find_numerical_features(df) if feat_names is None else feat_names
    nrows, ncols = len(feat_names) // 3 + 1, 3
    fig, axes = plt.subplots(nrows = nrows, ncols = ncols,
                             figsize = (ncols * 6, nrows * 4))
    fig.subplots_adjust(wspace = 0.25, hspace = 0.5)
    axes = axes.ravel()
    for ax, f in zip(axes, feat_names):
        try:
            zscore, pvalue = stats.skewtest(df[f])
            if plot_type == 'density':
                df[f].plot(kind = plot_type, ax = ax, rot = 90)
            else:
                _ = ax.hist(df[f], bins = bins)
            ax.set_title("zscore=%.2g, pvalue=%.2g" % (zscore, pvalue))
            ax.set_xlabel(f)
        except Exception:
            # Best effort: a feature that cannot be tested or plotted is
            # skipped, but interpreter-exit signals are no longer swallowed.
            continue
def test_skewtest_2D_notmasked(self): # a normal ndarray is passed to the masked function x = np.random.random((20, 2)) * 20. r = stats.skewtest(x) rm = stats.mstats.skewtest(x) assert_allclose(np.asarray(r), np.asarray(rm))
def main():
    """Analyse the validation results of one instance and write a LaTeX table.

    For every entry of the module-level ``VARIANTS`` the per-seed minimum
    treewidths of ``instance`` are read from the ``validationresults`` table
    of the database ``db_fn``.  Normality diagnostics (Shapiro-Wilk, skew
    test) are printed to stderr per variant, followed by variance and mean
    comparisons across variants (Bartlett, Levene, one-way ANOVA,
    Kruskal-Wallis, LSD, Tukey HSD) and pairwise Welch / Mann-Whitney tests.
    Finally a table with best / best-time / mean / stddev per variant is
    written to ``validation-validationset-<instance>-results.tex``.

    Relies on module-level names not shown here: ``db_fn``, ``instance``,
    ``VARIANTS``, ``sqlite``, ``NP``, ``STATS``, ``LSD``, ``statsmodels``.
    """
    def n_digits(num):
        # ceil(log10(num)), with a floor of 1 for num <= 1; used to size the
        # integer part of the table-format columns below.
        # NOTE(review): for exact powers of ten this can be one less than
        # the number of digits -- confirm that is acceptable.
        if num <= 1:
            return 1
        return math.ceil(math.log(num) / math.log(10))

    db = sqlite.connect(db_fn)
    dbc = db.cursor()
    rows = []
    # Widest integer part seen so far for each numeric table column.
    integer_digits = {'best': 0, 'best_time': 0, 'mean': 0, 'stddev': 0}
    allvals = []
    allvals_dict = {}
    for variant in VARIANTS:
        # Minimum treewidth per seed for this variant/instance.
        # NOTE(review): the query is built by string interpolation; fine for
        # trusted module constants, unsafe if these values are ever external.
        query = ("select tw from (select min(treewidth) as tw from validationresults where variant='%(variant)s' and instance='%(instance)s' group by seed)")
        result = dbc.execute(query % {'variant': variant, 'instance': instance})
        vals = NP.array([row[0] for row in result])
        # ``min`` shadows the builtin for the rest of this function.
        min, mean, stddev = vals.min(), vals.mean(), vals.std()
        # print('%s: vals=%r' % (variant, vals), file=sys.stderr)
        W, p = STATS.shapiro(vals)
        print('%s: normal distribution? shapiro-wilk: W=%s (p=%s) %s@5%% %s@2%%'
              % (variant, W, p, 'no' if W <= .905 else 'yes', 'no' if W <= .884 else 'yes'),
              file=sys.stderr)
        z, p = STATS.skewtest(vals)
        # NOTE(review): the cut-off here is .5, not .05 -- confirm intent.
        print('%s: normal distribution? skew test: (z=%s) p=%s => %s'
              % (variant, z, p, 'no' if p < .5 else 'yes'),
              file=sys.stderr)
        allvals.append(vals)
        allvals_dict[variant] = vals
        # Fastest run that reached the best treewidth.
        query = ("select min(runtime_s)"
                 " from validationresults"
                 " where variant='%(variant)s' and instance='%(instance)s' and treewidth='%(treewidth)s'")
        result = dbc.execute(query % {'variant': variant, 'instance': instance, 'treewidth': min})
        best_time = [row[0] for row in result][0]
        # print("%s: best=%s @ %ss, avg=%s +- %s" % (variant, min, best_time, mean, stddev), file=sys.stderr)
        row = {'variant': variant, 'best': min, 'best_time': round(best_time, 1), 'mean': round(mean, 1), 'stddev': round(stddev, 1)}
        rows.append(row)
        integer_digits['best'] = max(integer_digits['best'], n_digits(row['best']))
        integer_digits['best_time'] = max(integer_digits['best_time'], n_digits(row['best_time']))
        integer_digits['mean'] = max(integer_digits['mean'], n_digits(row['mean']))
        integer_digits['stddev'] = max(integer_digits['stddev'], n_digits(row['stddev']))
    db.close()

    # Homogeneity of variances across all variants.
    T, p = STATS.bartlett(*allvals)
    print('equal variances? bartlett: T=%s (p=%s) [vs Chi-Quadrat_{k-1=%s, alpha=.5}]'
          % (T, p, len(allvals) - 1),
          file=sys.stderr)
    W, p = STATS.levene(*allvals, center='mean')
    print('equal variances? levene (mean): (W=%s) p=%s' % (W, p), file=sys.stderr)
    W, p = STATS.levene(*allvals, center='median')
    print('equal variances? levene (median): (W=%s) p=%s' % (W, p), file=sys.stderr)

    # Equality of means across all variants.
    F, p = STATS.f_oneway(*allvals)
    print('equal means? one-way ANOVA: F=%s, p=%s [vs F_{k-1=%s,n-k=%s}]'
          % (F, p, len(allvals) - 1, sum([len(x) for x in allvals]) - len(allvals)),
          file=sys.stderr)
    try:
        W, p = STATS.kruskal(*allvals)
        print('equal means? kruskal wallis: W=%s, p=%s' % (W, p), file=sys.stderr)
    except Exception as e:
        # Unlike the other diagnostics, the failure is printed to stdout.
        print(e)
    lsd = LSD.LSD(allvals, .05)
    print('LSD: %r' % lsd, file=sys.stderr)
    # The group labels assume exactly 20 samples per variant.
    print(statsmodels.stats.multicomp.pairwise_tukeyhsd(NP.array(allvals).ravel(), NP.array([[x] * 20 for x in VARIANTS]).ravel(), alpha=.10), file=sys.stderr)
    print(statsmodels.stats.multicomp.pairwise_tukeyhsd(NP.array(allvals).ravel(), NP.array([[x] * 20 for x in VARIANTS]).ravel(), alpha=.05), file=sys.stderr)

    def welch(var1, var2):
        # Welch's unequal-variance t-test between two variants; 'NE' marks a
        # p-value below the hard-coded corrected thresholds.
        # NOTE(review): .01116 / .00568 look like the 10% / 5% levels
        # corrected for the nine comparisons below -- confirm derivation.
        res = STATS.ttest_ind(allvals_dict[var1], allvals_dict[var2], equal_var=False)
        print('%4s vs %s t,p=%r => \t%s @a=10%%, %s @a=5%%'
              % (var1, var2, res, 'NE' if res[1] < .01116 else ' E', 'NE' if res[1] < .00568 else ' E'),
              file=sys.stderr)
    print('pairwise Welch\'s t-test with Bonferroni correction:', file=sys.stderr)
    welch('IHA', 'MA1')
    welch('IHA', 'MA2')
    welch('IHA', 'MA3')
    welch('GAtw', 'MA1')
    welch('GAtw', 'MA2')
    welch('GAtw', 'MA3')
    welch('MA1', 'MA2')
    welch('MA1', 'MA3')
    welch('MA2', 'MA3')

    def mannwhitneyu(var1, var2):
        # Same report as welch(), using the Mann-Whitney U test; a failing
        # test is reported on stdout instead of aborting the run.
        try:
            res = STATS.mannwhitneyu(allvals_dict[var1], allvals_dict[var2])
            print('%4s vs %s u,p=%r => \t%s @a=10%%, %s @a=5%%'
                  % (var1, var2, res, 'NE' if res[1] < .01116 else ' E', 'NE' if res[1] < .00568 else ' E'),
                  file=sys.stderr)
        except Exception as e:
            print('%4s vs %s failed: %r' % (var1, var2, e))
    print('pairwise Mann-Whitney U test with Bonferroni correction:', file=sys.stderr)
    mannwhitneyu('IHA', 'MA1')
    mannwhitneyu('IHA', 'MA2')
    mannwhitneyu('IHA', 'MA3')
    mannwhitneyu('GAtw', 'MA1')
    mannwhitneyu('GAtw', 'MA2')
    mannwhitneyu('GAtw', 'MA3')
    mannwhitneyu('MA1', 'MA2')
    mannwhitneyu('MA1', 'MA3')
    mannwhitneyu('MA2', 'MA3')

    # Table header; the %(...)s placeholders receive the integer digit
    # counts and the escaped instance name.
    #latex = [r'\begin{sidefigure}{caption={Results for instance \Instance{%(instanceTexEsc)s}},label={fig:%(instanceFileEsc)s-results},place={htbp}}''\n'
    #r' \begin{center}''\n'
    latex = [r'\begin{table}[hbtp]''\n'
             r' \caption{Results for instance \Instance{%(instanceTexEsc)s}}''\n'
             r' \label{fig:%(instanceFileEsc)s-results}''\n'
             r' \centering\small''\n'
             r' \begin{tabular}{l S[table-format=%(best)s] S[table-format=%(best_time)s.1]%%''\n'
             r' S[table-format=%(mean)s.1,table-number-alignment=right] @{$\,\pm\,$} S[table-format=%(stddev)s.1,table-number-alignment=left]''\n'
             r' S[table-format=2]} \toprule''\n'
             r' & \multicolumn{2}{c}{\header{Best}} & \multicolumn{2}{c}{\header{Average}} & \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}''\n'
             r' & \header{treewidth} & \header{seconds} & \multicolumn{2}{c}{\header{treewidth}} & \header{samples} \\ \midrule'
             % dict(integer_digits.items() | dict(instanceTexEsc=instance.replace('_', r'\textunderscore{}'), instanceFileEsc=instance.replace('_', '-')).items())]
    for row in rows:
        # One table row per variant; the sample count is hard-coded to "20".
        latex.append(' ' * (3 * 3) + ' & '.join([row['variant'], str(row['best']), str(row['best_time']), str(row['mean']), str(row['stddev']), "20"]) + r'\\')
    latex.append(r' \bottomrule''\n'
                 r' \end{tabular}''\n'
                 r'\end{table}')
    #r' \end{center}''\n'
    #r'\end{sidefigure}')
    with open('validation-validationset-%s-results.tex' % instance.replace('_', '-'), 'w') as f:
        print('\n'.join(latex), file=f)
def skewtest(s):
    """Return the skew-test statistic of ``s``, or 1.0 for short samples.

    Samples with fewer than 10 elements are not tested and yield 1.0;
    otherwise the first element of ``stats.skewtest(s)`` is returned.
    """
    enough_samples = len(s) >= 10
    return stats.skewtest(s)[0] if enough_samples else 1.0
############################################################################### # Distribution statistics # Stats on normal distribution mean_flux = np.mean(interface_fluxes) # print('Mean Flux:', mean_flux) median_flux = np.median(interface_fluxes) # print('Median Flux:', median_flux) stdev_flux = np.std(interface_fluxes) # print('Std Dev Flux:', stdev_flux) skewness = stats.skew(abs(interface_fluxes)) # print('skewness:', skewness) z_score, p_value = stats.skewtest(abs(interface_fluxes)) # print('z-score:', z_score) # Stats on lognormal distribution interface_fluxes_log = -np.log(abs(interface_fluxes)) mean_flux_log = np.mean(interface_fluxes_log) # print('Mean ln(Flux):', mean_flux_log) median_flux_log = np.median(interface_fluxes_log) # print('Median ln(Flux):', median_flux_log) stdev_flux_log = np.std(interface_fluxes_log) # print('Std Dev ln(Flux):', stdev_flux_log) skewness_log = stats.skew(interface_fluxes_log)
def BasicSummary1(series):
    """Return five shape statistics of ``series``, rounded to 6 decimals.

    The result is a ``pd.Series`` with a default integer index holding, in
    order: skewness, skew-test p-value, kurtosis, kurtosis-test p-value and
    the coefficient of variation (``stats.variation``).

    The unused ``series_len`` local of the original was removed.
    """
    basiclist = [
        stats.skew(series),
        stats.skewtest(series)[1],
        stats.kurtosis(series),
        stats.kurtosistest(series)[1],
        stats.variation(series),
    ]
    return np.round(pd.Series(basiclist), decimals=6)
def skewnessp(self, x):
    """Return the second element (the p-value) of ``skewtest(x)``."""
    result = skewtest(x)
    return result[1]
def skewness(self, x):
    """Return the first element (the test statistic) of ``skewtest(x)``."""
    result = skewtest(x)
    return result[0]