Пример #1
0
    def summary_diff_dtype(x):
        """Return a short text summary of column ``x`` chosen by its dtype.

        * ``object`` / ``bool`` / ``category`` columns with at most ``max_lev``
          distinct values: every level followed by its share in percent.
        * ``float64`` / ``int64`` columns: quantiles, mean, std, coefficient
          of variation and skewness.  With more than 8 non-null values a ``*``
          marks a skewtest p-value <= 0.05; the same figures are added for the
          log of the data when it has no negative values and more than 8
          positive ones.
        * datetime columns: the five quantiles formatted by ``print_list``.
        * anything else: an empty string.

        ``max_lev``, ``in_cell_next_line``, ``in_cell_next`` and
        ``print_list`` are not defined here; they come from the enclosing
        scope.
        """
        if x.dtype.name in ['object', 'bool', 'category'
                            ] and len(x.unique()) <= max_lev:
            vc = x.value_counts(dropna=False, normalize=True)
            # Join instead of appending a separator and slicing it off again:
            # the old ``s[:-2]`` only removed half of a trailing '<br>'.
            sep = '<br>' if in_cell_next_line else ', '
            return sep.join(f'{name} {v*100:>2.0f}%'
                            for name, v in zip(vc.index, vc.values))
        elif x.dtype.name in ['float64', 'int64']:
            # The backslash continuations keep each following line's leading
            # spaces inside the string literal.
            o = f'quantiles: {x.quantile(q=[0, 0.25, 0.5, 0.75, 1]).values.tolist()}{in_cell_next} \
                mean: {x.mean():.2f}\
                std: {x.std():.2f} \
                cv: {x.std()/x.mean():.2f}{in_cell_next}\
                skew: {skew(x[x.notnull()]):.2f}'

            if sum(x.notnull()) > 8:  # requirement of skewtest
                p = skewtest(x[x.notnull()]).pvalue
                o += '*' if p <= 0.05 else ''
                # Take logs only when nothing is negative and enough strictly
                # positive values remain; zeros and NaN are left out.
                positive = x[x > 0]
                if len(positive) > 8 and not (x < 0).any():
                    logged = np.log(positive)
                    o += f'{in_cell_next}log skew: {skew(logged):.2f}'
                    p = skewtest(logged).pvalue
                    # was ``p != p and p <= 0.05``, which can never be true
                    o += '*' if p <= 0.05 else ''
            return o
        elif 'datetime' in x.dtype.name:
            qs = x.quantile(q=[0, 0.25, 0.5, 0.75, 1]).values
            return print_list([np.datetime_as_string(q)[0:16] for q in qs],
                              br=in_cell_next)
        else:
            return ''
def stats_on_list_of_sizes(in_list1, in_list2):
    """Describe the skewness of two samples and compare them.

    Runs ``stats.skewtest`` on each list and a two-sided ``mannwhitneyu``
    test between them.

    Returns a tab-separated string with three labelled fields:
    ``in_list1_skew`` (statistic and p-value of the skew test on the first
    list), ``in_list2_skew`` (the same for the second list) and
    ``Mann_whitney U test P value``.
    """
    skew_template = 'normal skewtest %s = %%6.3f pvalue = %%6.4f'
    in_list1_skew = (skew_template % 'in_list1') % tuple(
        stats.skewtest(in_list1))
    in_list2_skew = (skew_template % 'in_list2') % tuple(
        stats.skewtest(in_list2))
    # Only the p-value is reported. The U statistic, and a t-test the old
    # code also ran, never reached the returned string.
    _, Man_p_value = mannwhitneyu(in_list1,
                                  in_list2,
                                  alternative="two-sided")
    return "\t".join([
        "in_list1_skew: %s" % in_list1_skew,
        "in_list2_skew: %s" % in_list2_skew,
        "Mann_whitney U test P value: %s" % Man_p_value
    ])
Пример #3
0
def test_omni_normtest():
    """Compare omni_normtest, skewtest and kurtosistest with R (fBasics).

    ``x`` and ``omni_normtest`` are module-level names defined outside this
    function.
    """
    from scipy import stats

    # Reference values: row 0 holds the statistics, row 1 the p-values;
    # columns are omnibus, skewness and kurtosis test in that order.
    expected = np.array(
              [[3.994138321207883, -1.129304302161460,  1.648881473704978],
               [0.1357325110375005, 0.2587694866795507, 0.0991719192710234]])

    assert_almost_equal(omni_normtest(x), expected[:,0], 14)
    assert_almost_equal(stats.skewtest(x), expected[:,1], 14)
    assert_almost_equal(stats.kurtosistest(x), expected[:,2], 11)

    # Same three checks on the squared data.
    expected_sq = np.array(
              [[34.523210399523926,  4.429509162503833,  3.860396220444025],
               [3.186985686465249e-08, 9.444780064482572e-06, 1.132033129378485e-04]])
    squared = x**2

    #TODO: fix precision in these test with relative tolerance
    assert_almost_equal(omni_normtest(squared), expected_sq[:,0], 12)
    assert_almost_equal(stats.skewtest(squared), expected_sq[:,1], 12)
    assert_almost_equal(stats.kurtosistest(squared), expected_sq[:,2], 12)
Пример #4
0
def test_omni_normtest():
    """Check three normality tests against reference numbers from R fBasics.

    Uses the module-level sample ``x`` and the module-level function
    ``omni_normtest``; both are defined outside this function.
    """
    from scipy import stats

    # (function under test, column in the reference table)
    tests = ((omni_normtest, 0), (stats.skewtest, 1), (stats.kurtosistest, 2))

    # Each table has the statistics in row 0 and the p-values in row 1.
    reference = np.array(
              [[3.994138321207883, -1.129304302161460,  1.648881473704978],
               [0.1357325110375005, 0.2587694866795507, 0.0991719192710234]])
    for (func, column), decimals in zip(tests, (14, 14, 11)):
        assert_almost_equal(func(x), reference[:, column], decimals)

    reference = np.array(
              [[34.523210399523926,  4.429509162503833,  3.860396220444025],
               [3.186985686465249e-08, 9.444780064482572e-06, 1.132033129378485e-04]])
    x2 = x**2
    #TODO: fix precision in these test with relative tolerance
    for func, column in tests:
        assert_almost_equal(func(x2), reference[:, column], 12)
Пример #5
0
def pd_reducer_ratios(pt, nan_policy='raise', **kwargs):
    '''
    Compute mean, kurtosis-test, skew-test and correlation figures for the
    ``price`` and ``research`` columns of ``pt``.

    Parameters:
        pt: object exposing ``price`` and ``research`` as attributes; each
            must provide ``mean()`` and ``corr()`` (for example a pandas
            DataFrame with those two columns).
        nan_policy: forwarded to the scipy tests; must be a member of the
            module-level ``nan_policies``.
        **kwargs: accepted and ignored.

    Returns a dict with the keys ``prx_mean``, ``prx_kurtosis_st``,
    ``prx_kurtosis_pv``, ``prx_skewtest_st``, ``prx_skewtest_pv``,
    ``prx_corr``, the matching ``rsch_*`` keys, and ``kurtosis_ratios``
    (price kurtosis statistic divided by research kurtosis statistic).

    Raises:
        ImportError: pandas or scipy cannot be imported.
        AttributeError: ``nan_policy`` is not an accepted value.
    '''
    fn = "pandas_reducer_ratios"

    try:
        from scipy import stats
        import pandas as pd  # imported only to confirm it is installed
    except ImportError:
        # The message used to be raised with its '{}' placeholder unfilled.
        raise ImportError('{} needs pandas and scipy'.format(fn))

    # Enforce our default in case it gets out of hand.
    if nan_policy not in nan_policies:
        raise AttributeError(
            'nan_policy {} not accepted - try omit, raise or propagate'.format(
                nan_policy)
        )

    output = {}

    # Each test is run once; statistic and p-value come from the same result.
    output['prx_mean'] = pt.price.mean()
    prx_kurtosis = stats.kurtosistest(pt.price, nan_policy=nan_policy)
    output['prx_kurtosis_st'] = prx_kurtosis[0]
    output['prx_kurtosis_pv'] = prx_kurtosis[1]
    prx_skewtest = stats.skewtest(pt.price, nan_policy=nan_policy)
    output['prx_skewtest_st'] = prx_skewtest[0]
    output['prx_skewtest_pv'] = prx_skewtest[1]
    output['prx_corr'] = pt.price.corr(pt.research)

    output['rsch_mean'] = pt.research.mean()
    rsch_kurtosis = stats.kurtosistest(pt.research, nan_policy=nan_policy)
    output['rsch_kurtosis_st'] = rsch_kurtosis[0]
    output['rsch_kurtosis_pv'] = rsch_kurtosis[1]
    rsch_skewtest = stats.skewtest(pt.research, nan_policy=nan_policy)
    output['rsch_skewtest_st'] = rsch_skewtest[0]
    output['rsch_skewtest_pv'] = rsch_skewtest[1]
    output['rsch_corr'] = pt.research.corr(pt.price)

    kur_ratio = float(output['prx_kurtosis_st']) / output['rsch_kurtosis_st']

    output['kurtosis_ratios'] = kur_ratio
    return output
Пример #6
0
def apply_log(sf):
    """Log-transform, in place, every column of ``sf`` that is clearly skewed.

    A column is transformed when the z-score returned by ``stats.skewtest``
    lies outside [-1.96, 1.96].  ``sf`` itself is modified and returned.

    Rationale for when to apply a log transform:
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3591587/
    """
    for column in list(sf.columns):
        z_score = stats.skewtest(sf[column].tolist()).statistic
        if abs(z_score) > 1.96:
            sf[column] = sf[column].apply(numpy.log)
    return sf
Пример #7
0
    def compute_alpha(self, column):
        """Search for the shift that makes ``log(column + shift)`` least skewed.

        Candidate shifts are ``-column.min() + k * self.incr`` for
        k = 1, 2, ...  The search advances while the skewtest p-value keeps
        rising and k has not passed ``self.max_iter``.

        Returns the last shift that improved the p-value, provided that
        p-value beats the one of the untransformed column; otherwise None.
        """
        base_shift = -1 * column.min()
        step = 1
        best_pval = 0.0
        current_pval = skewtest(np.log(column + base_shift + self.incr)).pvalue

        while step <= self.max_iter and current_pval > best_pval:
            step += 1
            best_pval = current_pval
            shifted = column + base_shift + step * self.incr
            current_pval = skewtest(np.log(shifted)).pvalue

        untransformed_pval = skewtest(column).pvalue
        if best_pval > untransformed_pval:
            # best_pval belongs to the step before the current one
            return base_shift + (step - 1) * self.incr
        return None
Пример #8
0
def get_stats(a):
    """Compute and print summary statistics of ``a`` along its last axis.

    NaN values are ignored by the mean and variance and omitted by the scipy
    skew/kurtosis functions.

    Returns a dict with the keys ``mean``, ``var`` (sample variance, ddof=1),
    ``std``, ``skew``, ``skew_test``, ``kurt`` and ``kurt_test``; the two
    ``*_test`` entries are the test statistics converted to ``float``.
    """
    a = np.asarray(a)
    n = a.shape[-1]
    keepdims = a.ndim > 1
    M = np.nanmean(a, -1, keepdims=keepdims)
    variance = np.nanvar(a, -1, keepdims=keepdims, ddof=1)
    # NOTE(review): stored under the key 'std' but computed as
    # sqrt(var) / sqrt(n - 1) - confirm this is the intended quantity.
    SE = np.sqrt(variance) / sqrt(n - 1)
    SK = skew(a, -1, nan_policy='omit')
    KU = kurtosis(a, -1, nan_policy='omit')
    SK_t = skewtest(a, -1, nan_policy='omit')
    KU_t = kurtosistest(a, -1, nan_policy='omit')
    if keepdims:
        # restore the reduced axis so the shapes line up with M and variance
        SK = SK[..., None]
        KU = KU[..., None]
    else:
        SK = float(SK)
        KU = float(KU)
    stat = {
        'mean': M,
        'var': variance,
        'std': SE,
        'skew': SK,
        'skew_test': float(SK_t.statistic),
        'kurt': KU,
        'kurt_test': float(KU_t.statistic)
    }
    # Function-call form of print works on Python 2 and Python 3 alike.
    print('\n'.join(['{:>10}: {: .4f}'.format(k, v) for k, v in stat.items()]))
    return stat
Пример #9
0
 def test_maskedarray_input(self):
     # Add some masked values, test result doesn't change
     x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2
     xm = np.ma.array(np.r_[np.inf, x, 10], mask=np.r_[True, [False] * x.size, True])
     assert_allclose(mstats.normaltest(xm), stats.normaltest(x))
     assert_allclose(mstats.skewtest(xm), stats.skewtest(x))
     assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
Пример #10
0
def normality_check(feature_group, output_path):
    """Run three normality tests on a feature group's scores and print them.

    Parameters:
        feature_group: object providing ``isEmpty()`` and ``get_scores()``.
        output_path: not used by this function.

    Returns ``False`` when the group is empty; otherwise a 3-tuple of
    booleans ``(skewtest, kurtosistest, normaltest)``, each true when that
    test's p-value is above 0.05.
    """
    if feature_group.isEmpty():
        return False

    # Fetch the scores once and reuse them for all three tests.
    scores = feature_group.get_scores()
    sk_test = stats.skewtest(scores)
    kr_test = stats.kurtosistest(scores)
    normaltest = stats.normaltest(scores)

    temp = '''

			Normality Test P-Values
		------------------------------------
		 Kurtosis   |  {0}
		 Skewness   |  {1}
		 NormalTest |  {2}


	'''

    result = temp.format(kr_test[1], sk_test[1], normaltest[1])

    # Function-call form of print works on Python 2 and Python 3 alike.
    print(result)

    tests = (sk_test[1] > 0.05, kr_test[1] > 0.05, normaltest[1] > 0.05)

    return tests
def stats_on_list_of_sizes(vals):
    """Return the ``stats.skewtest`` result for ``vals`` as one text line.

    The line reads ``normal skewtest vals = <statistic> pvalue = <pvalue>``.
    """
    statistic, pvalue = stats.skewtest(vals)
    template = 'normal skewtest vals = %6.3f pvalue = %6.4f'
    return template % (statistic, pvalue)
Пример #12
0
def _whisker_limits(values):
    """Return the (low, high) boxplot whisker ends of ``values`` (1.5 * IQR rule)."""
    values = np.asarray(values, dtype=float)
    q1, q3 = np.percentile(values, [25, 75])
    reach = 1.5 * (q3 - q1)
    within_low = values[values >= q1 - reach]
    within_high = values[values <= q3 + reach]
    # A whisker ends at the most extreme observation still inside the fence
    # and never crosses its own quartile.
    low = within_low.min() if within_low.size and within_low.min() <= q1 else q1
    high = within_high.max() if within_high.size and within_high.max() >= q3 else q3
    return low, high


def create_bins(df, attribute):
    """Choose histogram bins for a numerical variable based on its skewness.

    Parameters
    ----------
    df : DataFrame holding the variable.
    attribute : name of the column to bin.

    Method
    ------
    1. Outliers are set aside: only the samples lying between the two boxplot
       whisker ends (1.5 * IQR beyond the quartiles) are kept for the test.
    2. A skew test is run on those samples.  Its null hypothesis is that the
       skewness of the population the sample was drawn from equals that of a
       normal distribution, i.e. 0.
    3. If the test rejects at the 5 % level, the bin edges come from the
       'doane' estimator, which takes skewness into account; otherwise the
       'auto' estimator is used.  Both estimators are described in the
       ``numpy.histogram`` documentation.  The edges are always computed on
       the full column, outliers included.

    Returns
    -------
    A list of ``(left_edge, right_edge)`` tuples, one per bin.
    """
    values = df[attribute]

    # Whisker ends are computed directly, so no pyplot figure is created or
    # closed as a side effect.
    low, high = _whisker_limits(values)

    # Apply both bounds: the previous filter dropped only the low outliers.
    inliers = values[(values >= low) & (values <= high)]
    skew_pvalue = skewtest(inliers)[1]

    estimator = 'doane' if skew_pvalue < 0.05 else 'auto'
    bins = np.histogram(values, bins=estimator)[1]
    return [(bins[i], bins[i + 1]) for i in range(len(bins) - 1)]
Пример #13
0
def normality_check(feature_group,output_path):
	"""Run three normality tests on a feature group's scores and print them.

	Parameters:
		feature_group: object providing ``isEmpty()`` and ``get_scores()``.
		output_path: not used by this function.

	Returns ``False`` when the group is empty; otherwise a 3-tuple of
	booleans ``(skewtest, kurtosistest, normaltest)``, each true when that
	test's p-value is above 0.05.
	"""
	if feature_group.isEmpty():
		return False

	# Fetch the scores once and reuse them for all three tests.
	scores = feature_group.get_scores()
	sk_test = stats.skewtest(scores)
	kr_test = stats.kurtosistest(scores)
	normaltest = stats.normaltest(scores)

	temp = '''

			Normality Test P-Values
		------------------------------------
		 Kurtosis   |  {0}
		 Skewness   |  {1}
		 NormalTest |  {2}


	'''

	result = temp.format(kr_test[1],sk_test[1],normaltest[1])

	# Function-call form of print works on Python 2 and Python 3 alike.
	print(result)

	tests = (sk_test[1] > 0.05 ,kr_test[1] > 0.05 ,normaltest[1] > 0.05)

	return tests
Пример #14
0
def skewness(_: pathlib.Path, data: pd.DataFrame):
    """Return skewness and skew-test results of ``data`` as a DataFrame.

    The first argument is ignored.  The result has the columns ``skew``,
    ``statistic`` and ``pvalue``.
    """
    test_result = skewtest(data)
    columns = {"skew": skew(data)}
    columns["statistic"] = test_result[0]
    columns["pvalue"] = test_result[1]
    return pd.DataFrame(columns)
Пример #15
0
def print_market_information(benchmark):
    """Print descriptive and normality statistics of ``benchmark['returns']``.

    Reports daily and annualised (factor 252) mean and standard deviation,
    skewness and kurtosis with their test p-values, the combined normal-test
    p-value and the Anderson test result.  Nothing is returned.
    """
    returns = benchmark['returns']
    rule = "---------------------------------------------"

    print("RETURN BENCHMARK STATISTICS")
    print(rule)
    print("Mean of Daily  Log Returns %9.6f" % np.mean(returns))
    print("Std  of Daily  Log Returns %9.6f" % np.std(returns))
    annual_mean = np.mean(returns) * 252
    print("Mean of Annua. Log Returns %9.6f" % annual_mean)
    annual_std = np.std(returns) * math.sqrt(252)
    print("Std  of Annua. Log Returns %9.6f" % annual_std)
    print(rule)
    print("Skew of Sample Log Returns %9.6f" % scs.skew(returns))
    skew_pvalue = scs.skewtest(returns)[1]
    print("Skew Normal Test p-value   %9.6f" % skew_pvalue)
    print(rule)
    print("Kurt of Sample Log Returns %9.6f" % scs.kurtosis(returns))
    kurt_pvalue = scs.kurtosistest(returns)[1]
    print("Kurt Normal Test p-value   %9.6f" % kurt_pvalue)
    print(rule)
    normal_pvalue = scs.normaltest(returns)[1]
    print("Normal Test p-value        %9.6f" % normal_pvalue)
    print(rule)
    print("Anderson Normality Test:		   ")
    print(stats.anderson(returns))
Пример #16
0
	def plot_feature_density(self, feat_names = None, kind="density", bins = 30):
		"""
		Plot the distribution of feature values, one subplot per feature.

		feat_names: features of interest; by default everything returned by
			self.find_numerical_features()
		kind: "density" for a density plot, anything else for a histogram
		bins: number of histogram bins (used only for histograms)

		Each subplot title shows the skewtest z-score and p-value of the
		feature.  Features that cannot be tested or plotted are skipped.
		"""
		## numerical features
		if feat_names is None:
			feat_names = self.find_numerical_features()
		# always an array, so that .shape is available below
		feat_names = np.asarray(feat_names)
		df = self.data.loc[:, feat_names]
		nrows, ncols = int(math.ceil(feat_names.shape[0] / 3.)), 3
		fig, axes = plt.subplots(nrows = nrows, ncols = ncols, figsize = (ncols * 6, nrows * 4))
		fig.subplots_adjust(wspace = 0.25, hspace = 0.5)
		axes = axes.ravel()
		for ax, f in zip(axes, feat_names):
			try:
				values = df[f].dropna()
				zscore, pvalue = stats.skewtest(values)
				# compare by value: ``is`` on a string literal tests identity
				if kind == 'density':
					values.plot(kind = kind, ax = ax, rot = 90)
				else:
					_ = ax.hist(values, bins = bins)
				ax.set_title("zscore=%.2g, pvalue=%.2g" % (zscore, pvalue))
				ax.set_xlabel(f)
			except Exception:
				# best effort: leave this subplot empty and move on, without
				# swallowing KeyboardInterrupt/SystemExit as a bare except did
				continue
Пример #17
0
 def sampleIsNormal(self, sample):
     """Decide whether every numeric column of ``sample`` may be treated as normal.

     Returns False for an empty sample, for a numeric column with fewer than
     ``Typedef.normalUnsafe`` values, or for a column that fails the skewness
     or unimodality check.  Columns with more than ``Typedef.normalSafe``
     values are accepted without testing.
     """
     if not sample:
         return False
     for col in range(len(self.columnNames)):
         if self.colTypes[col] != Typedef.numeric:
             continue
         values = np.array([float(entry[col]) for entry in sample])
         if self.removeOutliersCheck:
             values = self.removeOutliers(values)
         size = len(values)
         if size < Typedef.normalUnsafe:
             return False
         # Large samples are assumed normal (Central Limit Theorem), so the
         # tests below are skipped for them.
         if size > Typedef.normalSafe:
             continue
         if size > Typedef.normalUnsafe:
             # Test for skewness.
             if st.skewtest(values)[1] < Typedef.skewThreshold:
                 return False
             # Test for unimodality.
             sample_skew = st.skew(values)
             sample_kurtosis = st.kurtosis(values)
             if (np.square(sample_skew) - sample_kurtosis) > Typedef.unimodalityThreshold:
                 return False
     return True
Пример #18
0
    def __init__(self, gene_subset=None):
        """Load expression data and cancer-type labels and derive gene summaries.

        gene_subset: optional collection of gene identifiers; when given,
            only rows of the expression table whose index is in it are kept.

        Sets ``gexp``, ``gexp_genes`` (row medians, descending),
        ``gexp_genes_std`` (row standard deviations, descending),
        ``gexp_genes_skew`` (skewtest statistic per row), ``cancer_type``
        and ``cancer_type_palette``.
        """
        self.gexp = pd.read_csv(self.GEXP_FILE, index_col=0, sep="\t")

        if gene_subset is not None:
            self.gexp = self.gexp[self.gexp.index.isin(gene_subset)]

        self.gexp_genes = self.gexp.median(1).sort_values(ascending=False)
        self.gexp_genes_std = self.gexp.std(1).sort_values(ascending=False)
        self.gexp_genes_skew = pd.Series(skewtest(self.gexp.T)[0],
                                         index=self.gexp.index)

        self.cancer_type = pd.read_csv(self.CANCER_TYPE_FILE,
                                       sep="\t",
                                       header=None,
                                       index_col=0)[1]
        # Expression columns without a cancer-type entry are labelled "Normal".
        unlabelled = pd.Series({
            x: "Normal"
            for x in self.gexp.columns if x not in self.cancer_type
        })
        # pd.concat replaces Series.append, which pandas 2.0 removed.
        self.cancer_type = pd.concat([self.cancer_type, unlabelled])

        colors = (sns.color_palette("tab20c").as_hex() +
                  sns.color_palette("tab20b").as_hex())
        self.cancer_type_palette = dict(
            zip(natsorted(self.cancer_type.value_counts().index), colors))
def print_statistics(data):
    """Print descriptive and normality statistics for ``data['returns']``.

    Also prints the last entries of ``data['rea_vol']`` and
    ``data['rea_var']``.  Nothing is returned.
    """
    returns = data['returns']
    rule = "---------------------------------------------"

    print("RETURN SAMPLE STATISTICS")
    print(rule)
    print("Mean of Daily  Log Returns %9.6f" % np.mean(returns))
    print("Std  of Daily  Log Returns %9.6f" % np.std(returns))
    annual_mean = np.mean(returns) * 252
    print("Mean of Annua. Log Returns %9.6f" % annual_mean)
    annual_std = np.std(returns) * math.sqrt(252)
    print("Std  of Annua. Log Returns %9.6f" % annual_std)

    print(rule)
    print("Skew of Sample Log Returns %9.6f" % scs.skew(returns))
    skew_pvalue = scs.skewtest(returns)[1]
    print("Skew Normal Test p-value   %9.6f" % skew_pvalue)
    print(rule)
    print("Kurt of Sample Log Returns %9.6f" % scs.kurtosis(returns))
    kurt_pvalue = scs.kurtosistest(returns)[1]
    print("Kurt Normal Test p-value   %9.6f" % kurt_pvalue)

    normal_pvalue = scs.normaltest(returns)[1]
    print("Normal Test p-value        %9.6f" % normal_pvalue)
    print(rule)

    last_vol = data['rea_vol'].iloc[-1]
    print("Realized Volatility        %9.6f" % last_vol)
    last_var = data['rea_var'].iloc[-1]
    print("Realized Variance          %9.6f" % last_var)
Пример #20
0
 def test_skewtest_2D_notmasked(self):
     # a normal ndarray is passed to the masked function
     x = np.random.random((20,2))*20.
     r = stats.skewtest(x)
     rm = stats.mstats.skewtest(x)
     assert_equal(r[0][0],rm[0][0])
     assert_equal(r[0][1],rm[0][1])
Пример #21
0
def longTail(direction):
    """
    Check whether the sample ages look like a gaussian with a long tail on
    one side.  ``direction`` selects the side: +1 is a long, older tail and
    -1 is a long, younger tail.

    Needs at least 8 samples; with fewer, a result saying so is returned
    without running the skew test.
    """
    tail_side = 'older' if direction > 0 else 'younger'
    name = "long tail of " + tail_side + ' samples'

    if len(samples.sampleList) < 8:
        too_few = confidence.Confidence(confidence.Applic.df, confidence.Validity.plaus)
        return SimResult(too_few, name,
                         'not enough samples to check for tail',
                         'minimum of 8 samples needed')

    ages = [sample[__getAge()] for sample in samples.sampleList]
    res = stats.skewtest(ages)
    qual = __getQuality(res[1]/2)
    conf = __getConfidence((-1.5, -1, 0, 1, 1.5), res[0], qual)

    plot = __getPlot('id', __getAge())

    # When asked about a younger tail, the confidence is negated.
    if direction < 0:
        conf = -conf

    strength = 'weak' if qual < confidence.Validity.sound else 'strong'
    found_side = ' younger' if res[0] < 0 else 'n older'
    description = strength + ' evidence of a' + found_side + ' tail found'
    return SimResult(conf, name, description,
                     plot) #should be a plot of my samples and a gaussian
Пример #22
0
def get_stats(a):
    """Compute and print summary statistics of ``a`` along its last axis.

    NaN values are ignored by the mean and variance and omitted by the scipy
    skew/kurtosis functions.

    Returns a dict with the keys ``mean``, ``var`` (sample variance, ddof=1),
    ``std``, ``skew``, ``skew_test``, ``kurt`` and ``kurt_test``; the two
    ``*_test`` entries are the test statistics converted to ``float``.
    """
    a = np.asarray(a)
    n = a.shape[-1]
    keepdims = a.ndim > 1
    M = np.nanmean(a, -1, keepdims=keepdims)
    variance = np.nanvar(a, -1, keepdims=keepdims, ddof=1)
    # NOTE(review): stored under the key 'std' but computed as
    # sqrt(var) / sqrt(n - 1) - confirm this is the intended quantity.
    SE = np.sqrt(variance)/sqrt(n - 1)
    SK = skew(a, -1, nan_policy='omit')
    KU = kurtosis(a, -1, nan_policy='omit')
    SK_t = skewtest(a, -1, nan_policy='omit')
    KU_t = kurtosistest(a, -1, nan_policy='omit')
    if keepdims:
        # restore the reduced axis so the shapes line up with M and variance
        SK = SK[..., None]
        KU = KU[..., None]
    else:
        SK = float(SK)
        KU = float(KU)
    stat = {'mean': M, 'var': variance, 'std': SE,
            'skew': SK, 'skew_test': float(SK_t.statistic),
            'kurt': KU, 'kurt_test': float(KU_t.statistic)}
    # Function-call form of print works on Python 2 and Python 3 alike.
    print('\n'.join(['{:>10}: {: .4f}'.format(k, v) for k, v in stat.items()]))
    return stat
Пример #23
0
def test_for_side_skewness(df_side, lvl=3, hue='value'):
    """Skew-test the left-minus-right difference of every measure per level.

    For each level ``0 .. lvl-1`` and each measure name, the data is fetched
    with ``get_data_group_by_player_mean`` and ``stats.skewtest`` is applied
    to its ``'left'`` column minus its ``'right'`` column.

    Returns a DataFrame indexed by (``lvl_<i>``, measure) with the columns
    ``statistic`` and ``p-value``.
    """
    from scipy import stats

    stat_columns = ['statistic', 'p-value']
    levels = ["lvl_{}".format(k) for k in range(lvl)]
    measures = [
        "start_cm", "rel_pt", "amp_max_cop", 'amp_max_pel', 'amp_max_c7',
        "vel_max_cop", 'vel_max_pel', 'vel_max_c7', "overshoot", "dcm", "dtml",
        "rcm"
    ]

    # Start from an all-NaN table with one row per (level, measure) pair.
    row_index = pd.MultiIndex.from_product([levels, measures])
    blank = np.full((len(levels) * len(measures), len(stat_columns)), np.nan)
    table = pd.DataFrame(blank, index=row_index, columns=stat_columns)

    for level_no, level_name in enumerate(levels):
        for measure in measures:
            grouped = get_data_group_by_player_mean(df_side, level_no, measure, hue=hue)
            difference = grouped['left'] - grouped['right']
            table.loc[(level_name, measure)] = stats.skewtest(difference)

    return table
	def is_skewed_numerical_feature(self, feature_name):
		"""Tell whether a numerical feature is significantly skewed.

		Returns True only when the feature is numerical, its skewtest
		statistic is at least self.params["SKEWNESS_THR"] and the test's
		p-value is at most 0.01.  Missing values are dropped before testing.
		"""
		if not self.is_numerical_feature(feature_name):
			return False
		observed = self.data[feature_name].dropna()
		zscore, pvalue = stats.skewtest(observed)
		return bool(zscore >= self.params["SKEWNESS_THR"] and pvalue <= 0.01)
Пример #25
0
 def test_skewtest(self):
     """stats.skewtest and mstats.skewtest must agree on 1-D samples."""
     # this test is for 1D data
     for size in self.get_n():
         if size <= 8:
             # only sizes above 8 are exercised
             continue
         x, y, xm, ym = self.generate_xy_sample(size)
         plain = stats.skewtest(x)
         masked = stats.mstats.skewtest(xm)
         assert_allclose(plain[0], masked[0], rtol=1e-15)
Пример #26
0
 def test_maskedarray_input(self):
     # Add some masked values, test result doesn't change
     x = np.array((-2,-1,0,1,2,3)*4)**2
     xm = np.ma.array(np.r_[np.inf, x, 10],
                      mask=np.r_[True, [False] * x.size, True])
     assert_allclose(mstats.normaltest(xm), stats.normaltest(x))
     assert_allclose(mstats.skewtest(xm), stats.skewtest(x))
     assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
Пример #27
0
def doStatTests(X, labels, ml):
    """Relabel groups to -1 based on a skew test of their 1-D PCA projection.

    A one-component PCA is fitted on the rows of ``X`` labelled ``ml``.
    Every label group is projected with it and standardised.  A group is set
    to -1 when it has fewer than 9 rows, or when it is not the ``ml`` group
    and its skewtest p-value is above 0.01.

    Returns the label array.  It is the same object as ``labels``, so the
    caller's array is modified in place.
    """
    pca = PCA(n_components=1)
    pca.fit(X[labels == ml])
    # NOTE: an alias, not a copy - writes below also change ``labels``.
    labelsOut = labels
    for i in n.unique(labels):
        if len(X[labels == i]) == 0:
            continue
        Xpca = pca.transform(X[labels == i])
        Xpca = (Xpca - n.mean(Xpca)) / n.std(Xpca)
        if len(Xpca) < 9:
            labelsOut[labels == i] = -1
            continue
        if i == ml:
            continue
        if skewtest(Xpca)[1] > 0.01:
            labelsOut[labels == i] = -1
    return labelsOut
Пример #28
0
def test_normalitytests():
    """Check normaltest, skewtest and kurtosistest against values from R.

    Written with direct assertions instead of ``yield``: generator-style
    tests are not executed by current pytest.
    """
    # numbers verified with R: dagoTest in package fBasics
    st_normal, st_skew, st_kurt = (3.92371918, 1.98078826, -0.01403734)
    pv_normal, pv_skew, pv_kurt = (0.14059673, 0.04761502,  0.98880019)
    x = np.array((-2,-1,0,1,2,3)*4)**2
    assert_array_almost_equal(stats.normaltest(x), (st_normal, pv_normal))
    assert_array_almost_equal(stats.skewtest(x), (st_skew, pv_skew))
    assert_array_almost_equal(stats.kurtosistest(x), (st_kurt, pv_kurt))
Пример #29
0
 def test_skewtest(self):
     """stats.skewtest and mstats.skewtest must agree on 1-D samples."""
     from numpy.testing import assert_allclose

     # this test is for 1D data
     for n in self.get_n():
         if n > 8:
             x, y, xm, ym = self.generate_xy_sample(n)
             r = stats.skewtest(x)
             rm = stats.mstats.skewtest(xm)
             # The two functions are separate implementations, so compare
             # with a tight relative tolerance rather than exact equality.
             assert_allclose(r[0], rm[0], rtol=1e-15)
Пример #30
0
def create_scipy_features(base_features, sentinel):
    r"""Calculate the skew, kurtosis, and other statistical features
    for each row.

    Parameters
    ----------
    base_features : numpy array
        The feature dataframe.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    sp_features : numpy array
        The calculated SciPy features, imputed with ``impute_values`` and
        then standardised with ``StandardScaler``.
    sp_fnames : list
        The SciPy feature names, in the column order of ``sp_features``.

    """

    logger.info("Creating SciPy Features")

    # Generate scipy features

    logger.info("SciPy Feature: geometric mean")
    row_gmean = sps.gmean(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis")
    row_kurtosis = sps.kurtosis(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis test")
    row_ktest, _ = sps.kurtosistest(base_features, axis=1)
    logger.info("SciPy Feature: normal test")
    row_normal, _ = sps.normaltest(base_features, axis=1)
    logger.info("SciPy Feature: skew")
    row_skew = sps.skew(base_features, axis=1)
    logger.info("SciPy Feature: skew test")
    row_stest, _ = sps.skewtest(base_features, axis=1)
    logger.info("SciPy Feature: variation")
    row_var = sps.variation(base_features, axis=1)
    logger.info("SciPy Feature: signal-to-noise ratio")
    # scipy.stats.signaltonoise no longer exists in SciPy; compute the same
    # quantity directly: mean / population std per row, 0 where the std is 0.
    rows = np.asanyarray(base_features)
    row_mean = rows.mean(axis=1)
    row_std = rows.std(axis=1, ddof=0)
    row_stn = np.where(row_std == 0, 0, row_mean / row_std)
    logger.info("SciPy Feature: standard error of mean")
    row_sem = sps.sem(base_features, axis=1)

    sp_features = np.column_stack(
        (row_gmean, row_kurtosis, row_ktest, row_normal, row_skew, row_stest,
         row_var, row_stn, row_sem))
    sp_features = impute_values(sp_features, 'float64', sentinel)
    sp_features = StandardScaler().fit_transform(sp_features)

    # Return new SciPy features

    logger.info("SciPy Feature Count : %d", sp_features.shape[1])
    sp_fnames = [
        'sp_geometric_mean', 'sp_kurtosis', 'sp_kurtosis_test',
        'sp_normal_test', 'sp_skew', 'sp_skew_test', 'sp_variation',
        'sp_signal_to_noise', 'sp_standard_error_of_mean'
    ]
    return sp_features, sp_fnames
Пример #31
0
def skewness(data: pd.DataFrame, dropna: bool = False):
    """
    Return the skewness of each continuous variable

    Parameters
    ----------
    data: pd.DataFrame
        The DataFrame to be described
    dropna: bool
        If True, drop rows with NA values before calculating skew.  Otherwise the NA values propagate.

    Returns
    -------
    result: pd.DataFrame
        DataFrame listing three values for each continuous variable and NA for others: skew, zscore, and pvalue
        The test null hypothesis is that the skewness of the samples population is the same as the corresponding
        normal distribution.  The pvalue is the two-sided pvalue for the hypothesis test

    Examples
    --------
    >>> import clarite
    >>> clarite.describe.skewness(df)
         Variable         type      skew    zscore        pvalue
    0       pdias  categorical       NaN       NaN           NaN
    1   longindex  categorical       NaN       NaN           NaN
    2     durflow   continuous  2.754286  8.183515  2.756827e-16
    3      height   continuous  0.583514  2.735605  6.226567e-03
    4     begflow   continuous -0.316648 -1.549449  1.212738e-01
    """
    # Get continuous variables
    dtypes = _get_dtypes(data)
    continuous_idx = dtypes[dtypes == "continuous"].index

    # Format result df, starting with NA
    result = pd.DataFrame(
        data=None,
        index=dtypes.index,
        columns=["type", "skew", "zscore", "pvalue"],
        dtype=float,
    )
    result["type"] = dtypes

    # Calculate skew and statistical test
    nan_policy = "omit" if dropna else "propagate"
    if len(continuous_idx) > 0:
        continuous = data[continuous_idx]
        # Write only the continuous rows: assigning to the whole "skew"
        # column fails with a length mismatch as soon as any other variable
        # type is present.
        result.loc[continuous_idx, "skew"] = stats.skew(
            continuous, nan_policy=nan_policy)
        zscore, pvalue = stats.skewtest(continuous, nan_policy=nan_policy)
        result.loc[continuous_idx, "zscore"] = zscore
        result.loc[continuous_idx, "pvalue"] = pvalue

    # Format
    result.index.name = "Variable"
    result = result.reset_index()
    return result
Пример #32
0
def test_normalitytests():
    """Check normaltest, skewtest and kurtosistest against values from R.

    Written with direct assertions instead of ``yield``: generator-style
    tests are not executed by current pytest.
    """
    # numbers verified with R: dagoTest in package fBasics
    st_normal, st_skew, st_kurt = (3.92371918, 1.98078826, -0.01403734)
    pv_normal, pv_skew, pv_kurt = (0.14059673, 0.04761502, 0.98880019)
    x = np.array((-2, -1, 0, 1, 2, 3) * 4)**2
    assert_array_almost_equal(stats.normaltest(x), (st_normal, pv_normal))
    assert_array_almost_equal(stats.skewtest(x), (st_skew, pv_skew))
    assert_array_almost_equal(stats.kurtosistest(x), (st_kurt, pv_kurt))
Пример #33
0
def doStatTests(X,labels,ml):
    """Relabel groups of rows based on a skewness test of their 1-D PCA projection.

    A one-component PCA is fitted on the rows of ``X`` whose label equals
    ``ml``.  Every distinct label is then projected with that PCA and
    standardised (zero mean, unit standard deviation).  A group is relabelled
    ``-1`` when

    * it has fewer than 9 rows, or
    * it is not the ``ml`` group and the ``skewtest`` p-value of its
      projection is greater than 0.01.

    NOTE: ``labels`` is modified in place; the returned array is the very
    same object that was passed in.

    :param X: data rows, indexable with a boolean mask built from ``labels``
    :param labels: array of per-row labels
    :param ml: label of the reference group used to fit the PCA
    :return: ``labels`` after relabelling
    """
    projector = PCA(n_components=1)
    projector.fit(X[labels==ml])
    labelsOut = labels  # alias, not a copy
    for group in n.unique(labels):
        if len(X[labels==group])==0:
            continue
        projected = projector.transform(X[labels==group])
        projected = (projected-n.mean(projected))/n.std(projected)
        if len(projected) < 9:
            # too few rows for the test
            labelsOut[labels==group] = -1
        elif group != ml and skewtest(projected)[1] > 0.01: #or kurtosistest(Xpca)[1] > 1.:
            labelsOut[labels==group] = -1
    return labelsOut
Пример #34
0
def normality_stats(arr):
    """
    Report skewness, kurtosis and normality-test statistics with p-values.

    Each value is written with ``logging.info`` when ``ABuEnv.g_is_ipython``
    is truthy, otherwise with ``print``.

        eg:
                input:

                2014-07-25    223.57
                2014-07-28    224.82
                2014-07-29    225.01
                               ...
                2016-07-22    222.27
                2016-07-25    230.01
                2016-07-26    225.93

                output:

                array skew = -0.282635248604699
                array skew p-value = 0.009884539532576725
                array kurt = 0.009313464006726946
                array kurt p-value = 0.8403947352953821
                array norm = NormaltestResult(statistic=6.6961445106692237, pvalue=0.035152053009441256)
                array norm p-value = 0.035152053009441256

                input:

                            tsla    bidu    noah    sfun    goog    vips    aapl
                2014-07-25  223.57  226.50  15.32   12.110  589.02  21.349  97.67
                2014-07-28  224.82  225.80  16.13   12.450  590.60  21.548  99.02
                2014-07-29  225.01  220.00  16.75   12.220  585.61  21.190  98.38
                ...         ...     ...     ...     ...     ...     ...     ...
                2016-07-22  222.27  160.88  25.50   4.850   742.74  13.510  98.66
                2016-07-25  230.01  160.25  25.57   4.790   739.77  13.390  97.34
                2016-07-26  225.93  163.09  24.75   4.945   740.92  13.655  97.76

                output:

                array skew = [-0.2826 -0.2544  0.1456  1.0322  0.2095  0.095   0.1719]
                array skew p-value = [ 0.0099  0.0198  0.1779  0.      0.0539  0.3781  0.1124]
                array kurt = [ 0.0093 -0.8414 -0.4205  0.4802 -1.547  -0.9203 -1.2104]
                array kurt p-value = [ 0.8404  0.      0.0201  0.0461  1.      0.      0.    ]
                array norm = NormaltestResult(statistic=array([   6.6961,   52.85  ,    7.2163,   69.0119,    3.7161,
                69.3468, 347.229 ]), pvalue=array([ 0.0352,  0.    ,  0.0271,  0.    ,  0.156 ,  0.    ,  0.    ]))
                array norm p-value = [ 0.0352  0.      0.0271  0.      0.156   0.      0.    ]

    :param arr: pd.DataFrame or pd.Series or Iterable
    """
    emit = logging.info if ABuEnv.g_is_ipython else print

    # Each entry is evaluated only when its line is about to be emitted,
    # so the statistics are computed in the same order as they are reported.
    report = (
        ('array skew = {}', lambda: scs.skew(arr)),
        ('array skew p-value = {}', lambda: scs.skewtest(arr)[1]),
        ('array kurt = {}', lambda: scs.kurtosis(arr)),
        ('array kurt p-value = {}', lambda: scs.kurtosistest(arr)[1]),
        ('array norm = {}', lambda: scs.normaltest(arr)),
        ('array norm p-value = {}', lambda: scs.normaltest(arr)[1]),
    )
    for template, compute in report:
        emit(template.format(compute()))
Пример #35
0
def BasicSummary1(series):
    """Summarise the shape of *series* in a five-element ``pd.Series``.

    Positions 0-4 hold, in order: the skewness, the ``skewtest`` p-value,
    the kurtosis, the ``kurtosistest`` p-value and the coefficient of
    variation (``stats.variation``).  All values are rounded to 6 decimals.
    """
    skew_value = stats.skew(series)
    skew_pvalue = stats.skewtest(series)[1]
    kurt_value = stats.kurtosis(series)
    kurt_pvalue = stats.kurtosistest(series)[1]
    variation = stats.variation(series)
    summary = pd.Series(
        [skew_value, skew_pvalue, kurt_value, kurt_pvalue, variation])
    return np.round(summary, decimals=6)
Пример #36
0
def view_residuals():
    """Plot residual distributions and print normality tests for them.

    The data directory is taken from ``sys.argv[2]``.  Three space-delimited
    files are read from it:

    * ``residuals.txt`` (columns ``V_R``, ``mu_b``, ``mu_l``): one histogram
      per column is saved, then the Shapiro, skew and normality test results
      are printed for each column;
    * ``theta_errs.txt`` (columns ``R``, ``dT``): histogram of ``dT``;
    * ``uni_profile.txt`` (columns ``R_0``, ``Sigma``): line plot.

    Figures are written to the current working directory.
    """
    PATH = sys.argv[2]

    residuals = pd.read_csv(PATH + '/residuals.txt',
                            delimiter=" ",
                            names=['V_R', 'mu_b', 'mu_l'])
    figures = (('V_R', "VR.png"), ('mu_b', "mu_b.png"), ('mu_l', "mu_l.png"))
    for column, filename in figures:
        sns.distplot(residuals[column], kde=True, rug=True, bins=100)
        plt.savefig(filename)
        plt.clf()

    for column, _ in figures:
        for normality_test in (stat.shapiro, stat.skewtest, stat.normaltest):
            print(normality_test(residuals[column]))

    theta_errs = pd.read_csv(PATH + '/theta_errs.txt',
                             delimiter=" ",
                             names=['R', 'dT'])
    sns.distplot(theta_errs['dT'], kde=True, rug=True, bins=100)
    plt.savefig("theta_err.png")
    plt.clf()

    profile = pd.read_csv(PATH + '/uni_profile.txt',
                          delimiter=" ",
                          names=['R_0', 'Sigma'])
    sns.lineplot(x="R_0",
                 y="Sigma",
                 data=profile,
                 markers=True,
                 linewidth=1.5,
                 palette="tab10")
    plt.savefig("profile.png")
Пример #37
0
    def test_skewtest_2D_WithMask(self):
        nx = 2
        for n in self.get_n():
            if n > 8:
                x, y, xm, ym = self.generate_xy_sample2D(n, nx)
                r = stats.skewtest(x)
                rm = stats.mstats.skewtest(xm)

                assert_equal(r[0][0],rm[0][0])
                assert_equal(r[0][1],rm[0][1])
Пример #38
0
    def test_skewtest_2D_WithMask(self):
        nx = 2
        for n in self.get_n():
            if n > 8:
                x, y, xm, ym = self.generate_xy_sample2D(n, nx)
                r = stats.skewtest(x)
                rm = stats.mstats.skewtest(xm)

                assert_equal(r[0][0], rm[0][0])
                assert_equal(r[0][1], rm[0][1])
Пример #39
0
def noise(fname, x0 = 100, y0 = 100, maxrad = 30):
    """Plot and test the distribution of pixel values in an annulus of an image.

    The image is read from the first HDU of ``fname`` via ``pf.open``
    (presumably a FITS reader such as pyfits -- confirm against the file's
    imports).  Pixels whose distance from ``(x0, y0)`` lies strictly between
    ``maxrad`` and ``maxrad + 20`` are kept; all other pixels are zeroed and
    later dropped by the ``!= 0.`` filter.  Note that pixels inside the
    annulus whose value is exactly zero are dropped by that filter as well.

    Three histograms of the selected values are shown (Bayesian blocks,
    50 bins, Knuth's rule), then the Anderson-Darling, normality, skewness
    and kurtosis test results and the mean and median are printed.

    This function uses Python 2 ``print`` statements.

    fname  : path of the image file to open
    x0, y0 : centre of the annulus, truncated with ``int``
    maxrad : inner radius of the annulus; the outer radius is ``maxrad + 20``
    """
    from astroML.plotting import hist
    # NOTE(review): the opened file is never closed in this function.
    hdulist = pf.open(fname)
    im = hdulist[0].data
    #print np.mean(im), np.min(im), np.max(im)
    #print im[95:105, 95:105]
    # x0, y0 = 100, 100
    # xi holds the first-axis index of each pixel, yi the second-axis index.
    xi, yi = np.indices(im.shape)
    # NOTE(review): y0 is subtracted from the second-axis index and x0 from
    # the first-axis index -- confirm this matches the intended convention.
    R = np.sqrt( (yi - int(y0))**2. + (xi - int(x0))**2. )
    # phot_a is allocated here but not used anywhere else in this function.
    phot_a = np.zeros(maxrad + 1)
    phot_a[0] = 0
    
    # Multiply by the boolean annulus mask: pixels outside it become 0.
    bmasked = im * ((R > maxrad) * (R < maxrad + 20.))
    bdata = bmasked.flatten()
    #print bdata[bdata != 0.]
    #print len(bdata[bdata != 0.])
    #print len(bdata)
    
    plt.subplot(3, 1, 1)
    hist(bdata[bdata != 0.], bins = 'blocks')
    plt.xlabel('Flux')
    plt.ylabel('(Bayesian Blocks)')
    plt.title('Noise')
    #plt.show()
    
    plt.subplot(3, 1, 2)
    hist(bdata[bdata != 0.], bins = 50)
    plt.xlabel('Flux')
    plt.ylabel('(50 bins)')
    #plt.title('Noise (50 bins)')
    #plt.show()
    
    plt.subplot(3, 1, 3)
    hist(bdata[bdata != 0.], bins = 'knuth')
    plt.xlabel('Flux')
    plt.ylabel('(Knuth\'s Rule)')
    #plt.title('Noise (Knuth\'s Rule)')
    plt.show()
    
    # Anderson-Darling test against the normal distribution.
    A2, crit, sig = anderson(bdata[bdata != 0.], dist = 'norm')
    print 'A-D Statistic:', A2
    print ' CVs \t  Sig.'
    print np.vstack((crit, sig)).T

    normality = normaltest(bdata[bdata != 0.])
    print 'Normality:', normality

    skewness = skewtest(bdata[bdata != 0.])
    print 'Skewness:', skewness

    kurtosis = kurtosistest(bdata[bdata != 0.])
    print 'Kurtosis:', kurtosis

    print 'Mean:', np.mean(bdata[bdata != 0.])
    print 'Median:', np.median(bdata[bdata != 0.])
Пример #40
0
def normality_tests(arr):
    """Print skewness, kurtosis and normality-test results for a data set.

    Five lines are printed: the skewness, the skew-test p-value, the
    kurtosis, the kurtosis-test p-value and the normality-test p-value.

    :param arr: array-like data to generate the statistics on
    """
    # Each statistic is computed just before its line is printed.
    report = (
        ("Skew of data set %14.3f", lambda: scs.skew(arr)),
        ("Skew test p-value %14.3f", lambda: scs.skewtest(arr)[1]),
        ("Kurt of data set %14.3f", lambda: scs.kurtosis(arr)),
        ("Kurt test p-value %14.3f", lambda: scs.kurtosistest(arr)[1]),
        ("Norm test p-value %14.3f", lambda: scs.normaltest(arr)[1]),
    )
    for template, compute in report:
        print(template % compute())
Пример #41
0
def apply_log(sf):
    """Log-transform, in place, every column whose skew test is significant.

    For each column the ``skewtest`` z-statistic is computed; when its
    absolute value exceeds 1.96 the column is replaced by its natural log.
    On every iteration, exact zeros in the ``"index"`` column are replaced
    by 0.000001 (only that column is treated this way).

    :param sf: DataFrame that must contain an ``"index"`` column
    :return: the same DataFrame object, modified in place
    """
    #when to apply log: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3591587/
    for column in list(sf.columns):
        # The statistic is taken before the zero replacement below.
        z_statistic = stats.skewtest(sf[column].tolist()).statistic
        sf["index"] = sf["index"].replace(0.000000, 0.000001)

        if abs(z_statistic) > 1.96:
            sf[column] = sf[column].apply(numpy.log)
            print("applied log to ", column)
    return sf
def find_features_skewed(df, skew_thr, feat_names = None):
	"""
	Return the features whose skew-test statistic is at least skew_thr.

	The value compared against skew_thr is the z-statistic returned by
	``stats.skewtest`` for the column. Use plot_features_density to inspect
	the features and pick the threshold.

	df: DataFrame
	skew_thr : features with a statistic >= this value are returned
	feat_names : subset of features of interest; if None,
		``find_numerical_features(df)`` is used
	returns : numpy array of the selected feature names
	"""
	if feat_names is None:
		## find all numerical features by default
		feat_names = find_numerical_features(df)
	selected = []
	for name in feat_names:
		z_statistic = stats.skewtest(df[name])[0]
		if z_statistic >= skew_thr:
			selected.append(name)
	return np.asarray(selected)
    def work(self, input_items, output_items):
            """Pass the input through unchanged and flag skewed buffers.

            The first input is reshaped to ``self.N`` samples and
            ``stats.skewtest`` is run separately on its imaginary and real
            parts.  When both p-values are below 0.05 a message is printed
            and ``self.ctr`` is incremented.  A copy of the reshaped input
            is stored in ``self.buf`` and the input is copied to the first
            output.

            Presumably the ``work`` method of a GNU Radio block (it calls
            ``self.nitems_read``) -- confirm against the enclosing class.
            This method uses Python 2 ``print`` statements.

            Returns the length of the first output buffer.
            """
            in0 = input_items[0]
            out = output_items[0]
            #print in0.shape
            x = in0.reshape(self.N)
            # rvs is generated but not used anywhere else in this method.
            rvs = stats.norm.rvs(size=(1024,), loc=0.5, scale=1.5)
            #print x.shape[0]
            #print self.buf
            #print x.imag
            [D1,p1] = stats.skewtest(x.imag)
            [D2,p2] = stats.skewtest(x.real)
            #print D
            #print p
            # Both components must reject the skew test at the 5% level.
            if p1 < 0.05 and p2 < 0.05:
                # Under Python 2 this prints the parenthesised values as a tuple.
                print ('Not Gaussian, p is ', p1, " ",p2 , 'at sample', self.nitems_read(0))
                self.ctr=self.ctr+1
                print self.ctr

            self.buf=np.copy(x)
            out[:] = in0
            return len(output_items[0])
Пример #44
0
 def histo(self, s, x, y, bins=20):  # draw the histogram of a single variable
     """Draw a histogram with KDE for column ``s`` of ``self.data``.

     The plot is annotated at ``(x, y)`` with the statistics returned by
     ``stats.skewtest`` and ``stats.kurtosistest`` (the test statistics, not
     the sample skewness/kurtosis) and titled with the capitalised name.

     :param s: name of the column to plot
     :param x: x position of the annotation text
     :param y: y position of the annotation text
     :param bins: number of histogram bins
     :return: the object returned by ``sns.distplot``
     """
     column = self.data[s]
     skew_stat, _ = stats.skewtest(column)
     kurt_stat, _ = stats.kurtosistest(column)
     sns.set(style="darkgrid")
     plot = sns.distplot(column, kde=True, bins=bins)
     annotation = 'skewnes=%.2f\nkurtosis=%.2f' % (skew_stat, kurt_stat)
     plt.text(x=x, y=y, s=annotation)
     plt.suptitle('the Histograme of {:s}'.format(s.capitalize()))
     return plot
Пример #45
0
    def test_vs_nonmasked(self):
        x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2
        assert_array_almost_equal(mstats.normaltest(x), stats.normaltest(x))
        assert_array_almost_equal(mstats.skewtest(x), stats.skewtest(x))
        assert_array_almost_equal(mstats.kurtosistest(x), stats.kurtosistest(x))

        funcs = [stats.normaltest, stats.skewtest, stats.kurtosistest]
        mfuncs = [mstats.normaltest, mstats.skewtest, mstats.kurtosistest]
        x = [1, 2, 3, 4]
        for func, mfunc in zip(funcs, mfuncs):
            assert_raises(ValueError, func, x)
            assert_raises(ValueError, mfunc, x)
Пример #46
0
def normality_test(arr):
    '''
    Print a normality report based on skewness, kurtosis and a normal test.

    :param arr: obj to generate statistics on
    '''

    def lines():
        # Generated lazily so each statistic is computed right before
        # its line is printed.
        yield "Skew of data set  %14.3f" % scs.skew(arr)
        yield "Skew test p-value %14.3f" % scs.skewtest(arr)[1]
        yield "Kurt of sata set  %14.3f" % scs.kurtosis(arr)
        yield "Kurt test p-value %14.3f" % scs.kurtosistest(arr)[1]
        yield "Norm test p-value %14.3f" % scs.normaltest(arr)[1]

    for line in lines():
        print(line)
Пример #47
0
def normality_test(arr):
    '''Print normality statistics for the given data set.

    Five lines are printed: skewness, skew-test p-value, kurtosis,
    kurtosis-test p-value and normal-test p-value.

    Parameters
    ==========
    arr: ndarray
        object to generate statistics on
    '''
    # Single-argument print(...) calls produce the same output under
    # Python 2 and Python 3; the former print statements were a
    # SyntaxError on Python 3.
    print('Skew of data set %14.3f' % scs.skew(arr))
    print('Skew test p value %14.3f' % scs.skewtest(arr)[1])
    print('Kurt of data set %14.3f' % scs.kurtosis(arr))
    print('Kurt test p value %14.3f' % scs.kurtosistest(arr)[1])
    print('Normal test p value %14.3f' % scs.normaltest(arr)[1])
Пример #48
0
def stats_on_list_of_sizes(db_lens, assemb_lens):
    """Compare two lists of sequence lengths with several statistical tests.

    Returns one tab separated string with five fields, in this order:
    skew test of assemb_lens,
    skew test of db_lens,
    independent t-test of db_lens vs assemb_lens,
    Mann-Whitney U statistic (two-sided),
    Mann-Whitney p-value (two-sided)."""
    fields = [
        'normal skewtest assemb_lens = %6.3f pvalue = %6.4f'
        % stats.skewtest(assemb_lens),
        'normal skewtest db_lens = %6.3f pvalue = %6.4f'
        % stats.skewtest(db_lens),
        't-statistic = %6.3f pvalue = %6.4f'
        % stats.ttest_ind(db_lens, assemb_lens),
    ]
    u_value, p_value = mannwhitneyu(db_lens, assemb_lens,
                                    alternative="two-sided")
    fields.append(str(u_value))
    fields.append(str(p_value))
    return "\t".join(fields)
Пример #49
0
	def find_skewed_features(self):
		"""
		Return the names of the numerical features considered skewed.

		A feature (with missing values dropped) is reported when
		1. ``scipy.stats.skewtest`` gives a statistic >= self.skewness_thr
		   and a p-value <= 0.01, or
		2. the skew test itself fails and max_value / min_value is
		   >= self.skewness_thr -- not accurate, used only as a fallback.

		Only ordinary errors raised by the skew test trigger the fallback;
		KeyboardInterrupt and SystemExit are no longer swallowed.

		returns : numpy array of feature names
		"""
		skewed_feats = []
		for f in self.find_numerical_features():
			xs = self.data[f].dropna()
			try:
				skewness, pvalue = stats.skewtest(xs)
			except Exception:
				# The test could not be computed for this feature:
				# fall back to the crude max/min ratio heuristic.
				is_skewed = xs.max() * 1. / xs.min() >= self.skewness_thr
			else:
				is_skewed = skewness >= self.skewness_thr and pvalue <= 0.01
			if is_skewed:
				skewed_feats.append(f)
		return np.asarray(skewed_feats)
Пример #50
0
    def findBestCluster(self, clusters):
        """Pick the first cluster that passes all acceptance checks.

        Clusters with fewer than 8 points are ignored.  A remaining cluster
        is accepted when all of the following hold:

        * the norm of its per-column ``skewtest`` statistics is below 4.5;
        * the absolute difference between the variances of its first two
          columns is finite;
        * the mean of its second column is greater than ``self.h / 2``;
        * the distance between its mean and the mean of
          ``self.prevCluster`` is below ``self.globalMinDist`` (a NaN
          distance counts as 0).

        The search stops at the first accepted cluster.

        Side effects: on success ``self.globalMinDist`` is reduced by 50
        (but not below 25), ``self.notDetected`` is reset to 0 and
        ``self.prevCluster`` is set to the accepted cluster.  On failure
        ``self.globalMinDist`` grows by 10 and ``self.notDetected`` by 1.

        :param clusters: sequence of 2-D point arrays
        :return: the accepted cluster, or ``[]`` when none qualifies
        """
        minScore = 4.5
        minDistance = self.globalMinDist
        mindiffScore = np.inf
        prevAvg = np.average(self.prevCluster, axis=0)
        best_index = None
        #print("Nclusters",nclusters)
        for i, clust in enumerate(clusters):
            #print len(clust)
            if len(clust) < 8:
                continue
            score = np.linalg.norm(stats.skewtest(clust)[0])
            var = np.var(clust, axis=0)
            avg = np.average(clust, axis=0)
            dist = np.linalg.norm(prevAvg - avg)
            if np.isnan(dist):
                dist = 0
            diffScore = math.fabs(var[0] - var[1])
            # The original compared against the undefined name `minscore`,
            # which raised NameError for every cluster that reached this point.
            if (score < minScore and diffScore < mindiffScore
                    and avg[1] > self.h/2 and dist < minDistance):
                #print(score, diffScore, avg[1])
                best_index = i
                break

        if best_index is None:
            #print("NOT DETECTED")
            self.globalMinDist += 10
            self.notDetected += 1
            #return self.prevCluster
            return []

        self.globalMinDist -= 50
        if self.globalMinDist < 25:
            self.globalMinDist = 25
        self.notDetected = 0
        self.prevCluster = clusters[best_index]
        return clusters[best_index]
Пример #51
0
def print_statistics(data):
    """Print summary statistics of the log returns held in *data*.

    Reads ``data['returns']`` for the mean, standard deviation (daily and
    scaled by 252 / sqrt(252)), skewness, kurtosis and the corresponding
    normality-test p-values, and the last entries of ``data['rea_vol']``
    and ``data['rea_var']`` for the realized volatility and variance.

    Every line is emitted with a single-argument ``print(...)`` call, which
    gives the same output under Python 2 and Python 3; the former print
    statements were a SyntaxError on Python 3.
    """
    separator = "---------------------------------------------"
    returns = data['returns']
    print("RETURN SAMPLE STATISTICS")
    print(separator)
    print("Mean of Daily  Log Returns %9.6f" % np.mean(returns))
    print("Std  of Daily  Log Returns %9.6f" % np.std(returns))
    print("Mean of Annua. Log Returns %9.6f" % (np.mean(returns) * 252))
    print("Std  of Annua. Log Returns %9.6f" %
          (np.std(returns) * math.sqrt(252)))
    print(separator)
    print("Skew of Sample Log Returns %9.6f" % scs.skew(returns))
    print("Skew Normal Test p-value   %9.6f" % scs.skewtest(returns)[1])
    print(separator)
    print("Kurt of Sample Log Returns %9.6f" % scs.kurtosis(returns))
    print("Kurt Normal Test p-value   %9.6f" % scs.kurtosistest(returns)[1])
    print(separator)
    print("Normal Test p-value        %9.6f" % scs.normaltest(returns)[1])
    print(separator)
    print("Realized Volatility        %9.6f" % data['rea_vol'].iloc[-1])
    print("Realized Variance          %9.6f" % data['rea_var'].iloc[-1])
Пример #52
0
def normality_check(feature_group,group_name):
	"""Run skew, kurtosis and normality tests on the scores of a feature group.

	feature_group : object providing ``isEmpty()`` and ``get_scores()``
	group_name : label inserted into the header of the report

	Returns ``False`` when the group is empty. Otherwise returns a tuple
	``(result, tests)`` where ``result`` is the formatted report of the three
	p-values and ``tests`` holds, in the order (skewness, kurtosis, normal
	test), whether each p-value is greater than 0.05.
	"""
	if feature_group.isEmpty():
		return False

	# Scores are fetched once per test, in the order skew, kurtosis, normal.
	sk_p, kr_p, norm_p = [
		test(feature_group.get_scores())[1]
		for test in (stats.skewtest, stats.kurtosistest, stats.normaltest)
	]

	template = (
		"\n"
		"  Normality Test P-Values[{}]\n"
		"------------------------------------\n"
		"Kurtosis   |  {}\n"
		"Skewness   |  {}\n"
		"NormalTest |  {}\n"
		"\t"
	)
	result = template.format(group_name, kr_p, sk_p, norm_p)
	tests = (sk_p > 0.05, kr_p > 0.05, norm_p > 0.05)

	return result,tests
def plot_features_density(df, feat_names = None, plot_type="density", bins = 30):
	"""
	Plot the distribution of feature values on a grid with 3 columns.

	Each subplot is titled with the z-score and p-value returned by
	``stats.skewtest`` for the feature. Features that cannot be tested or
	plotted are skipped and leave their axes empty.

	df: DataFrame
	feat_names: features of interest, by default all numerical features
		(``find_numerical_features(df)``)
	plot_type: {"density", "hist"}
	bins: number of bins, used only for the histogram plot type
	"""
	## numerical features
	feat_names = find_numerical_features(df) if feat_names is None else feat_names
	# Floor division keeps the row count an integer on Python 3, and len()
	# accepts plain lists as well as arrays.
	nrows, ncols = len(feat_names) // 3 + 1, 3
	fig, axes = plt.subplots(nrows = nrows, ncols = ncols, figsize = (ncols * 6, nrows * 4))
	fig.subplots_adjust(wspace = 0.25, hspace = 0.5)
	axes = axes.ravel()
	for ax, f in zip(axes, feat_names):
		try:
			zscore, pvalue = stats.skewtest(df[f])
			# Compare by value: `is` on a string literal tests identity and
			# is not guaranteed to match an equal string.
			if plot_type == 'density':
				df[f].plot(kind = plot_type, ax = ax, rot = 90)
			else:
				_ = ax.hist(df[f], bins = bins)
			ax.set_title("zscore=%.2g, pvalue=%.2g" % (zscore, pvalue))
			ax.set_xlabel(f)
		except Exception:
			# Deliberately best-effort: one bad feature must not abort the
			# whole figure, but KeyboardInterrupt/SystemExit still propagate.
			continue
Пример #54
0
 def test_skewtest_2D_notmasked(self):
     # a normal ndarray is passed to the masked function
     x = np.random.random((20, 2)) * 20.
     r = stats.skewtest(x)
     rm = stats.mstats.skewtest(x)
     assert_allclose(np.asarray(r), np.asarray(rm))
Пример #55
0
def main():
    """Evaluate validation results for one instance and write a LaTeX table.

    For every variant in ``VARIANTS`` the per-seed minimum treewidths of
    ``instance`` are read from the SQLite database ``db_fn``, tested for
    normality (Shapiro-Wilk, skew test) and summarised (best value, fastest
    runtime reaching it, mean, standard deviation).  The variants are then
    compared with Bartlett, Levene, one-way ANOVA, Kruskal-Wallis, LSD,
    Tukey HSD, and pairwise Welch and Mann-Whitney U tests; these
    diagnostics go to stderr.  Finally a LaTeX table is written to
    ``validation-validationset-<instance>-results.tex``.

    Relies on the module-level names ``db_fn``, ``instance``, ``VARIANTS``,
    ``sqlite``, ``NP``, ``STATS``, ``LSD`` and ``statsmodels``.
    """
    def n_digits(num):
        # Number of integer digits of num, used to size the table columns.
        # NOTE(review): for an exact power of ten such as 10 the quotient is
        # 1.0, so this returns one digit fewer than expected -- confirm.
        if num <= 1:
            return 1
        return math.ceil(math.log(num) / math.log(10))

    db = sqlite.connect(db_fn)
    dbc = db.cursor()
    rows = []
    # Widest integer part seen per column across all variants.
    integer_digits = {'best': 0,
                      'best_time': 0,
                      'mean': 0,
                      'stddev': 0}
    allvals = []
    allvals_dict = {}
    for variant in VARIANTS:
        # NOTE(review): the SQL text is built with % formatting from module
        # constants; use bound parameters if these can ever be external input.
        query = ("select tw from (select min(treewidth) as tw from validationresults where variant='%(variant)s' and instance='%(instance)s' group by seed)")
        result = dbc.execute(query % {'variant': variant, 'instance': instance})
        vals = NP.array([row[0] for row in result])
        # NOTE: `min` shadows the builtin for the rest of main().
        min, mean, stddev = vals.min(), vals.mean(), vals.std()
        # print('%s: vals=%r' % (variant, vals), file=sys.stderr)
        W, p = STATS.shapiro(vals)
        # W is compared against hard-coded critical values (.905 and .884).
        print('%s: normal distribution? shapiro-wilk: W=%s (p=%s) %s@5%% %s@2%%' % (variant, W, p, 'no' if W <= .905 else 'yes', 'no' if W <= .884 else 'yes'), file=sys.stderr)
        z, p = STATS.skewtest(vals)
        # NOTE(review): the threshold here is .5, not .05 -- confirm it is intended.
        print('%s: normal distribution? skew test: (z=%s) p=%s => %s' % (variant, z, p, 'no' if p < .5 else 'yes'), file=sys.stderr)
        allvals.append(vals)
        allvals_dict[variant] = vals

        # Fastest runtime among the runs that reached the best treewidth.
        query = ("select min(runtime_s)"
                 " from validationresults"
                 " where variant='%(variant)s' and instance='%(instance)s' and treewidth='%(treewidth)s'")
        result = dbc.execute(query % {'variant': variant, 'instance': instance, 'treewidth': min})
        best_time = [row[0] for row in result][0]
        # print("%s: best=%s @ %ss, avg=%s +- %s" % (variant, min, best_time, mean, stddev), file=sys.stderr)
        row = {'variant': variant,
               'best': min,
               'best_time': round(best_time, 1),
               'mean': round(mean, 1),
               'stddev': round(stddev, 1)}
        rows.append(row)
        integer_digits['best'] = max(integer_digits['best'], n_digits(row['best']))
        integer_digits['best_time'] = max(integer_digits['best_time'], n_digits(row['best_time']))
        integer_digits['mean'] = max(integer_digits['mean'], n_digits(row['mean']))
        integer_digits['stddev'] = max(integer_digits['stddev'], n_digits(row['stddev']))
    db.close()
    # Tests across all variants: equality of variances, then of means.
    T, p = STATS.bartlett(*allvals)
    print('equal variances? bartlett: T=%s (p=%s) [vs Chi-Quadrat_{k-1=%s, alpha=.5}]' % (T, p, len(allvals) - 1), file=sys.stderr)
    W, p = STATS.levene(*allvals, center='mean')
    print('equal variances? levene (mean): (W=%s) p=%s' % (W, p), file=sys.stderr)
    W, p = STATS.levene(*allvals, center='median')
    print('equal variances? levene (median): (W=%s) p=%s' % (W, p), file=sys.stderr)
    F, p = STATS.f_oneway(*allvals)
    print('equal means? one-way ANOVA: F=%s, p=%s [vs F_{k-1=%s,n-k=%s}]' % (F, p, len(allvals) - 1, sum([len(x) for x in allvals]) - len(allvals)), file=sys.stderr)
    try:
        W, p = STATS.kruskal(*allvals)
        print('equal means? kruskal wallis: W=%s, p=%s' % (W, p), file=sys.stderr)
    except Exception as e:
        # Best effort: a failing Kruskal-Wallis test is reported on stdout.
        print(e)
    lsd = LSD.LSD(allvals, .05)
    print('LSD: %r' % lsd, file=sys.stderr)
    # The group labels assume exactly 20 samples per variant.
    print(statsmodels.stats.multicomp.pairwise_tukeyhsd(NP.array(allvals).ravel(), NP.array([[x] * 20 for x in VARIANTS]).ravel(), alpha=.10), file=sys.stderr)
    print(statsmodels.stats.multicomp.pairwise_tukeyhsd(NP.array(allvals).ravel(), NP.array([[x] * 20 for x in VARIANTS]).ravel(), alpha=.05), file=sys.stderr)

    def welch(var1, var2):
        # Welch's t-test (unequal variances) between two variants; 'NE' marks
        # a p-value below the hard-coded per-comparison threshold.
        res = STATS.ttest_ind(allvals_dict[var1], allvals_dict[var2], equal_var=False)
        print('%4s vs %s  t,p=%r => \t%s @a=10%%, %s @a=5%%'
              % (var1, var2, res, 'NE' if res[1] < .01116 else ' E', 'NE' if res[1] < .00568 else ' E'), file=sys.stderr)

    print('pairwise Welch\'s t-test with Bonferroni correction:', file=sys.stderr)
    welch('IHA', 'MA1')
    welch('IHA', 'MA2')
    welch('IHA', 'MA3')
    welch('GAtw', 'MA1')
    welch('GAtw', 'MA2')
    welch('GAtw', 'MA3')
    welch('MA1', 'MA2')
    welch('MA1', 'MA3')
    welch('MA2', 'MA3')

    def mannwhitneyu(var1, var2):
        # Mann-Whitney U test between two variants, using the same thresholds
        # as welch(); a failing test is reported on stdout instead of raising.
        try:
            res = STATS.mannwhitneyu(allvals_dict[var1], allvals_dict[var2])
            print('%4s vs %s  u,p=%r => \t%s @a=10%%, %s @a=5%%'
                  % (var1, var2, res, 'NE' if res[1] < .01116 else ' E', 'NE' if res[1] < .00568 else ' E'), file=sys.stderr)
        except Exception as e:
            print('%4s vs %s  failed: %r' % (var1, var2, e))

    print('pairwise Mann-Whitney U test with Bonferroni correction:', file=sys.stderr)
    mannwhitneyu('IHA', 'MA1')
    mannwhitneyu('IHA', 'MA2')
    mannwhitneyu('IHA', 'MA3')
    mannwhitneyu('GAtw', 'MA1')
    mannwhitneyu('GAtw', 'MA2')
    mannwhitneyu('GAtw', 'MA3')
    mannwhitneyu('MA1', 'MA2')
    mannwhitneyu('MA1', 'MA3')
    mannwhitneyu('MA2', 'MA3')

    # Table header: the column formats are sized from integer_digits, and the
    # instance name is escaped for use in LaTeX text and in the label.
    #latex = [r'\begin{sidefigure}{caption={Results for instance \Instance{%(instanceTexEsc)s}},label={fig:%(instanceFileEsc)s-results},place={htbp}}''\n'
             #r'   \begin{center}''\n'
    latex = [r'\begin{table}[hbtp]''\n'
             r'   \caption{Results for instance \Instance{%(instanceTexEsc)s}}''\n'
             r'   \label{fig:%(instanceFileEsc)s-results}''\n'
             r'   \centering\small''\n'
             r'      \begin{tabular}{l S[table-format=%(best)s] S[table-format=%(best_time)s.1]%%''\n'
             r'                      S[table-format=%(mean)s.1,table-number-alignment=right] @{$\,\pm\,$} S[table-format=%(stddev)s.1,table-number-alignment=left]''\n'
             r'                      S[table-format=2]} \toprule''\n'
             r'         & \multicolumn{2}{c}{\header{Best}} & \multicolumn{2}{c}{\header{Average}} & \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}''\n'
             r'         & \header{treewidth} & \header{seconds} & \multicolumn{2}{c}{\header{treewidth}} & \header{samples} \\ \midrule'
             % dict(integer_digits.items() | dict(instanceTexEsc=instance.replace('_', r'\textunderscore{}'), instanceFileEsc=instance.replace('_', '-')).items())]
    for row in rows:
        # One table line per variant; the sample count is hard-coded to 20.
        latex.append(' ' * (3 * 3) + ' & '.join([row['variant'], str(row['best']), str(row['best_time']), str(row['mean']), str(row['stddev']), "20"]) + r'\\')
    latex.append(r'         \bottomrule''\n'
                 r'      \end{tabular}''\n'
                 r'\end{table}')
                 #r'   \end{center}''\n'
                 #r'\end{sidefigure}')

    with open('validation-validationset-%s-results.tex' % instance.replace('_', '-'), 'w') as f:
        print('\n'.join(latex), file=f)
Пример #56
0
def skewtest(s):
  """Return the ``stats.skewtest`` statistic of *s*, or 1.0 for short input.

  Inputs with fewer than 10 elements are not tested and yield 1.0.
  """
  return stats.skewtest(s)[0] if len(s) >= 10 else 1.0
Пример #57
0
    ###############################################################################
    # Distribution statistics

    # Stats on normal distribution
    mean_flux = np.mean(interface_fluxes)
    # print('Mean Flux:', mean_flux)

    median_flux = np.median(interface_fluxes)
    # print('Median Flux:', median_flux)

    stdev_flux = np.std(interface_fluxes)
    # print('Std Dev Flux:', stdev_flux)

    skewness = stats.skew(abs(interface_fluxes))
    # print('skewness:', skewness)
    z_score, p_value = stats.skewtest(abs(interface_fluxes))
    # print('z-score:', z_score)

    # Stats on lognormal distribution
    interface_fluxes_log = -np.log(abs(interface_fluxes))

    mean_flux_log = np.mean(interface_fluxes_log)
    # print('Mean ln(Flux):', mean_flux_log)

    median_flux_log = np.median(interface_fluxes_log)
    # print('Median ln(Flux):', median_flux_log)

    stdev_flux_log = np.std(interface_fluxes_log)
    # print('Std Dev ln(Flux):', stdev_flux_log)

    skewness_log = stats.skew(interface_fluxes_log)
Пример #58
0
def BasicSummary1(series):
	"""Return five shape statistics of *series*, rounded to 6 decimals.

	The resulting ``pd.Series`` holds, in order: skewness, skew-test p-value,
	kurtosis, kurtosis-test p-value and coefficient of variation.
	"""
	skew_value = stats.skew(series)
	skew_pvalue = stats.skewtest(series)[1]
	kurt_value = stats.kurtosis(series)
	kurt_pvalue = stats.kurtosistest(series)[1]
	variation = stats.variation(series)
	values = [skew_value, skew_pvalue, kurt_value, kurt_pvalue, variation]
	return np.round(pd.Series(values), decimals=6)
 def skewnessp(self, x):
     """Return the p-value (second element) of ``skewtest`` applied to *x*."""
     _, p_value = skewtest(x)
     return p_value
 def skewness(self, x):
     """Return the test statistic (first element) of ``skewtest`` applied to *x*."""
     statistic, _ = skewtest(x)
     return statistic