Example 1
def _get_xy_dataset_statistics(x_values, y_values, fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0, x_fuzzy_range = 0.1, y_scalar = 1.0):
    '''
    A function which takes two lists of values of equal length with corresponding entries and returns a dict containing
    a variety of metrics.
    :param x_values: A list of values for the X-axis (experimental values).
    :param y_values: A list of values for the Y-axis (predicted values).
    :param fcorrect_x_cutoff: See get_xy_dataset_statistics.
    :param fcorrect_y_cutoff: See get_xy_dataset_statistics.
    :param x_fuzzy_range: See get_xy_dataset_statistics.
    :param y_scalar: See get_xy_dataset_statistics.
    :return: A table of statistics.
    '''
    from scipy.stats import pearsonr, spearmanr, normaltest, ks_2samp, kstest, norm
    assert(len(x_values) == len(y_values))
    return dict(
        pearsonr = pearsonr(x_values, y_values),
        spearmanr = spearmanr(x_values, y_values),
        gamma_CC = gamma_CC(x_values, y_values),
        MAE = mae(x_values, y_values),
        normaltestx = normaltest(x_values),
        normaltesty = normaltest(y_values),
        kstestx = kstest(x_values, 'norm'),
        kstesty = kstest(y_values, 'norm'),
        ks_2samp = ks_2samp(x_values, y_values),
        fraction_correct = fraction_correct(x_values, y_values, x_cutoff = fcorrect_x_cutoff, y_cutoff = fcorrect_y_cutoff),
        fraction_correct_fuzzy_linear = fraction_correct_fuzzy_linear(x_values, y_values, x_cutoff = fcorrect_x_cutoff, x_fuzzy_range = x_fuzzy_range, y_scalar = y_scalar),
    )
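For reference, a minimal self-contained sketch of the scipy calls this helper wraps, on made-up paired data (the module-specific metrics such as gamma_CC, mae and fraction_correct are omitted here):

import numpy as np
from scipy.stats import pearsonr, spearmanr, normaltest, kstest, ks_2samp

rng = np.random.default_rng(0)
x = rng.normal(size=50)                 # stand-in for experimental values
y = x + rng.normal(scale=0.5, size=50)  # stand-in for predicted values

print(pearsonr(x, y))     # (r, p-value)
print(spearmanr(x, y))    # (rho, p-value)
print(normaltest(x))      # D'Agostino-Pearson omnibus normality test
print(kstest(x, 'norm'))  # one-sample KS test against a standard normal
print(ks_2samp(x, y))     # two-sample KS test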
Example 2
def motifStats(data,motifSize,degree, usetotal=False):
	
	for corr in ('corr','lcorr','lacorr'):
		motifsNL = findMotifs(data,('NL',corr), motifSize, degree, usetotal)
		motifsMCI = findMotifs(data,('MCI',corr), motifSize, degree, usetotal)
		motifsAD = findMotifs(data,('AD',corr), motifSize, degree, usetotal)
		
		allMotifs = list(set(motifsNL.keys()) | set(motifsAD.keys()) | set(motifsMCI.keys()))
		
		datatype = "Total" if usetotal else "Percent"
		filename = "result2/{}_ks-stats_size-{}_deg-{}.txt".format(corr+datatype,motifSize,degree)
		with open(filename,'w') as f:
			f.write("{0:>10}{1:>15}{2:>15}{3:>15}{4:>15}{5:>15}\n".format('ID','MCI','AD','NORM NL','NORM MCI','NORM AD'))
			for key in allMotifs:
				NLdata = motifsNL.get(key,np.zeros(88))
				MCIdata = motifsMCI.get(key,np.zeros(88))
				ADdata = motifsAD.get(key,np.zeros(88))
				KSstatistic, MCIpvalue = stats.ks_2samp(MCIdata,NLdata)
				KSstatistic, ADpvalue = stats.ks_2samp(ADdata,NLdata)
				k2,NLnorm = stats.normaltest(NLdata)
				k2,MCInorm = stats.normaltest(MCIdata)
				k2,ADnorm = stats.normaltest(ADdata)
				if MCIpvalue<0.01 or ADpvalue<0.01:
					line = "*{0:>9}{1:15.3}{2:15.3}{3:15.3}{4:15.3}{5:15.3}\n"
				else:
					line = "{0:>10}{1:15.3}{2:15.3}{3:15.3}{4:15.3}{5:15.3}\n"
				f.write(line.format(str(int(key)),MCIpvalue,ADpvalue,NLnorm,MCInorm,ADnorm))
Example 3
def check_normality():
    '''Check if the distribution is normal.'''
    # Generate and show a distribution
    numData = 100
    
    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)   
    
    data = stats.norm.rvs(myMean, mySD, size=numData)
    plt.hist(data)
    plt.show()

    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    # The scipy normaltest is based on D'Agostino and Pearson's test that
    # combines skew and kurtosis to produce an omnibus test of normality.
    stats.normaltest(data)

    # Or you can check for normality with Kolmogorov-Smirnov test
    _,pVal = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
    if pVal > 0.05:
        print('Data are normally distributed')
    
    return pVal
def check_normality():
    '''Check if the distribution is normal.'''
    # Are the data normally distributed?
    numData = 100
    data = stats.norm.rvs(myMean, mySD, size=numData)
    stats.normaltest(data)
    _ = stats.probplot(data, plot=plt)
    show()
	def TestNormality(self, array):
		array = np.array(array)
		result = stats.normaltest(array)
		print result
		if result[1] < 0.2:
			print "Unlikely to be normally distributed"
			return False
		else:
			print "The dataset is likely to be normally distributed"
			return True
Example 6
def test_convert_uniform_column_to_normal(miner_df):
    logcf = lambda row, x: norm.logpdf(x[0], 0, 1)
    miner = MInER(miner_df, logcf, ['x_2'], n_models=2, use_mp=False)
    miner.init_models()
    miner.fit(20, 10)

    assert(not np.any(np.isnan(miner._df['x_2'].values)))
    assert(not np.any(np.isnan(miner._df['x_3'].values)))

    assert(normaltest(miner._df['x_2'])[1] > .05)
    assert(normaltest(miner._df['x_3'])[1] < .05)
Example 7
def determine_significance(mesa1, mesa2):
    """ Determines if two sets of values are statistically significant.

    In the best case, we can establish a normal distribution and equal
    variance. Once established, we use the independent t-test if the values
    have equal variance.  If the data are normal but the variances are unequal,
    Welch's t-test is used.
    http://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test
    http://en.wikipedia.org/wiki/Student%27s_t-test#Equal_or_unequal_sample_sizes.2C_unequal_variances

    When we cannot establish normality, the Mann-Whitney U-test is preferred,
    but that test is only effective with more than 20 samples.
    http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
    """
    # FIXME: Is it possible to determine these things with fewer samples?
    Distribution = Enum('Distribution', 'Normal, Non_normal Unknown')
    normality = Distribution.Normal
    try:
        k2, normal = stats.normaltest(mesa1)
        # FIXME: Unhardcode
        if (normal < NORMAL_CI):
            normality = Distribution.Non_normal

        k2, normal = stats.normaltest(mesa2)
        if (normal < NORMAL_CI):
            normality = Distribution.Non_normal
    except ValueError:
        normality = Distribution.Unknown

    equal_variance = is_equal_variance(mesa1, mesa2)

    if args.ttest:
        t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance)
        return (p, normality == Distribution.Normal,
                "t-test" if equal_variance else "Welch's")
    elif args.mannwhitney:
        u, p = stats.mannwhitneyu(mesa1, mesa2)
        p *= 2  # We want a 2-tailed p-value
        return (p, len(mesa1) < 20 or len(mesa2) < 20, "Mann-Whitney")

    if normality == Distribution.Normal:
        error_handler='raise'
        if np.var(mesa1) == 0 and equal_variance:
            error_handler='ignore'
        with np.errstate(divide=error_handler):
            t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance)
        return (p, False, "t-test" if equal_variance else "Welch's")
    else:
        u, p = stats.mannwhitneyu(mesa1, mesa2)
        p *= 2  # We want a 2-tailed p-value
        flawed = len(mesa1) < 20 or len(mesa2) < 20
        return (p, flawed, "Mann-Whitney")
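A condensed, self-contained sketch of the same decision rule (normal data get a t-test or Welch's test, otherwise Mann-Whitney), using synthetic data and an assumed 0.05 cutoff in place of NORMAL_CI and is_equal_variance:

import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
a = rng.normal(0.0, 1.0, size=40)
b = rng.normal(0.5, 2.0, size=40)

_, p_norm_a = stats.normaltest(a)
_, p_norm_b = stats.normaltest(b)
both_normal = p_norm_a >= 0.05 and p_norm_b >= 0.05
equal_variance = stats.levene(a, b).pvalue >= 0.05

if both_normal:
    # Student's t-test for equal variances, Welch's otherwise
    t, p = stats.ttest_ind(a, b, equal_var=equal_variance)
    test_name = "t-test" if equal_variance else "Welch's"
else:
    # Non-parametric fallback; passing alternative explicitly keeps it two-sided
    u, p = stats.mannwhitneyu(a, b, alternative='two-sided')
    test_name = "Mann-Whitney"
print(test_name, p)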
def check_normality():
    '''Check if the distribution is normal.'''
    
    # Set the parameters
    numData = 1000
    myMean = 0
    mySD = 3
    
    # To get reproducible values, I provide a seed value
    np.random.seed(1234)   
    
    # Generate and show random data
    data = stats.norm.rvs(myMean, mySD, size=numData)
    fewData = data[:100]
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    pVals = pd.Series()
    pFewVals = pd.Series()
    # The scipy normaltest is based on D'Agostino and Pearson's test that
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, pVals['Omnibus']    = stats.normaltest(data)
    _, pFewVals['Omnibus'] = stats.normaltest(fewData)

    # Shapiro-Wilk test
    _, pVals['Shapiro-Wilk']    = stats.shapiro(data)
    _, pFewVals['Shapiro-Wilk'] = stats.shapiro(fewData)
    
    # Or you can check for normality with Lilliefors-test
    _, pVals['Lilliefors']    = lillifors(data)
    _, pFewVals['Lilliefors'] = lillifors(fewData)
    
    # Alternatively with original Kolmogorov-Smirnov test
    _, pVals['Kolmogorov-Smirnov']    = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
    _, pFewVals['Kolmogorov-Smirnov'] = stats.kstest((fewData-np.mean(fewData))/np.std(fewData,ddof=1), 'norm')
    
    print('p-values for all {0} data points: ----------------'.format(len(data)))
    print(pVals)
    print('p-values for the first 100 data points: ----------------')
    print(pFewVals)
    
    if pVals['Omnibus'] > 0.05:
        print('Data are normally distributed')
    # --- >>> STOP stats <<< ---
    
    return pVals['Kolmogorov-Smirnov']
Example 9
def normalityTests(data):
    arr = N.zeros((data.shape[1]+1,data.shape[0]+1),N.object)
    mergeCTOA = concatenateRT(data.copy(), axis=0)
    mergeCTD  = concatenateRT(data.copy(), axis=1)
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            arr[j,i] = normaltest(data[i,j])
    for i, grp in enumerate(mergeCTOA):
        arr[-1,i] = normaltest(grp)
    for i, grp in enumerate(mergeCTD):
        arr[i,-1] = normaltest(grp)
    arr[-1,-1] = normaltest(N.hstack(data.flatten()))
    return arr
Example 10
def arima_handler(dta, start, end):
    #dta, x = data.dataHandler('./tmpfile00431',0.5)
    dta = pd.TimeSeries(dta)
    #dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700','2060'))
    dta.index = pd.Index(sm.tsa.datetools.dates_from_range(start,end))
    dta.plot(figsize=(12,8))

    fig = plt.figure(figsize=(12,8))
    ax1 = fig.add_subplot(211)
    fig = sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40, ax=ax1)
    ax2 = fig.add_subplot(212)
    fig = sm.graphics.tsa.plot_pacf(dta, lags=40, ax=ax2)

    arma_mod20 = sm.tsa.ARMA(dta, (2,0)).fit()
    #print(arma_mod20)

    arma_mod30 = sm.tsa.ARMA(dta, (3,0)).fit()
    #print(arma_mod30)

    print(arma_mod20.aic, arma_mod20.bic, arma_mod20.hqic)
    print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)

    if arma_mod20.aic < arma_mod30.aic:
        sm.stats.durbin_watson(arma_mod20.resid.values)
        fig = plt.figure(figsize=(12,8))
        ax = fig.add_subplot(111)
        ax = arma_mod20.resid.plot(ax=ax);

        resid = arma_mod20.resid
        stats.normaltest(resid)

        fig = plt.figure(figsize=(12,8))
        ax = fig.add_subplot(111)
        fig = qqplot(resid, line='q', ax=ax, fit=True)

        fig = plt.figure(figsize=(12,8))
        ax1 = fig.add_subplot(211)
        fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax1)
        ax2 = fig.add_subplot(212)
        fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)

        r,q,p = sm.tsa.acf(resid.values.squeeze(), qstat=True)
        data = np.c_[range(1,41), r[1:], q, p]
        #table = pandas.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
        table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
        #print(table.set_index('lag'))

        predict_sunspots = arma_mod20.predict(str(string.atoi(start)+360),str(string.atoi(end)+5), dynamic=True)
        #print(predict_sunspots)
        return predict_sunspots
Example 11
def calc_correlation(fname):    
    reader = csv.reader(open(fname,"rb"),delimiter='\t')
    next(reader)
    x = list(reader)
    data = np.array(x).astype('float')
    
    normal_a = stats.normaltest(data[:,0])[1]
    normal_b = stats.normaltest(data[:,1])[1]
    
    if (normal_a >= 0.05) & (normal_b >= 0.05):
        # both series are normally distributed
        return stats.pearsonr(data[:,0], data[:,1])[0]
    else:
        # not normally distributed
        return stats.spearmanr(data[:,0], data[:,1])[0]
Example 12
	def inspect_output_by_filter(self,rez,dat,doplot=False,test=False,
	                             sig_clips=[5, 3, 2], sig_test=[False,False,True]):
		p = rez.values()[0][1]
		myoutput = rez.values()[0][0]
		new  = rez.values()[0][2]
		filt = rez.keys()[0]

		ret = {}
		ret.update({"all": self._extract_info(p,myoutput.sd_beta,myoutput)})
		err = dat[2]
		tmp = (dat[1] - self.modelfunc_small_te(p,dat[0]))/err
		dof = tmp.shape[0] -  myoutput.beta.shape[0]
		chisq = (tmp**2).sum()
		ret['all'].update({"ndata": dat[0].shape[0], \
		                    "chisq": chisq, "dof": dof, "p_chi": chisqprob(chisq,dof),
		                    "normalcy_prob": normaltest(tmp)[1]})

		for s in enumerate(sig_clips):
			if sig_test[s[0]] and not test:
				continue
			sig = s[1]
			# get the indices of those inside and out of the clip area
			tmpisig = (abs(tmp) < sig).nonzero()[0]
			tmpisige = (abs(tmp) > sig).nonzero()[0]
			frac_less_than_sig =  float(tmpisig.shape[0])/dat[0].shape[0]
			# print frac_less_than_sig
			if frac_less_than_sig < 1.0:
				out = self._filt_run([dat[0][tmpisig],dat[1][tmpisig],err[tmpisig]],\
 			 					   	  filt,do_sim=False,vplot=False)
				p        = out[1]
				myoutput = out[0]
				t = "-test" if sig_test[s[0]] else ""
					
				ret.update({"sig" + str(sig) + t: self._extract_info(p,myoutput.sd_beta,myoutput)})
				tmp = (dat[1][tmpisig] - self.modelfunc_small_te(p,dat[0][tmpisig]))/err[tmpisig]
				dof = tmp.shape[0] - myoutput.beta.shape[0]
				chisq = (tmp**2).sum()
				try:
					ntest =  normaltest(tmp)[1]
				except:
					ntest = 0.0
				ret["sig" + str(sig) + t].update({"ndata": dat[0][tmpisig].shape[0], \
				                    "chisq": chisq, "dof": dof, "p_chi": chisqprob(chisq,dof),
				                    "normalcy_prob": ntest, "frac_data_remaining": frac_less_than_sig })
				if doplot:
					plot(dat[0][tmpisige],dat[1][tmpisige],".")
			
		return ret
Example 13
 def test_maskedarray_input(self):
     # Add some masked values, test result doesn't change
     x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2
     xm = np.ma.array(np.r_[np.inf, x, 10], mask=np.r_[True, [False] * x.size, True])
     assert_allclose(mstats.normaltest(xm), stats.normaltest(x))
     assert_allclose(mstats.skewtest(xm), stats.skewtest(x))
     assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
Example 14
def replace_outs(df, numOuts, df_outs_ind):
    """
    This has been replaced with "replace_outs2"
    """
    df_out = df.copy()
    out_row_inds, out_col_inds = np.random.randint(0, len(df_outs_ind.index), numOuts), \
                                 np.random.randint(0, len(df_outs_ind.columns), numOuts)

    for row, col in zip(out_row_inds, out_col_inds):
        array_col = df.iloc[:, col].dropna()
        z_score, p_val = stats.normaltest(array_col)

        if p_val > 0.05:  # this means the distribution is normal
            eps = 0.002 * np.random.random_sample(1) - 0.001  # epsilon is a random float in [-0.001, 0.001]
            # *** this threshold should be set in experiments
            df_out.iloc[row, col] = 3 * df.iloc[:, col].std() + eps
            # print("for row {0} and column {1} we have {2} and real val is {3}".format(row, col, df_out.iloc[row, col], df_in.iloc[row, col]))
            df_outs_ind.iloc[row, col] = 1

        else:
            q1, q3, iqr = tukey_vals(array_col)
            tukeyHL = [array_col.mean() + q3 + (3 * iqr), array_col.mean() - q1 - (3 * iqr)]
            df_out.iloc[row, col] = rnd.sample(tukeyHL, 1)
            df_outs_ind.iloc[row, col] = 1

    return df_out, df_outs_ind
Example 15
def check_normality():
    '''Check if the distribution is normal.'''
    # Generate and show a distribution
    numData = 100
    
    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)   
    
    data = stats.norm.rvs(myMean, mySD, size=numData)
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    pVals = pd.Series()
    # The scipy normaltest is based on D'Agostino and Pearson's test that
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, pVals['omnibus'] = stats.normaltest(data)

    # Shapiro-Wilk test
    _, pVals['Shapiro-Wilk'] = stats.shapiro(data)
    
    # Or you can check for normality with Lilliefors-test
    ksStats, pVals['Lilliefors'] = kstest_normal(data)
    
    # Alternatively with original Kolmogorov-Smirnov test
    _, pVals['KS'] = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
    
    print(pVals)
    if pVals['omnibus'] > 0.05:
        print('Data are normally distributed')
Example 16
    def gStats(self, missingValue=0.0):
        """dict of {geneID: (min,max,mean,median,std,stderr,
        Shapiro-Wilk(w,p),normaltest_chisq (D'Agostino and Pearson),...}
        """
        import scipy as S
        import scipy.stats as SS

        rv = {}
        for k, v in self.items():
            # print k,v
            va = S.array(self.gValues(k, missingValue))

            try:
                normaltest = SS.normaltest(va)
            except:
                normaltest = None
            try:
                shapiro = SS.shapiro(va)
            except:
                shapiro = None

            try:
                rv[k] = (va.min(), va.max(), va.mean(), SS.median(va), SS.std(va), SS.stderr(va), normaltest, shapiro)
            except:
                print k, va
                raise
        return rv
Example 17
def get_data(column, np_values, alpha):

    mvs = bayes_mvs(np_values, alpha)

    #report these metrics
    output = [
        present("Column", column),
        present("Length", len(np_values)),
        present("Unique", len(np.unique(np_values))),
        present("Min", np_values.min()),
        present("Max", np_values.max()),
        present("Mid-Range", (np_values.max() - np_values.min())/2),
        present("Range", np_values.max() - np_values.min()),
        present("Mean", np_values.mean()),
        present("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])),
        present("Variance", mvs[1][0]),
        present("Var-%s-CI" % alpha, tupleToString(mvs[1][1])),
        present("StdDev", mvs[2][0]),
        present("Std-%s-CI" % alpha, tupleToString(mvs[2][1])),
        present("Mode", stats.mode(np_values)[0][0]),
        present("Q1", stats.scoreatpercentile(np_values, 25)),
        present("Q2", stats.scoreatpercentile(np_values, 50)),
        present("Q3", stats.scoreatpercentile(np_values, 75)),
        present("Trimean", trimean(np_values)),
        present("Minhinge", midhinge(np_values)),
        present("Skewness", stats.skew(np_values)),
        present("Kurtosis", stats.kurtosis(np_values)),
        present("StdErr", sem(np_values)),
        present("Normal-P-value", normaltest(np_values)[1])
        ]
    return output
Example 18
def main(argv=sys.argv):
    route = Route()
    route.trace(argv[1])

    drtts = []

    # Normal Test #
    for ttl, hop in route.hops.items():
        drtts.append(hop.deltaRTTi)

    normal = stats.normaltest(drtts)
    print("** NormalTest **")
    print("k2: ", normal[0], " p-valor: ", normal[1])

    # Grubbs test #
    zscores = calculateZScore(drtts)

    N = len(drtts)
    sampleMean = calculateAverage(drtts)
    standarDeviation = calculateStandardDeviation(drtts)

    # Statistic
    G = (max(drtts) - sampleMean) / standarDeviation

    criticalValue = tDistribution[N]

    print("** GrubbsTest **")
    print("N: ", N)
    print("G: ", G)
    print("CriticalValue: ", criticalValue)

    if criticalValue != None and G > criticalValue:
        print("El DeltaRTT ", max(drtts), " es el enlace transatlantico")
Example 19
def oneGroup():
    '''Test of mean value of a single set of data'''
    
    print('Single group of data =========================================')
    
    # First get the data
    data = np.array([5260, 5470, 5640, 6180, 6390, 6515, 6805, 7515, 7515, 8230, 8770], dtype=np.float)
    checkValue = 7725   # value to compare the data to
    
    # 4.1.1. Normality test
    # We don't need the first parameter, so we just assign the output to the dummy variable "_"
    (_, p) = stats.normaltest(data)
    if p > 0.05:
        print('Data are distributed normally, p = {0}'.format(p))
        
    # 4.1.2. Do the one-sample t-test
    t, prob = stats.ttest_1samp(data, checkValue)
    if prob < 0.05:
        print('With the one-sample t-test, {0:4.2f} is significantly different from the mean (p={1:5.3f}).'.\
        format(checkValue, prob))
    else:
        print('No difference from reference value with one-sample t-test.')
    
    # 4.1.3. This implementation of the Wilcoxon test checks for the "difference" of one vector of data from zero
    (_,p) = stats.wilcoxon(data-checkValue)
    if p < 0.05:
        print('With the Wilcoxon test, {0:4.2f} is significantly different from the mean (p={1:5.3f}).'.\
        format(checkValue, p))
    else:
        print('No difference from reference value with Wilcoxon signed rank test.')
Example 20
def omni_normtest(resids, axis=0):
    """
    Omnibus test for normality

    Parameters
    ----------
    resid : array-like
    axis : int, optional
        Default is 0

    Returns
    -------
    Chi^2 score, two-tail probability
    """
    # TODO: change to exception in summary branch and catch in summary()
    #   behavior changed between scipy 0.9 and 0.10
    resids = np.asarray(resids)
    n = resids.shape[axis]
    if n < 8:
        from warnings import warn
        warn("omni_normtest is not valid with less than 8 observations; %i "
             "samples were given." % int(n), ValueWarning)
        return np.nan, np.nan

    return stats.normaltest(resids, axis=axis)
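A short usage sketch: with 8 or more observations the function above simply defers to scipy's D'Agostino-Pearson test (synthetic residuals here):

import numpy as np
from scipy import stats

resid = np.random.default_rng(2).normal(size=100)
k2, p = stats.normaltest(resid)  # what omni_normtest returns for n >= 8
print(k2, p)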
Example 21
def test_disk_distribution(diskclass, diskpar, n_expected):
    '''This is a separate test from test_disk_radius, because it's simpler
    to write if we don't have to worry about the inner hole.

    For the test itself: The results should be Poisson distributed (or, for large
    numbers, this will be almost normal).
    That makes testing it a little awkward in a short run time, thus the limits are
    fairly loose.

    This test is run for several extended sources, incl. Gaussian. Strictly speaking
    it should fail for a Gaussian distribution, but if the sigma is large enough it
    will pass a loose test (and still fail if things go catastrophically wrong,
    e.g. some test circles are outside the source).
    '''

    s = diskclass(coords=SkyCoord(213., -10., unit=u.deg), **diskpar)
    photons = s.generate_photons(1e5)

    n = np.empty(20)
    for i in range(len(n)):
        circ = SkyCoord((213. +  np.random.uniform(-0.1, .1)) * u.degree,
                       (- 10. + np.random.uniform(-0.1, .1)) * u.degree)
        d = circ.separation(SkyCoord(photons['ra'], photons['dec'], unit='deg'))
        n[i] = (d < 5. * u.arcmin).sum()
    s, p = normaltest(n)
    # assert a p value here that is so small that it's never going to be hit
    # by chance.
    assert p > .05
    # better: Test number of expected photons matches
    # Allow large variation so that this is not triggered by chance
    assert np.isclose(n.mean(), n_expected, rtol=.2)
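A self-contained illustration of the docstring's point that large-mean Poisson counts are close to normal, so normaltest rarely rejects them (synthetic counts, same loose threshold as above):

import numpy as np
from scipy.stats import normaltest

rng = np.random.default_rng(3)
counts = rng.poisson(lam=500, size=200)  # large-mean Poisson is approximately normal
stat, p = normaltest(counts)
print(p > 0.05)  # usually True, though the test is deliberately loose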
Example 22
    def gof(self, x, y, ye):
        '''
        Computes GoF test statistics and other diagnostic tests

        Returns:
        --------
        - GoF test: Chi^2, p-value, and ddof
        - Normality of residuals: K^2 and p-value
        '''
        res = {}
        resid = y - self(x)
        chisq = np.sum(((resid) / ye) ** 2)
        ddof = len(x) - len(filter(None, self.errors())) # number of estimated parameters
        chisq_pvalue = chisqprob(chisq, ddof)
        gof = (chisq, chisq_pvalue, ddof)
        resid = normaltest(resid)
        ym = y.mean()
        SStot = np.sum((y - ym) ** 2)
        SSerr = np.sum((y - self(x)) ** 2)
        Rsquared = 1.0 - SSerr / SStot
# Besides being buggy, this test for homoscedasticity is supposed to work only
# for linear regressions, hence is not suited for our case, but I'll keep it
# here until I figure out an alternative. Remember to uncomment the import for
# OLS on top.
#        regresults = OLS(resid ** 2, np.c_[x, x**2]).fit()
#        LM =regresults.rsquared 
#        LM_pvalue = chisqprob(LM, len(x) - ddof)
#        white = (LM, LM_pvalue)
#        return gof, resid, white 
        return gof, resid, Rsquared
Example 23
def pairedt(pairs, numSamples):
    results = dict()
    t,v = pairs.items()
    diffs = [t[1][x] - v[1][x] for x in range(len(t[1]))]
    plotDiffs(diffs)
    sampleSize = int(len(diffs)/numSamples)
    indices = range(len(diffs))
    random.shuffle(indices)
    mean_diffs = []
    i = 0
    for sample in range(numSamples):
        total_diff = 0
        for x in range(sampleSize):
            index = indices[i]
            total_diff += diffs[index]
            i+=1
        sample_avg = total_diff/float(sampleSize)
        mean_diffs.append(sample_avg)

    #normality check
    nt = stats.normaltest(mean_diffs)
    results['normal_p'] =  format(round(nt[1],4))

    #ttest
    t_prob = stats.ttest_1samp(mean_diffs, 0)
    results['ttest_t'] =  format(round(t_prob[0],4))
    results['ttest_p'] =  format(round(t_prob[1],4))

    #other stats
    results['avg_diff'] =  format(round(np.mean(diffs),4))
    results['numSamples'] = numSamples
    results['sampleSize'] = sampleSize
    results['num_pairs'] = len(pairs['tor'])

    return results
Example 24
def pearson_or_shapiro(data):
    """pearson_or_shapiro

    Use D'Agostino/Pearson if possible (n >= 20), else Shapiro
    :param data:
    """
    return stats.normaltest(data) if len(data) >= 20 else stats.shapiro(data)
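A quick sketch of the two branches this helper dispatches to (synthetic data; both calls return a (statistic, p-value) pair):

import numpy as np
from scipy import stats

rng = np.random.default_rng(4)
small = rng.normal(size=10)   # n < 20  -> Shapiro-Wilk
large = rng.normal(size=200)  # n >= 20 -> D'Agostino-Pearson

print(stats.shapiro(small))
print(stats.normaltest(large))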
Example 25
def normality_check(feature_group,output_path):

	if feature_group.isEmpty():
		return False

	
	normal_flag = True
	sk_test = stats.skewtest(feature_group.get_scores())
	kr_test = stats.kurtosistest(feature_group.get_scores()) 
	normaltest = stats.normaltest(feature_group.get_scores())

	temp = '''

			Normality Test P-Values
		------------------------------------
		 Kurtosis   |  {0}
		 Skewness   |  {1}
		 NormalTest |  {2}


	'''

	result = temp.format(kr_test[1],sk_test[1],normaltest[1])

	print result


	tests = (sk_test[1] > 0.05 ,kr_test[1] > 0.05 ,normaltest[1] > 0.05)

	return tests
def testNormalByWord( word, version ):

	# Let's find the data - first the word
	for i in range( len(data) ):
		if data[i][0][1][0][1] == word:
			thisWord = i
		elif data[i][1][1][0][1] == word:
			thisWord = i

	# Now the version

	# Get the distribution
	if data[thisWord][0][0][1] == version:
		numbers = data[thisWord][0][1][1]
	elif data[thisWord][1][0][1] == version:
		numbers = data[thisWord][1][1][1]

	# Use scipy to check normality
	( chi, p ) = stats.normaltest( numbers )

	print "Chi-squared: " + str( chi )
	print "P-value: " + str( p )

	if p < 0.05:
		print "Not normal with alpha 0.05"
	else:
		print "Normal with alpha = 0.05"
Example 27
def omni_normtest(resids, axis=0):
    """
    Omnibus test for normality

    Parameters
    -----------
    resid : array-like
    axis : int, optional
        Default is 0

    Returns
    -------
    Chi^2 score, two-tail probability
    """
    #TODO: change to exception in summary branch and catch in summary()
    #behavior changed between scipy 0.9 and 0.10
    resids = np.asarray(resids)
    n = resids.shape[axis]
    if n < 8:
        return np.nan, np.nan
        return_shape = list(resids.shape)
        del return_shape[axis]
        return np.nan * np.zeros(return_shape), np.nan * np.zeros(return_shape)
        raise ValueError(
            "skewtest is not valid with less than 8 observations; %i samples"
            " were given." % int(n))

    return stats.normaltest(resids, axis=axis)
def fillMissing1(df, dataType):
    '''
    Args:
        df ( 2d array/ Dict):
                             eg : ('attribute1': [12, 24, 25] , 'attribute2': ['good', 'bad'])
        dataTypes (dict): Dictionary of attribute names of df as keys and values 0/1 
                            indicating categorical/continuous variable eg:  ('attribute1':1, 'attribute2': 0)
                            
    Returns:
        writes a file with missing values replaced.
    
    
    '''
    dataLabels = list(df.columns.values)
    for eachlabel in dataLabels:
        if dataType[eachlabel] == 1:
            
            # check if data is normal
            _,pval = stats.normaltest(df[eachlabel])
            if(pval < 0.5):
                # if the data is not normal use median of the group to replace the missing
                df[eachlabel]= df.groupby('class')[eachlabel].transform(lambda x : x.fillna(x.median()))
            else:
                # if the data is normal use the mean of the group to replace the missing
                df[eachlabel]= df.groupby('class')[eachlabel].transform(lambda x : x.fillna(x.mean()))
        else:
            #for categorical data use mode ( the most frequent value ) to replace the missing
            df[eachlabel]= df.groupby('class')[eachlabel].transform(lambda x : x.fillna(x.mode()[0]))
            
    df.to_csv(Globals.MISSING_REPLACED_FILE)
    return df, Globals.MISSING_REPLACED_FILE
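A minimal, self-contained sketch of the same fill strategy on a tiny made-up frame (hypothetical 'class' and 'val' columns; the 0.5 cutoff is kept from the snippet above):

import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(5)
df = pd.DataFrame({'class': ['a'] * 12 + ['b'] * 12,
                   'val': rng.normal(10, 2, size=24)})
df.loc[[3, 17], 'val'] = np.nan  # inject a couple of missing values

_, pval = stats.normaltest(df['val'].dropna())
use_median = pval < 0.5  # not normal enough -> fill with the group median
df['val'] = df.groupby('class')['val'].transform(
    lambda x: x.fillna(x.median() if use_median else x.mean()))
print(df)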
 def normal_test(features, **_):
     """
     
     :param features: 
     :param _: 
     :return: 
     """
     return stats.normaltest(features)
Example 30
 def test_normaltest(self):
     for n in self.get_n():
         if n > 8:
             x,y,xm,ym = self.generate_xy_sample(n)
             r = stats.normaltest(x)
             rm = stats.mstats.normaltest(xm)
             assert_almost_equal(r[0],rm[0],10)
             assert_almost_equal(r[1],rm[1],10)
Example 31
def perform_tests(weekends, weekdays):
    weekends_normaltest = stats.normaltest(weekends['comment_count'])
    weekdays_normaltest = stats.normaltest(weekdays['comment_count'])
    levene = stats.levene(weekends['comment_count'], weekdays['comment_count'])
    ttest = stats.ttest_ind(weekends['comment_count'], weekdays['comment_count'])
    return weekends_normaltest, weekdays_normaltest, levene, ttest
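A self-contained version of the same test sequence on synthetic frames (the 'comment_count' column name is kept from the snippet; the values are made up):

import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(6)
weekends = pd.DataFrame({'comment_count': rng.poisson(20, size=60)})
weekdays = pd.DataFrame({'comment_count': rng.poisson(25, size=150)})

print(stats.normaltest(weekends['comment_count']))
print(stats.normaltest(weekdays['comment_count']))
print(stats.levene(weekends['comment_count'], weekdays['comment_count']))
print(stats.ttest_ind(weekends['comment_count'], weekdays['comment_count']))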
Example 32
 def omni(self):
     """
     Omnibus test for normality
     """
     return stats.normaltest(self.e)
Example 33
sol.components_  # to get the answer, so the value of each weight in the components
sol.explained_variance_  # the explained variance
sol.explained_variance_ratio_  # the explained variance in %

# A bit of classical statistics:
from scipy import stats

X1 = np.random.random(
    (20))  # let's create 2 variables with 20 observations each
X2 = np.random.random((20))

X1_stand = stats.zscore(X1)  # Another way to standardize
X2_stand = stats.zscore(X2)

stats.sem(X1)  # standard error of the mean
stats.normaltest(X2)  # test for normality

stats.chisquare(
    [12, 14, 16, 18, 10, 10]
)  # chisquare (each entry represents the category and how many times they appear)
stats.rankdata(X1)  # rank the data, useful for non parametric tests

stats.ttest_ind(X1, X2)  # independent t test
stats.ttest_rel(X1, X2)  # dependent t test

stats.mannwhitneyu(X1, X2)  # Mann Whitney U test (non parametric)
stats.wilcoxon(X1, X2)  # Wilcoxon test (non parametric)

stats.spearmanr(X1, X2)  # spearman correlation

stats.linregress(
Example 34
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 21 20:36:32 2014

@author: JN
"""

import pandas as pd
import statsmodels.api as sm
import pylab as pl
import scipy.stats as stats

Raw_data = pd.read_csv('C:/Users/JN/Desktop/AnovaData.csv')

print Raw_data.describe()

Raw_data.hist()

pl.show()

print Raw_data.BPM

print stats.normaltest(Raw_data.BPM)

New_Column = ['RSP_Cycle', 'BPM']

New_Raw_Data = Raw_data[New_Column]

print New_Raw_Data

print stats.mstats.kruskalwallis(New_Raw_Data)
Example 35
import scipy.stats as stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import powerlaw

data = pd.read_csv("data.csv", encoding="gbk")

col1 = data[u'2013年GDP(亿元)']
col2 = data[u'较2012年实际增长率'].dropna().map(lambda x: float(x[:-1]))
sc = (col2 - np.mean(col2)) / np.std(col2)

# powerlaw test

fit = powerlaw.Fit(col1)
R, p = fit.distribution_compare('power_law', 'lognormal')
print 'R', R, 'p', p
print "power_law fit is wrong than to lognormal!"

fig4 = fit.plot_ccdf(linewidth=2)
fit.power_law.plot_ccdf(ax=fig4, color='r', linestyle='--')
fit.lognormal.plot_ccdf(ax=fig4, color='g', linestyle='--')
plt.show()

# norm test

des = stats.describe(col2)

omnibus, p_n = stats.normaltest(col2)

print 'p', p_n, 'it is not a normal distribution however'

plt.hist(col2)
plt.show()
Example 36
def isInt(s):
    try:
        int(s)
        return True
    except ValueError:
        return False


if __name__ == "__main__":

    argFiles = []
    nameFiles = []
    values = []
    result = 0
    results = []
    nameFiles = os.listdir(sys.argv[1])
    nameFiles.remove("source.py")
    with open(os.path.join(sys.argv[1], nameFiles[0]), 'r') as f:
        arg = f.readline()

    values = arg.split(' ')
    if (len(values) >= 20):
        for i in range(len(values)):
            results.append(float(values[i]))

        statist, hi_2 = stats.normaltest(results)
        with open(os.path.join(sys.argv[1], str(1) + "output.txt"), 'w') as f:
            f.write(str(hi_2))
    else:
        exit(-1)
Example 37
def dAgostinaTest(data):
    print(len(data))
    stat, p = normaltest(data)
    print(p)
                              'bias_r': right_fit['bias'],
                              'lapselow_l': left_fit['lapselow'],
                              'lapselow_r': right_fit['lapselow'],
                              'lapsehigh_l': left_fit['lapsehigh'],
                              'lapsehigh_r': right_fit['lapsehigh'],
                              'nickname': nickname, 'lab': lab})
    biased_fits = biased_fits.append(fits, sort=False)

# %% Statistics
    
stats_tests = pd.DataFrame(columns=['variable', 'test_type', 'p_value'])
posthoc_tests = {}

for i, var in enumerate(['threshold_l', 'threshold_r', 'lapselow_l', 'lapselow_r', 'lapsehigh_l',
                         'lapsehigh_r', 'bias_l', 'bias_r']):
    _, normal = stats.normaltest(biased_fits[var])

    if normal < 0.05:
        test_type = 'kruskal'
        test = stats.kruskal(*[group[var].values
                               for name, group in biased_fits.groupby('lab')])
        if test[1] < 0.05:  # Proceed to posthocs
            posthoc = sp.posthoc_dunn(biased_fits, val_col=var, group_col='lab')
        else:
            posthoc = np.nan
    else:
        test_type = 'anova'
        test = stats.f_oneway(*[group[var].values
                                for name, group in biased_fits.groupby('lab')])
        if test[1] < 0.05:
            posthoc = sp.posthoc_tukey(biased_fits, val_col=var, group_col='lab')
Example 39
def make2ds(args):
    ifiles = args.ifiles
    if len(args.figure_keywords) > 0:
        plt.setp(fig, **args.figure_keywords)
    if len(args.axes_keywords) > 0:
        plt.setp(ax, **args.axes_keywords)
    nborders = len(ax.collections)
    for fi, ifile in enumerate(ifiles):
        variables = args.variables
        if variables is None:
            variables = [
                key for key, var in ifile.variables.items() if var.ndim == 2
            ]
        if len(variables) == 0:
            raise ValueError(
                'Unable to heuristically determine plottable variables; use -v to specify variables for plotting'
            )
        for varkey in variables:
            var = ifile.variables[varkey]
            vals = var[:]
            if args.squeeze:
                vals = vals.squeeze()

            if args.normalize is None:
                from scipy.stats import normaltest
                vmin, vmax = vals.min(), vals.max()
                if normaltest(vals.ravel())[1] < 0.05:
                    cvals = np.ma.compressed(vals)
                    boundaries = np.percentile(cvals, np.arange(0, 110, 10))
                    warn(
                        'Autoselect deciles colormap of %s; override with --norm'
                        % varkey)
                else:
                    boundaries = np.linspace(vmin, vmax, num=11)
                    warn(
                        'Autoselect linear colormap of %s; override with --norm'
                        % varkey)
                if (boundaries.max() /
                        np.ma.masked_values(boundaries, 0).min()) > 10000:
                    formatter = LogFormatter(labelOnlyBase=False)
                else:
                    formatter = None
                norm = BoundaryNorm(boundaries, ncolors=256)
            else:
                norm = eval(args.normalize)
                formatter = None
            if not args.colorbarformatter is None:
                try:
                    formatter = eval(args.colorbarformatter)
                except:
                    formatter = args.colorbarformatter

            vmin, vmax = vals.min(), vals.max()
            if not norm.vmin is None:
                vmin = norm.vmin
            if not norm.vmax is None:
                vmax = norm.vmax

            varunit = getattr(var, 'units', 'unknown').strip()
            vardims = [
                dk for dk, dv in zip(var.dimensions, var.shape) if dv != 1
            ]
            print(varkey, sep='')
            del ax.collections[nborders:]
            if args.swapaxes:
                patches = ax.pcolor(vals.T, norm=norm)
                ax.set_xlabel(vardims[0])
                ax.set_ylabel(vardims[1])
            else:
                patches = ax.pcolor(vals, norm=norm)
                ax.set_xlabel(vardims[1])
                ax.set_ylabel(vardims[0])

            height = vals.shape[0]
            width = vals.shape[1]
            if width >= height:
                orientation = 'horizontal'
            else:
                orientation = 'vertical'
            try:
                cax = cbar.ax
                cax.cla()
            except:
                cax = None
            if vals.max() > vmax and vals.min() < vmin:
                extend = 'both'
            elif vals.max() > vmax:
                extend = 'max'
            elif vals.min() < vmin:
                extend = 'min'
            else:
                extend = 'neither'
            cbar = fig.colorbar(patches,
                                orientation=orientation,
                                cax=cax,
                                extend=extend,
                                format=formatter)
            del cbar.ax.texts[:]
            cbar.set_label(varkey + ' (' + varunit + '; min=%.3g; max=%.3g)' %
                           (var[:].min(), var[:].max()))
            #           if orientation == 'vertical':
            #               cbar.ax.text(.5, 1.05, '%.3g' % var[:].max(), horizontalalignment = 'center', verticalalignment = 'bottom')
            #                cbar.ax.text(.5, -.06, '%.3g ' % var[:].min(), horizontalalignment = 'center', verticalalignment = 'top')
            #            else:
            #                cbar.ax.text(1.05, .5, ' %.3g' % var[:].max(), verticalalignment = 'center', horizontalalignment = 'left')
            #                cbar.ax.text(-.06, .5, '%.3g ' % var[:].min(), verticalalignment = 'center', horizontalalignment = 'right')
            #cbar.update_ticks()
            fmt = 'png'
            outpath = args.outpath
            if len(ifiles) > 1:
                lstr = str(fi).rjust(len(str(len(ifiles))), '0')
            else:
                lstr = ''

            figpath = os.path.join(outpath + varkey + lstr + '.' + fmt)
            if args.interactive:
                csl = PNCConsole(locals=globals())
                csl.interact()

            fig.savefig(figpath)
            if args.verbose > 0: print('Saved fig', figpath)
Example 40
import pandas as pd
from statsmodels.formula.api import ols 
from statsmodels.stats.anova import anova_lm
import scipy.stats as ss

############
# one way
############

# prepare
df = pd.read_csv('oneway.csv')
a = df[df['algo'] == 'a']['ratio']
b = df[df['algo'] == 'b']['ratio']

# 1/4: normality
ss.normaltest(a); ss.normaltest(b)

# 2/4: homogeneity of variance
args=[a,b]
ss.levene(*args)

# F test
ss.f_oneway(*args)

# F test too
model = ols('ratio ~ algo', df).fit()
anovat = anova_lm(model)
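A synthetic-data version of the same one-way pipeline for readers without oneway.csv (the 'algo' and 'ratio' names are kept from the snippet):

import numpy as np
import pandas as pd
import scipy.stats as ss
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

rng = np.random.default_rng(7)
df_demo = pd.DataFrame({'algo': ['a'] * 30 + ['b'] * 30,
                        'ratio': np.concatenate([rng.normal(1.0, 0.1, 30),
                                                 rng.normal(1.1, 0.1, 30)])})
a = df_demo[df_demo['algo'] == 'a']['ratio']
b = df_demo[df_demo['algo'] == 'b']['ratio']

print(ss.normaltest(a), ss.normaltest(b))            # normality
print(ss.levene(a, b))                               # homogeneity of variance
print(ss.f_oneway(a, b))                             # F test
print(anova_lm(ols('ratio ~ algo', df_demo).fit()))  # same F test via OLS/ANOVA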
def calc_ttest_dict(a, b, paired=False):
    '''
    Calculate the comparison between the two sets of data
    
    Importantly, although the stars will be the same, this code
    accurately applies either a Student's t, Welch's t, or Mann Whitney U
    test
    '''
    # Import what you need
    import numpy as np
    from scipy.stats import ttest_ind, ttest_rel, bartlett, mannwhitneyu, normaltest, wilcoxon
    
    stats_dict = {}
    
    # Mask out the not a numbers
    a = [ x for x in a if not np.isnan(x) ]
    b = [ x for x in b if not np.isnan(x) ]

    # Save number of people in each group
    stats_dict['n'] = (len(a), len(b))
    
    # Conduct test for equal variance
    stats_dict['eqvar'] = bartlett(a, b)
    
    # Conduct test for normality
    stats_dict['normal'] = normaltest(np.hstack([a, b]))
    
    # When you test for equal means (ttest) you have different options
    # depending on if you have equal variances or not. You can also
    # run the non-parametric Mann Whitney U test
    # Alternatively these data may be paired so there's also the
    # paired t-test and the Wilcoxon signed rank test
    
    # All five will be entered in the stats_dict
    
    # Conduct Welch's t-test (unequal variances)
    stats_dict['ttest_uneqvar'] = ttest_ind(a, b, equal_var = False)

    # Conduct standard student's t-test (equal variances)
    stats_dict['ttest_eqvar'] = ttest_ind(a, b, equal_var = True)

    # Conduct mann whitney U test (non-parametric test of medians)
    stats_dict['mannwhitneyu'] = mannwhitneyu(a, b)
    
    if paired:
        # Conduct the paired student's t-test
        stats_dict['ttest_paired'] = ttest_rel(a, b)
    
        # Conduct Wilcoxon signed rank test (non-parametric *paired* test of medians)
        stats_dict['wilcoxon'] = wilcoxon(a, b)

    # Save in the stats dict the various other measures you might
    # want to report
    stats_dict['medians'] = [np.percentile(a, 50), np.percentile(b, 50)]
    stats_dict['percentile25'] = [np.percentile(a, 25), np.percentile(b, 25)]
    stats_dict['percentile75'] = [np.percentile(a, 75), np.percentile(b, 75)]
    stats_dict['means'] = [np.mean(a), np.mean(b)]
    stats_dict['stds'] = [np.std(a), np.std(b)]
    stats_dict['dfs'] = [(np.float(stats_dict['n'][0])-1), (np.float(stats_dict['n'][1])-1)]
    stats_dict['pooled_std'] = np.sqrt( (np.float(stats_dict['dfs'][0])*(np.float(stats_dict['stds'][0])**2)
                                     + np.float(stats_dict['dfs'][1])*(np.float(stats_dict['stds'][1])**2))
                                     / (np.float(stats_dict['dfs'][0]) + np.float(stats_dict['dfs'][1])))
    
    if paired:
        stats_dict['mean_difference'] = np.mean(np.array(b)-np.array(a))
        stats_dict['std_difference'] = np.std(np.array(b)-np.array(a))
        stats_dict['median_difference'] = np.percentile(np.array(b)-np.array(a), 50) 
        stats_dict['percentile25_difference'] = np.percentile(np.array(b)-np.array(a), 25) 
        stats_dict['percentile75_difference'] = np.percentile(np.array(b)-np.array(a), 75)
        stats_dict['cohens_d'] = np.float(stats_dict['mean_difference']) / np.float(stats_dict['pooled_std'])
        stats_dict['cohens_d_paired'] = np.float(stats_dict['mean_difference']) / np.float(stats_dict['std_difference'])

    return stats_dict
Example 42
    print('Probably the same distribution')
else:
    print('Probably different distributions')

stat8, p8 = ttest_ind(Parents_2019["soc_omg_1"], Parents_2019["soc_omg_2"])
print('stat=%.3f, p=%.3f' % (stat8, p8))
if p8 > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')

#Test all the assumptions

#test whether normal distributions

stat, p = normaltest(Moved_out_2020["attitu_2"])
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')

stat3, p3 = normaltest(Parents_2020["attitu_2"])
print('stat=%.3f, p=%.3f' % (stat3, p3))
if p3 > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')

stat9, p9 = normaltest(Moved_out_2019["attitu_2"])
print('stat=%.3f, p=%.3f' % (stat9, p9))
Example 43
from termcolor import colored, cprint
import matplotlib.pyplot as plt

import numpy as np
from scipy import stats

# Generating a normal distribution sample with 20 elements
sample = np.random.randn(20)
print(colored(('sample', sample), 'green'))
# normaltest tests the null hypothesis.
out = stats.normaltest(sample)
print('normaltest output')
print('Z-score = ' + str(out[0]))
print('P-value = ' + str(out[1]))
# kstest is the Kolmogorov-Smirnov test for goodness of fit.
# Here its sample is being tested against the normal distribution.
# D is the KS statistic and the closer it is to 0 the better.
out = stats.kstest(sample, 'norm')
print('\nkstest output for the Normal distribution')
print('D = ' + str(out[0]))
print('P-value = ' + str(out[1]))
# Similarly, this can be easily tested against other distributions,
# like the Wald distribution.
out = stats.kstest(sample, 'wald')
print('\nkstest output for the Wald distribution')
print('D = ' + str(out[0]))
print('P-value = ' + str(out[1]))
Example 44
# resample data to create time bars and compare normality tests with tick data
def get_bar_stats(agg_trades):
    vwap = agg_trades.apply(
        lambda x: np.average(x.price, weights=x.shares)).to_frame('vwap')
    ohlc = agg_trades.price.ohlc()
    vol = agg_trades.shares.sum().to_frame('vol')
    txn = agg_trades.shares.size().to_frame('txn')
    return pd.concat([ohlc, vwap, vol, txn], axis=1)


resampled = trades.resample('1Min')
time_bars = get_bar_stats(resampled)

# normality test for tick rets
normaltest(tick_bars.price.pct_change().dropna())
# compare to min rets
normaltest(time_bars.vwap.pct_change().dropna())
price_volume(time_bars)

# time bars don't always account for fragmentation of orders. Volume bars offer an alternative perspective
with pd.HDFStore(order_book_store) as store:
    trades = store['{}/trades'.format(stock)]

trades.price = trades.price.mul(1e-4)
trades = trades[trades.cross == 0]
trades = trades.between_time(market_open, market_close).drop('cross', axis=1)
trades.info()
trades_per_min = trades.shares.sum() / (60 * 7.5)  # min per trading day
trades['cumul_vol'] = trades.shares.cumsum()
Example 45
def q4():
    # Return the result of question 4 here.
    log_weight = np.log(amostra_weight)
    statistic, p_value = sct.normaltest(log_weight)
    return bool(p_value > ALPHA)
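A self-contained sketch of why q4 log-transforms first: a lognormal sample fails the D'Agostino-Pearson test, while its log usually passes (synthetic data; ALPHA assumed to be 0.05):

import numpy as np
import scipy.stats as sct

ALPHA = 0.05
rng = np.random.default_rng(8)
sample = rng.lognormal(mean=4.0, sigma=0.3, size=3000)

print(bool(sct.normaltest(sample)[1] > ALPHA))          # typically False
print(bool(sct.normaltest(np.log(sample))[1] > ALPHA))  # typically True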
Example 46
import numpy as np
from scipy import stats
pts = 1000
np.random.seed(28041990)
a = np.random.normal(0, 1, size=pts)
b = np.random.normal(2, 1, size=pts)
x = np.concatenate((a, b))
k2, p = stats.normaltest(x)
alpha = 0.05
print("p = {:g}".format(p))

if p < alpha:  # null hypothesis: x comes from a normal distribution
    print("The null hypothesis can be rejected")
else:
    print("The null hypothesis cannot be rejected")
Example 47
def q3():
    # Return the result of question 3 here.
    statistic, p_value = sct.normaltest(amostra_weight)
    return bool(p_value > ALPHA)
Example 48
def get_all_subsets(X, y, mallows=True):
    combs = []
    results = []
    for i in range(1, len(X) + 1):
        els = [list(x) for x in itertools.combinations(X, i)]
        combs.extend(els)
    for comb in combs:
        model = sm.OLS(y, sm.add_constant(X[list(comb)]))
        result = model.fit()
        results.append({
            "model": model,
            "result": result,
            "num_vars": len(comb),
            "vars": X[list(comb)]
        })

    full_mse_res = sm.OLS(y, sm.add_constant(X)).fit().mse_resid
    acceptable_models = {}

    for model in results:
        not_acceptable = False
        for pvalue in model["result"].pvalues:
            if pvalue > 0.05:
                not_acceptable = True
                break
        if not_acceptable:
            continue

        mallows_objective = model["num_vars"]
        curr_mallows = mallow_cp(model, full_mse_res, X.shape[0])
        curr_min = None
        if model["num_vars"] in acceptable_models and len(
                acceptable_models[model["num_vars"]]) > 9:
            curr_min = acceptable_models[model["num_vars"]][-1]["mallows"]

        model["mallows"] = curr_mallows
        model["mallows_diff"] = abs(curr_mallows - mallows_objective)
        if not curr_min is None:
            if model["mallows_diff"] < abs(curr_min - mallows_objective):
                del acceptable_models[model["num_vars"]][-1]
                acceptable_models[model["num_vars"]].append(model)
            else:
                continue
        else:
            if not model["num_vars"] in acceptable_models:
                acceptable_models[model["num_vars"]] = []
            acceptable_models[model["num_vars"]].append(model)

        acceptable_models[model["num_vars"]] = \
            sorted(acceptable_models[model["num_vars"]], key=lambda k: k['mallows_diff'])

    curr_best = None
    for num_vars in acceptable_models:
        for model in acceptable_models[num_vars]:

            if curr_best is None:
                curr_best = model
            else:
                if curr_best["mallows_diff"] > model["mallows_diff"]:
                    curr_best = model

    print(curr_best["result"].summary())
    std = curr_best["model"].exog.std(0)
    std[0] = 1
    tt = curr_best["result"].t_test(np.diag(std))
    print(tt.summary())
    tt.summary_frame()

    fig = plt.figure(figsize=(12, 30))
    sm.graphics.plot_partregress_grid(curr_best["result"])
    plt.savefig("resid_ny.png")
    #plt.show()
    if False:
        fig, ax = plt.subplots(2,
                               2,
                               sharex='col',
                               sharey='row',
                               figsize=(12, 10))
        params = list(dict(curr_best["result"].params).keys())

        n1 = math.floor(len(params) / 2)
        n2 = math.floor(len(params) % 2)

        for i in range(2):
            for j in range(2):
                try:
                    ax[i,
                       j].scatter(curr_best["result"].model.exog[:, i * 2 + j],
                                  curr_best["result"].resid)
                    ax[i, j].set_xlabel(params[i * 2 + j])
                    ax[i, j].set_ylabel("resid")
                    ax[i, j].axhline(y=0, color="black")
                except Exception:
                    break
        plt.savefig("resid_sf.png")
        plt.show()
        #fig = plt.figure(figsize=(12, 10))
        #fig = sm.graphics.plot_regress_exog(curr_best["result"], "per_white", fig=fig)
        fig = sm.graphics.plot_partregress_grid(curr_best["result"], fig=fig)
        fig.gca().set_title("")
        plt.suptitle("")
        plt.savefig("resid_ny.png")
        #plt.show()

    stat, p = shapiro(curr_best["result"].resid)
    print("Shapiro")
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    stat, p = normaltest(curr_best["result"].resid)
    print("D’Agostino’s")
    print('Statistics=%.3f, p=%.3f' % (stat, p))

    stat, p = kstest(curr_best["result"].resid, 'norm')
    print("Kolmogorov-Smirnov")
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    #plot_mallows(acceptable_models)
    return curr_best["result"].rsquared_adj

    for var in curr_best["vars"]:
        coef = curr_best["result"].params[var]
        pos_neg = "pos"
        if coef < 0:
            pos_neg = "neg"
        try:
            dct[var + "_" + pos_neg] += 1
        except Exception:
            dct[var + "_" + pos_neg] = 1
Example 49
    #normalcy test
    elif op == "jarqBera":
        jb, jbpv, skew, kurtosis = jarque_bera(data)
        printStat(jb, jbpv, "probably gaussian", "probably not gaussian")
        print(f'skew: {skew}')
        print(f'kurtosis: {kurtosis}')

    #shapiro wilks normalcy test
    elif op == "shapWilk":
        stat, pvalue = shapiro(data)
        printStat(stat, pvalue, "probably gaussian", "probably not gaussian")

    #D’Agostino’s K square  normalcy test
    elif op == "dagast":
        stat, pvalue = normaltest(data)
        printStat(stat, pvalue, "probably gaussian", "probably not gaussian")

    #anderson darling normalcy test
    elif op == "andar":
        result = anderson(data)
        print("stat {:.3f}".format(result.statistic))
        for i in range(len(result.critical_values)):
            sl, cv = result.significance_level[i], result.critical_values[i]
            if int(sl) == 5:
                if result.statistic < cv:
                    print("probably gaussian at the {:.1f} level".format(sl))
                else:
                    print(
                        "probably not gaussian at the {:.1f} level".format(sl))
    #histogram
Example 50
#
# * Does this result make sense?

# This result is qualitatively the same as the one given by the Shapiro-Wilk test, differing only quantitatively in the p-value, so it makes sense.

# ## Question 3
#
# Now consider a sample of size 3000 from the `weight` column obtained with the `get_sample()` function. Run the D'Agostino-Pearson normality test using `scipy.stats.normaltest()`. Can we state that the weights come from a normal distribution at the 5% significance level? Answer with a boolean (`True` or `False`).

# In[14]:

sub_weight = get_sample(df, 'weight', n=3000)

# In[15]:

ap_t, ap_pvalue = sct.normaltest(sub_weight)
ap_pvalue

# In[16]:


def q3():
    return (ap_pvalue > 0.05)
    # Return the result of question 3 here.
    pass


q3()

# In[17]:
# %% Statistics

stats_tests = pd.DataFrame(columns=['variable', 'test_type', 'p_value'])
posthoc_tests = {}

for i, var in enumerate([
        'perf_easy', 'threshold_l', 'threshold_r', 'threshold_n', 'bias_l',
        'bias_r', 'bias_n'
]):

    # Remove any animals with NaNs
    test_fits = biased_fits[biased_fits[var].notnull()]

    # Test for normality
    _, normal = stats.normaltest(test_fits[var])

    if normal < 0.05:
        test_type = 'kruskal'
        test = stats.kruskal(
            *[group[var].values for name, group in test_fits.groupby('lab')])
        if test[1] < 0.05:  # Proceed to posthocs
            posthoc = sp.posthoc_dunn(test_fits, val_col=var, group_col='lab')
        else:
            posthoc = np.nan
    else:
        test_type = 'anova'
        test = stats.f_oneway(
            *[group[var].values for name, group in test_fits.groupby('lab')])
        if test[1] < 0.05:
            posthoc = sp.posthoc_tukey(test_fits, val_col=var, group_col='lab')
Example 52
    def get_corr_matrix_data(self,
                             options,
                             included_vars=None,
                             extra_vars=None):
        if included_vars is None:
            included_vars = list(self.data)
        if extra_vars is not None:
            included_vars = included_vars + extra_vars
        else:
            extra_vars = []

        categories = [
            c for c in list(self.data)
            if 'date' not in c.lower() and c in included_vars
        ]
        categories.extend(extra_vars)
        categories = list(set(categories))
        categories.sort()
        var_count = len(categories)
        categories_for_label = [
            category.replace("Control Point", "CP") for category in categories
        ]
        categories_for_label = [
            category.replace("control point", "CP")
            for category in categories_for_label
        ]
        categories_for_label = [
            category.replace("Distance", "Dist")
            for category in categories_for_label
        ]

        for i, category in enumerate(categories_for_label):
            if category.startswith('DVH'):
                categories_for_label[i] = category.split("DVH Endpoint: ")[1]

        x_factors = categories_for_label
        y_factors = categories_for_label[::-1]

        s_keys = [
            'x', 'y', 'x_name', 'y_name', 'color', 'alpha', 'r', 'p', 'size',
            'x_normality', 'y_normality', 'group'
        ]
        source_data = {
            'corr': {sk: []
                     for sk in s_keys},
            'line': {
                'x': [0.5, var_count - 0.5],
                'y': [var_count - 0.5, 0.5]
            }
        }

        min_size, max_size = 3, 20
        removed_mrns = set()
        for x in range(var_count):
            for y in range(var_count):
                if (x > y and self.group == 1) or (x < y and self.group == 2):
                    if categories[x] not in extra_vars and categories[
                            y] not in extra_vars:

                        bad_indices = [
                            i for i, v in enumerate(self.data[categories[x]]
                                                    ['values'])
                            if type(v) in [str, type(None)]
                        ]
                        bad_indices.extend([
                            i for i, v in enumerate(self.data[categories[y]]
                                                    ['values'])
                            if type(v) in [str, type(None)]
                        ])
                        bad_indices = list(set(bad_indices))
                        removed_mrns = removed_mrns.union(
                            set(self.mrns[i] for i in bad_indices))

                        x_data = [
                            v for i, v in enumerate(self.data[categories[x]]
                                                    ['values'])
                            if i not in bad_indices
                        ]
                        y_data = [
                            v for i, v in enumerate(self.data[categories[y]]
                                                    ['values'])
                            if i not in bad_indices
                        ]

                        if x_data and len(x_data) == len(y_data):
                            r, p_value = scipy_stats.pearsonr(x_data, y_data)
                        else:
                            r, p_value = 0, 0
                        if np.isnan(r):
                            r = 0

                        sign = ['neg', 'pos'][r >= 0]
                        color = getattr(
                            options, 'CORRELATION_%s_COLOR_%s' %
                            (sign.upper(), self.group))
                        source_data['corr']['color'].append(color)
                        source_data['corr']['r'].append(r)
                        source_data['corr']['p'].append(p_value)
                        source_data['corr']['alpha'].append(abs(r))
                        source_data['corr']['size'].append((
                            (max_size - min_size) * abs(r)) + min_size)
                        source_data['corr']['x'].append(
                            x + 0.5)  # 0.5 offset due to bokeh 0.12.9 bug
                        source_data['corr']['y'].append(
                            var_count - y -
                            0.5)  # 0.5 offset due to bokeh 0.12.9 bug
                        source_data['corr']['x_name'].append(
                            categories_for_label[x])
                        source_data['corr']['y_name'].append(
                            categories_for_label[y])
                        source_data['corr']['group'].append(self.group)

                        try:
                            x_norm, x_p = scipy_stats.normaltest(x_data)
                        except ValueError:
                            x_p = 'N/A'
                        try:
                            y_norm, y_p = scipy_stats.normaltest(y_data)
                        except ValueError:
                            y_p = 'N/A'

                        source_data['corr']['x_normality'].append(x_p)
                        source_data['corr']['y_normality'].append(y_p)

        return {
            'source_data': source_data,
            'x_factors': x_factors,
            'y_factors': y_factors
        }, removed_mrns
Exemplo n.º 53
0
def analyze(initDate, finalDate, data_type="daily"):

    exchange = 'CCCAGG'
    completeOnly = True
    exWeekends = False

    # aggregated hourly price for Bitcoin (2000 row limit - use a loop)
    symbol = 'BTCUSD'
    BTCUSD = gd.getCrypto(symbol,
                          initDate,
                          finalDate,
                          exchange,
                          completeOnly,
                          exWeekends,
                          data_type=data_type)

    symbol = 'LTCBTC'
    LTCBTC = gd.getCrypto(symbol,
                          initDate,
                          finalDate,
                          exchange,
                          completeOnly,
                          exWeekends,
                          data_type=data_type)

    symbol = 'ETHBTC'
    ETHBTC = gd.getCrypto(symbol,
                          initDate,
                          finalDate,
                          exchange,
                          completeOnly,
                          exWeekends,
                          data_type=data_type)

    # store to disk
    BTCUSD.to_csv('./csv/BTCUSD.csv')
    LTCBTC.to_csv('./csv/LTCBTC.csv')
    ETHBTC.to_csv('./csv/ETHBTC.csv')

    # convert to pctdiffs
    dBTC = (BTCUSD.diff() / BTCUSD.shift()).dropna()
    dLTC = (LTCBTC.diff() / LTCBTC.shift()).dropna()
    dETH = (ETHBTC.diff() / ETHBTC.shift()).dropna()

    agg = pd.DataFrame([dBTC.Close, dLTC.Close, dETH.Close]).transpose()
    agg.columns = ['dBTC', 'dLTC', 'dETH']

    # check correlations
    cAgg = np.corrcoef(agg.dropna(), rowvar=False)
    vAgg = np.cov(agg.dropna(), rowvar=False)

    # cut bottom 1% and top 1% of data points - prune outliers
    def middle(series, percentile):
        temp = series.sort_values(inplace=False)
        pctLen = int(round(len(temp) * percentile / 2, 0))
        temp = temp[pctLen:len(temp) - pctLen].sort_index()
        return temp

    # test for stationarity
    percentile = .02
    spreadLTC = (dLTC / dBTC).Close.dropna()
    spreadETH = (dETH / dBTC).Close.dropna()

    # sBTC = adfuller(dBTC.Close)
    # sLTCBTC = adfuller(spreadLTC)
    # sIOTBTC = adfuller(spreadIOT)
    # sETHBTC = adfuller(spreadETH)

    # if stationary and correlated, check for normal distribution
    k2, p = stats.normaltest(spreadLTC)  # p <= .05

    mLTC = middle((dLTC / dBTC).Close.dropna(), percentile)
    mETH = middle((dETH / dBTC).Close.dropna(), percentile)

    sdLTC = np.std(mLTC)
    mnLTC = np.mean(mLTC)
    assdLTC = spreadLTC / sdLTC  # not using middles

    # display histogram
    spreadLTC.hist(range=[-20, 20], bins=100)
    assdLTC.hist(range=[-5, 5], bins=100)

    # sanity check
    prunedPct = len(assdLTC[np.abs(assdLTC) >= 3]) / len(assdLTC) + percentile

    # slice into sd levels and check autocorrelations
    def checkAutocorrelations(series, sdbottom, sdtop, lags):
        glomSeries = pd.DataFrame(series)
        for lag in range(1, lags + 1):
            glomSeries = glomSeries.join(pd.DataFrame(series.shift(lag)),
                                         rsuffix=str(lag),
                                         how='outer')
        subSeries = glomSeries[(np.abs(glomSeries.Close) >= sdbottom)
                               & (np.abs(glomSeries.Close) < sdtop)].dropna()
        corrs = np.corrcoef(subSeries, rowvar=False)

        mainCol = subSeries.Close

        winProps = np.empty(0)
        for col in subSeries.columns:
            winners = subSeries[(((mainCol > 0) & (mainCol > subSeries[col]))
                                 | ((mainCol < 0) &
                                    (mainCol < subSeries[col])))]
            winProp = len(winners) / len(subSeries)
            winProps = np.append(winProps, winProp)

        return corrs[0], winProps

    # check autocorrelation
    priorSD = 0
    for thisSD in np.arange(0.25, 5.25, 0.25):
        cor, win = checkAutocorrelations(spreadLTC, priorSD, thisSD, 9)
        print(thisSD, "C", cor)
        print(thisSD, "W", win)
        priorSD = thisSD

    return
Exemplo n.º 54
0
from scipy import stats
import matplotlib.pyplot as plt

generated = stats.norm.rvs(size=900)

print "Mean", "Std", stats.norm.fit(generated)

print "Skewtest", "pvalue", stats.skewtest(generated)

print "Kurtosistest", "pvalue", stats.kurtosistest(generated)

print "normaltest", "pvalue", stats.normaltest(generated)

print "95 percentile", stats.scoreatpercentile(generated, 95)

print "Percentile at 1", stats.percentileofscore(generated, 1)

plt.hist(generated)
plt.show()
Exemplo n.º 55
0
def q4():
    # Return the answer to question 4 here.
    weight_log = np.log(weight)
    k2, p = sct.normaltest(weight_log)
    return bool(p > alpha)
Exemplo n.º 56
0
"""
显著性检验:方差分析(Analysis of Variance,ANOVA,F 检验)
随机性:样本是随机采样但
独立性:来自不同组但样本是相互独立但
正太分布性:组内样本都来自一个正太分布
方差齐性:不同组但方差相等或相近
"""

# Read the data; d1 corresponds to algorithm a and d2 to algorithm b
df = pd.read_csv("./oneway.csv")
d1 = df[df['algo'] == 'a']['ratio']
d2 = df[df['algo'] == 'b']['ratio']

# Check the normality of the two levels
print('---------------- Normality test for the two levels ----------------')
print(ss.normaltest(d1))
print(ss.normaltest(d2))

# Check the homogeneity of variance of the two levels
print('---------------- Homogeneity of variance for the two levels ----------------')
args = [d1, d2]
print(ss.levene(*args))
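
# If either the normality check or the homogeneity-of-variance check above fails,
# a common non-parametric fallback is the Kruskal-Wallis test; a minimal sketch
# reusing d1 and d2 (not part of the original script):
print('---------------- Kruskal-Wallis fallback (sketch) ----------------')
print(ss.kruskal(d1, d2))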

# First way to run the F test
print('---------------- F test: method 1 ----------------')
print(ss.f_oneway(*args))

# Second way to run the F test (ANOVA table from an OLS model)
print('---------------- F test: method 2 ----------------')
model = ols('ratio ~ algo', df).fit()
anovat = anova_lm(model)
Exemplo n.º 57
0
print(arma_mod30.params)
print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)
# Using this instead also performs model selection (though its AIC differs considerably from the other approach?)
# arma_mod30 = sm.tsa.AR(dta).fit(maxlag=15, ic='aic', disp=False)
# print(arma_mod30.params)
# print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)

# check if our model obeys the theory
resid = arma_mod30.resid  # residual
sm.stats.durbin_watson(resid.values)
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
ax = arma_mod30.resid.plot(ax=ax)

# test if the residual obeys the normal distribution
print(stats.normaltest(resid))
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
fig = qqplot(resid, line='q', ax=ax, fit=True)

# autocorrelation function and PARCOR of residual
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)

r, q, p = sm.tsa.acf(resid.values.squeeze(), qstat=True)
data = np.c_[range(1, 41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))
Exemplo n.º 58
0
    def evaluate_deviation_from_mean(
            self,
            results_reshaped: np.ndarray,
            result_averaged: np.ndarray,
            value_input: Union[float, np.ndarray],
            no_points_averaged: int = 1) -> np.ndarray:
        dev = results_reshaped - np.tile(
            np.expand_dims(result_averaged, axis=1), (1, no_points_averaged))
        if np.ndim(value_input) == 1:
            plt.figure()
            value_input_formatted = np.expand_dims(value_input, axis=1)
            plt.plot(np.tile(value_input_formatted,
                             reps=(1, no_points_averaged)),
                     dev,
                     marker='.')

            # n_param_values = np.size(dev, axis=0) + 1
            # ax1 = plt.subplot(n_param_values, 1, 1)
            for i in range(0, np.size(dev, axis=0)):
                h3 = plt.figure()
                # plt.subplot(n_param_values, 1, i + 1).\
                plt.hist(dev[i, :],
                         stacked=False,
                         label=str(value_input[i]),
                         density=True)
                # plt.xlim([-0.1, 0.1])
                plt.title(str(value_input[i]))
                if self.get_data_saver() is not None:
                    self.get_data_saver().save_figure(
                        h3, ("deviations_over_parameter%d" % i))

        # else:
        # value_input_formatted = value_input

        h1 = plt.figure()
        plt.hist(np.ravel(dev), 100, density=True)
        mean_est = np.mean(np.ravel(dev))
        std_est = np.std(np.ravel(dev))
        x = np.linspace(np.min(np.ravel(dev)), np.max(np.ravel(dev)))
        random_normal = stats.norm(mean_est, std_est).pdf(x)
        plt.plot(x, random_normal, '--r', label='Fitted normal distribution')
        plt.legend()
        if self.get_data_saver() is not None:
            self.get_data_saver().save_figure(h1,
                                              "histogram_deviations_vadere")
        # plt.savefig('histogram_deviations_vadere.png')

        sm.qqplot(np.ravel(dev), line='s')
        h2 = plt.gcf()
        # plt.savefig('qqplot_deviations_vadere.png')
        if self.get_data_saver() is not None:
            self.get_data_saver().save_figure(h2, "qqplot_deviations_vadere")

        plt.close(h2)

        vadere_logger = logging.getLogger(
            "vaderemodel.evaluate_deviation_from_mean")
        vadere_logger.info("Vadere evaluations: Deviations from average")
        vadere_logger.info("Mean: %f, Std: %f" % (mean_est, std_est))

        # normaltest combines skewtest (needs at least 8 samples) and kurtosistest
        # (only reliable from about 20 samples), hence the threshold of 20
        if len(np.ravel(dev)) >= 20:
            alpha = 0.01
            k2, p = stats.normaltest(np.ravel(dev))
            vadere_logger.info("p = {:g}".format(p))
            if p < alpha:  # null hypothesis: x comes from a normal distribution
                vadere_logger.info("The null hypothesis can be rejected")
            else:
                vadere_logger.info("The null hypothesis cannot be rejected")

        return dev
Exemplo n.º 59
0
            sleep(pause_time)

###############################################################################
# Update Volume and Diff
###############################################################################
if 1:

    diff = cu - cu.shift(1)
    diff = diff.dropna()
    values = diff.eur.values

    step = 20
    alpha = 1e-6
    for i in range(step, values.shape[0], step):
        vals = values[i - step:i]  # normality test over each consecutive window of `step` diffs
        k2, p = stats.normaltest(vals)
        if p < alpha:  # null hypothesis: x comes from a normal distribution
            print('{}: break'.format(i))
            #print("The null hypothesis can be rejected")
        else:
            #print("The null hypothesis cannot be rejected")
            if i % 3000 == 0:
                print(i)
    os.system('say "Completed"')

###############################################################################
# Plot Everything.  Ratios first
###############################################################################
if 1:

    cu.eur.plot(title='cu')
Exemplo n.º 60
0
    # tail of the Anscombe-Glynn kurtosis z-transformation used as Z2(K, N) below
    b = (6.0 * (n**2.0 - 5.0 * n + 2.0)) / ((n + 7.0) * (n + 9.0))
    b *= np.sqrt((6.0 * (n + 3.0) * (n + 5.0)) / (n * (n - 2.0) * (n - 3.0)))
    A = 6.0 + (8.0 / b) * (2.0 / b + np.sqrt(1.0 + 4.0 / b**2.0))
    z = (1.0 - 2.0 / A) / (1.0 + X * np.sqrt(2.0 / (A - 4.0)))
    z = (1.0 - 2.0 / (9.0 * A)) - z**(1.0 / 3.0)
    z /= np.sqrt(2.0 / (9.0 * A))
    return z


K2 = Z1(S, N)**2.0 + Z2(K, N)**2.0
print('Omnibus: {}'.format(K2))

p = 1.0 - stats.chi2(2).cdf(K2)
print('Pr( Omnibus ) = {}'.format(p))

(K2, p) = stats.normaltest(result.resid)
print('Omnibus: {0}, p = {1}'.format(K2, p))

# ---------------------

JB = (N / 6.0) * (S**2.0 + (1.0 / 4.0) * (K - 3.0)**2.0)
p = 1.0 - stats.chi2(2).cdf(JB)
print('JB-statistic: {:.5f},  p-value: {:.5f}'.format(JB, p))
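
# As with the omnibus statistic above, the hand-rolled JB value can be
# cross-checked against a library implementation; a sketch, reusing the same
# `result.resid` residuals (values may differ slightly depending on how S and K
# were estimated):
JB_check, JB_p_check = stats.jarque_bera(result.resid)
print('JB (scipy): {:.5f},  p-value: {:.5f}'.format(JB_check, JB_p_check))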

# ---------------------

X = np.asarray(X)
EV = np.linalg.eig(X @ X.T)
print(EV)
CN = np.sqrt(EV[0].max() / EV[0].min())
print('Condition No.: {:.5f}'.format(CN))