-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.py
104 lines (84 loc) · 3.08 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
import csv
import numpy as np
import matplotlib.pylab as plt
import scipy.stats as st
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.stats import ks_2samp
from scipy.stats import ksone
def theor_cdf(x, rel_scale):
return [(1 - np.exp(-f*rel_scale)) for f in x]
def theor_pdf(x, rel_scale):
return [rel_scale*np.exp(-rel_scale * f) for f in x ]
def read_unix_csv(filename):
data = []
with open(filename, newline='') as csvfile:
reader = csv.reader(csvfile, 'unix')
for row in reader:
for number in row:
data.append(float(number))
return data
def d_stat(data1, data2):
ecdf_1 = ECDF(data1)
x = sorted(data2)
d_min = []
d_max = []
data_len = np.size(data2)
for i in range(data_len):
y = ecdf_1(x[i])
d_min.append((i+1)/data_len - y)
d_max.append(y - i/data_len)
return np.max(d_min), np.max(d_max)
data1 = np.array(read_unix_csv('9_data1.csv'))
plt.figure(50)
mean = data1.mean()
std = data1.std(ddof=1)
rel_scale_coeff = 1 / mean;
print('mean: {}, s: {}, l: {}'.format(mean, std, rel_scale_coeff))
hist, bin_edges = np.histogram(data1, 'sturges')
midpoints = (bin_edges[1:] + bin_edges[:-1])/2
x_tdf = np.arange(0.0, np.max(bin_edges), 0.001)
fig0 = plt.figure(0)
plt.title('Histogram')
plt.xticks(bin_edges)
plt.hist(data1, bins='sturges',color='gray', edgecolor='black')
bins_width = (data1.max() - data1.min()) / (np.size(bin_edges) - 1)
print('bins width: {}'.format(bins_width))
plt.grid(b=True, which='both',axis='y')
#plt.savefig('hist.png', dpi=900)
plt.figure(10)
plt.title('PDF Histogram')
plt.xticks(bin_edges)
hist_norm, _ = np.histogram(data1, 'sturges', density=True)
plt.hist(data1, bins='sturges',color='gray', edgecolor='black', density=True)
plt.plot(midpoints, hist_norm, color='black')
tpdf = theor_pdf(x_tdf, rel_scale_coeff)
plt.plot(x_tdf, tpdf, ls='dashed', color='r')
plt.title('PDF histogram')
plt.grid(b=True, which='both',axis='y')
#plt.savefig('pdf_hist.png', dpi=900)
probs = [x/np.sum(hist) for x in hist]
cdf = np.cumsum(probs, dtype=float)
plt.figure(20)
plt.step(midpoints, cdf, where='post', color='black')
plt.grid(b=True, which='both',axis='both')
plt.xticks(midpoints)
plt.title('ECDF')
tcdf = theor_cdf(x_tdf, rel_scale_coeff)
plt.plot(x_tdf, tcdf, ls='dashed', color='r')
#plt.savefig('ecdf.png', dpi = 900)
ct = np.array([ x *(np.sum(hist)*bins_width) for x in theor_pdf(midpoints, rel_scale_coeff)])
chi2_emp = np.sum(((hist - ct)**2)/ct)
chi2_crit = st.chi2.ppf(0.99, np.size(bin_edges) - 2)
check_hip = chi2_crit > chi2_emp
print('chi2 crit: {}, chi2 emp: {}'.format(chi2_crit, chi2_emp))
data2 = np.array(read_unix_csv('9_data2.csv'))
dmin, dmax = d_stat(data1, data2)
dstat_emp = np.max([dmin, dmax])
ks_crit = ksone.ppf(0.6)
n, m = np.size(data1), np.size(data2)
print('Dm,n : {}, Kcrit: {}'.format(dstat_emp, ks_crit))
print('D * sqrt [mn / (m + n)] : {}'.format(dstat_emp * np.sqrt(m * n / (m + n))))
dstat, pval = ks_2samp(data1, data2)
print(dstat)
print('d1 and d2 is same with prob: ', pval)