/
2 - Linear Regression.py
46 lines (38 loc) · 1.25 KB
/
2 - Linear Regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
__author__ = 'ylucas'
import pandas
from scipy.stats import rankdata
from scipy.stats.mstats import count_tied_groups
import numpy as np
from numpy import ndarray
import numpy.ma as ma
from scipy.lib.six import iteritems
import scipy.special as special
def mann_whitney_u(x, y, use_continuity=True):
    """Compute the Mann-Whitney U statistic and its two-sided p-value.

    The p-value comes from the normal approximation of U, with the variance
    corrected for tied ranks.  The computation was copied out of SciPy
    because calling ``scipy.stats.mannwhitneyu`` directly returned a NaN
    p-value on this script's data.

    Args:
        x: First sample (array-like; masked entries are dropped).
        y: Second sample (array-like; masked entries are dropped).
        use_continuity: When true, subtract 0.5 from U before standardising
            (continuity correction).

    Returns:
        A ``(u, prob)`` tuple: ``u`` is the smaller of the two U statistics
        and ``prob`` is the two-sided p-value.

    Raises:
        ValueError: If either sample is empty once masked entries are dropped.
    """
    x = ma.asarray(x).compressed().view(ndarray)
    y = ma.asarray(y).compressed().view(ndarray)
    nx, ny = len(x), len(y)
    if nx == 0 or ny == 0:
        # Without this guard the variance collapses to zero and the result is
        # a meaningless p-value (or a ZeroDivisionError) instead of an error.
        raise ValueError("Mann-Whitney U requires two non-empty samples")
    nt = nx + ny

    # Rank the pooled observations; tied values share their average rank.
    ranks = rankdata(np.concatenate([x, y]))
    u_x = ranks[:nx].sum() - nx * (nx + 1) / 2.
    u_large = max(u_x, nx * ny - u_x)
    u_small = nx * ny - u_large

    # Mean and tie-corrected variance of U under the null hypothesis.
    mu = nx * ny / 2.
    sigsq = (nt ** 3 - nt) / 12.
    ties = count_tied_groups(ranks)
    # Builtin sum() over dict.items(): np.sum() on a generator is deprecated,
    # and six's iteritems is not needed for a plain dict.  The int() casts
    # keep the cubes in arbitrary-precision Python integers.
    tie_term = sum(int(count) * (int(size) ** 3 - int(size))
                   for size, count in ties.items())
    sigsq -= tie_term / 12.
    sigsq *= nx * ny / float(nt * (nt - 1))

    correction = 0.5 if use_continuity else 0.0
    z = (u_large - correction - mu) / ma.sqrt(sigsq)
    # Two-sided tail probability of the standard normal distribution.
    prob = special.erfc(abs(z) / np.sqrt(2))
    return u_small, prob


if __name__ == "__main__":
    # CSV File location
    csv_file = "data.csv"
    data = pandas.read_csv(csv_file)

    # Split the hourly entries into non-rainy (rain == 0) and rainy
    # (rain == 1) samples.
    x = data['ENTRIESn_hourly'][data['rain'] == 0]
    y = data['ENTRIESn_hourly'][data['rain'] == 1]

    use_continuity = True
    u, prob = mann_whitney_u(x, y, use_continuity=use_continuity)
    # Parenthesised single-argument print works on both Python 2 and 3.
    print("p-value : " + str(prob))