-
Notifications
You must be signed in to change notification settings - Fork 0
/
StatisticalTest.py
107 lines (88 loc) · 4.04 KB
/
StatisticalTest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.feature import VectorIndexer, VectorAssembler
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml.stat import Correlation
class StatisticalTest():
def __init__(self,dataset,features,labelColm):
self.dataset=dataset
self.features=features
self.labelColm=labelColm
def pearsonTest(self):
dataset=self.dataset
labelColm=self.labelColm
features=self.features
labelColm = [labelColm]
allColms = labelColm + features
featureAssembler = VectorAssembler(
inputCols=allColms, outputCol="allColmsVectorized", handleInvalid="skip")
allColmsVectorizedDataset = featureAssembler.transform(dataset)
allColmsVectorizedDataset.show()
r1p = Correlation.corr(allColmsVectorizedDataset, "allColmsVectorized").head()
print("pearson correlation matrix : \n : " + str(r1p[0]))
pearson_matrix = r1p[0].toArray().tolist()
pearsonMatrix = []
for everylist in pearson_matrix:
insideList = []
for listinlist in everylist:
insideList.append(round(listinlist, 4))
pearsonMatrix.append(insideList)
pearson_value_d = []
for x in r1p[0].toArray():
pearson_value_d.append(round(x[0], 4))
# pearson_value_d.append(x[0])
pearson_value = {}
for col, val in zip(allColms, pearson_value_d):
pearson_value[col] = val
print(pearson_value)
#
# r1s = Correlation.corr(allColmsVectorizedDataset, "allColmsVectorized", "spearman").head()
# print(" spearman correlation...: \n" + str(r1s[0]))
result_pearson = {'pearson_value': pearson_value,
'matrix': pearsonMatrix}
return result_pearson
def chiSquareTest(self,categoricalFeatures,maxCategories):
dataset=self.dataset
labelColm=self.labelColm
features=self.features
length = features.__len__()
featureassembler = VectorAssembler(
inputCols=self.features,
outputCol="featuresChiSquare", handleInvalid="skip")
dataset= featureassembler.transform(dataset)
vec_indexer = VectorIndexer(inputCol="featuresChiSquare", outputCol='vecIndexedFeaturesChiSqaure', maxCategories=maxCategories,
handleInvalid="skip").fit(dataset)
categorical_features = vec_indexer.categoryMaps
print("Chose %d categorical features: %s" %
(len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))
dataset = vec_indexer.transform(dataset)
# finalized_data = dataset.select(labelColm, 'vecIndexedFeaturesChiSqaure')
# finalized_data.show()
# using chi selector
selector = ChiSqSelector(numTopFeatures=length, featuresCol="vecIndexedFeaturesChiSqaure",
outputCol="selectedFeatures",
labelCol=labelColm)
result = selector.fit(dataset).transform(dataset)
print("chi2 output with top %d features selected " % selector.getNumTopFeatures())
result.show()
# runnin gfor the chi vallue test
r = ChiSquareTest.test(result, "selectedFeatures", labelColm).head()
p_values = list(r.pValues)
PValues = []
for val in p_values:
PValues.append(round(val, 4))
print(PValues)
dof = list(r.degreesOfFreedom)
stats = list(r.statistics)
statistics = []
for val in stats:
statistics.append(round(val, 4))
print(statistics)
chiSquareDict = {}
for pval, doF, stat, colm in zip(PValues, dof, statistics, categoricalFeatures):
print(pval, doF, stat)
chiSquareDict[colm] = pval, doF, stat
chiSquareDict['summaryName'] = ['pValue', 'DoF', 'statistics']
print(chiSquareDict)
result = {'pvalues': chiSquareDict}
return result