Example #1
 def test_dimension(self, targetDimension, testDimension):
     if targetDimension not in self._dataframe_helper.get_string_columns():
         raise BIException.non_string_column(targetDimension)
     chisquare_result = ChiSquareResult()
     pivot_table = self._data_frame.stat.crosstab(
         "{}".format(targetDimension), testDimension)
     # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect()
     rdd = list(
         chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
     data_matrix = Matrices.dense(pivot_table.count(),
                                  len(pivot_table.columns) - 1, rdd)
     result = Statistics.chiSqTest(data_matrix)
     chisquare_result.set_params(result)
     freq_table = self._get_contingency_table_of_freq(pivot_table,
                                                      need_sorting=True)
     freq_table.set_tables()
     chisquare_result.set_table_result(freq_table)
     # Cramers V Calculation
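     # Cramér's V-style association strength: sqrt(chi2 / (n * t)),
     # where t = min(number of categories in the two columns).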
     stat_value = result.statistic
     n = freq_table.get_total()
     t = min(len(freq_table.column_one_values),
             len(freq_table.column_two_values))
     v_value = math.sqrt(float(stat_value) / (n * float(t)))
     chisquare_result.set_v_value(v_value)
     self._dataframe_helper.add_chisquare_significant_dimension(
         testDimension, v_value)
     return chisquare_result
Example #2
    def test_matrix_independence(self):
        data = [
            40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0,
            12.0
        ]
        chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))

        # Results validated against R command
        # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
        self.assertAlmostEqual(chi.statistic, 21.9958, 4)
        self.assertEqual(chi.degreesOfFreedom, 6)
        self.assertAlmostEqual(chi.pValue, 0.001213, 4)

        # Negative counts
        neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])
        self.assertRaises(IllegalArgumentException, Statistics.chiSqTest,
                          neg_counts)

        # Row sum = 0.0
        row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])
        self.assertRaises(IllegalArgumentException, Statistics.chiSqTest,
                          row_zero)

        # Column sum = 0.0
        col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])
        self.assertRaises(IllegalArgumentException, Statistics.chiSqTest,
                          col_zero)
Example #3
 def test_right_number_of_results(self):
     num_cols = 1001
     sparse_data = [
         LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
         LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
     ]
     chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
     self.assertEqual(len(chi), num_cols)
     self.assertIsNotNone(chi[1000])
Example #5
    def test_goodness_of_fit(self):
        from numpy import inf

        observed = Vectors.dense([4, 6, 5])
        pearson = Statistics.chiSqTest(observed)

        # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
        self.assertEqual(pearson.statistic, 0.4)
        self.assertEqual(pearson.degreesOfFreedom, 2)
        self.assertAlmostEqual(pearson.pValue, 0.8187, 4)

        # Different expected and observed sum
        observed1 = Vectors.dense([21, 38, 43, 80])
        expected1 = Vectors.dense([3, 5, 7, 20])
        pearson1 = Statistics.chiSqTest(observed1, expected1)

        # Results validated against the R command
        # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
        self.assertAlmostEqual(pearson1.statistic, 14.1429, 4)
        self.assertEqual(pearson1.degreesOfFreedom, 3)
        self.assertAlmostEqual(pearson1.pValue, 0.002717, 4)

        # Vectors with different sizes
        observed3 = Vectors.dense([1.0, 2.0, 3.0])
        expected3 = Vectors.dense([1.0, 2.0, 3.0, 4.0])
        self.assertRaises(ValueError, Statistics.chiSqTest, observed3,
                          expected3)

        # Negative counts in observed
        neg_obs = Vectors.dense([1.0, 2.0, 3.0, -4.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, neg_obs,
                          expected1)

        # Count = 0.0 in expected but not observed
        zero_expected = Vectors.dense([1.0, 0.0, 3.0])
        pearson_inf = Statistics.chiSqTest(observed, zero_expected)
        self.assertEqual(pearson_inf.statistic, inf)
        self.assertEqual(pearson_inf.degreesOfFreedom, 2)
        self.assertEqual(pearson_inf.pValue, 0.0)

        # 0.0 in expected and observed simultaneously
        zero_observed = Vectors.dense([2.0, 0.0, 1.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, zero_observed,
                          zero_expected)
Example #6
    def test_goodness_of_fit(self):
        from numpy import inf

        observed = Vectors.dense([4, 6, 5])
        pearson = Statistics.chiSqTest(observed)

        # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
        self.assertEqual(pearson.statistic, 0.4)
        self.assertEqual(pearson.degreesOfFreedom, 2)
        self.assertAlmostEqual(pearson.pValue, 0.8187, 4)

        # Different expected and observed sum
        observed1 = Vectors.dense([21, 38, 43, 80])
        expected1 = Vectors.dense([3, 5, 7, 20])
        pearson1 = Statistics.chiSqTest(observed1, expected1)

        # Results validated against the R command
        # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
        self.assertAlmostEqual(pearson1.statistic, 14.1429, 4)
        self.assertEqual(pearson1.degreesOfFreedom, 3)
        self.assertAlmostEqual(pearson1.pValue, 0.002717, 4)

        # Vectors with different sizes
        observed3 = Vectors.dense([1.0, 2.0, 3.0])
        expected3 = Vectors.dense([1.0, 2.0, 3.0, 4.0])
        self.assertRaises(ValueError, Statistics.chiSqTest, observed3, expected3)

        # Negative counts in observed
        neg_obs = Vectors.dense([1.0, 2.0, 3.0, -4.0])
        self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, neg_obs, expected1)

        # Count = 0.0 in expected but not observed
        zero_expected = Vectors.dense([1.0, 0.0, 3.0])
        pearson_inf = Statistics.chiSqTest(observed, zero_expected)
        self.assertEqual(pearson_inf.statistic, inf)
        self.assertEqual(pearson_inf.degreesOfFreedom, 2)
        self.assertEqual(pearson_inf.pValue, 0.0)

        # 0.0 in expected and observed simultaneously
        zero_observed = Vectors.dense([2.0, 0.0, 1.0])
        self.assertRaises(
            IllegalArgumentException, Statistics.chiSqTest, zero_observed, zero_expected)
Example #7
def corrFilter(df, cols, excludeCols, target):
    # Keep columns whose chi-square test is significant and whose Pearson
    # correlation with the target is non-negligible.
    useFulCol = []
    corrScore = []
    for col in df.select(cols).columns:
        if col not in excludeCols:
            counts = [row[0] for row in df.select(col).collect()]
            if Statistics.chiSqTest(counts).pValue < 0.05:
                colCorr = float(str(df.stat.corr(col, target))[0:5])
                if colCorr > 0.03 or colCorr < -0.03:
                    useFulCol.append(col)
                    corrScore.append(colCorr)
    pearsonTable = pd.DataFrame({'colName': useFulCol, 'pearson': corrScore})
    pearsonTable.sort_values(by='pearson', ascending=False, inplace=True)
    return pearsonTable
Example #8
    def test_measures(self, targetDimension, testMeasure):
        chisquare_result = ChiSquareResult()
        df = self._data_frame.withColumn(
            testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
        measureSummaryDict = dict(df.describe([testMeasure]).toPandas().values)
        if float(measureSummaryDict["count"]) > 10:
            maxval = float(measureSummaryDict["max"])
            minval = float(measureSummaryDict["min"])
            step = (maxval - minval) / 5.0
            splits = [
                math.floor(minval), minval + step, minval + (step * 2),
                minval + (step * 3), minval + (step * 4),
                math.ceil(maxval)
            ]
            bucketizer = Bucketizer(splits=splits,
                                    inputCol=testMeasure,
                                    outputCol="bucketedColumn")
            # bucketedData = bucketizer.transform(df)
            bucketedData = bucketizer.transform(df.na.drop(subset=testMeasure))
            pivot_table = bucketedData.stat.crosstab(
                "{}".format(targetDimension), 'bucketedColumn')
        else:
            pivot_table = df.stat.crosstab("{}".format(targetDimension),
                                           testMeasure)

        rdd = list(
            chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
        data_matrix = Matrices.dense(pivot_table.count(),
                                     len(pivot_table.columns) - 1, rdd)
        result = Statistics.chiSqTest(data_matrix)
        chisquare_result.set_params(result)
        freq_table = self._get_contingency_table_of_freq(pivot_table)
        freq_table.update_col2_names(splits)
        freq_table.set_tables()
        chisquare_result.set_table_result(freq_table)
        # Cramers V Calculation
        stat_value = result.statistic
        n = freq_table.get_total()
        t = min(len(freq_table.column_one_values),
                len(freq_table.column_two_values))

        v_value = math.sqrt(float(stat_value) / (n * float(t)))
        chisquare_result.set_v_value(v_value)
        chisquare_result.set_split_values([float(x) for x in splits])
        # chisquare_result.set_buckeddata(bucketedData)
        return chisquare_result
Example #9
    def test_chi_sq_pearson(self):
        data = [
            LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
            LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
            LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
            LabeledPoint(1.0, Vectors.dense([3.5, 40.0]))
        ]

        for numParts in [2, 4, 6, 8]:
            chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts))
            feature1 = chi[0]
            self.assertEqual(feature1.statistic, 0.75)
            self.assertEqual(feature1.degreesOfFreedom, 2)
            self.assertAlmostEqual(feature1.pValue, 0.6873, 4)

            feature2 = chi[1]
            self.assertEqual(feature2.statistic, 1.5)
            self.assertEqual(feature2.degreesOfFreedom, 3)
            self.assertAlmostEqual(feature2.pValue, 0.6823, 4)
Example #10
    def test_matrix_independence(self):
        data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
        chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))

        # Results validated against R command
        # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
        self.assertAlmostEqual(chi.statistic, 21.9958, 4)
        self.assertEqual(chi.degreesOfFreedom, 6)
        self.assertAlmostEqual(chi.pValue, 0.001213, 4)

        # Negative counts
        neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, neg_counts)

        # Row sum = 0.0
        row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, row_zero)

        # Column sum = 0.0
        col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, col_zero)
Example #12
print "Converting bigrams to sparse vectors in a dataframe for the train set"
t0 = time()
features=dfTrain.map(partial(vectorizeBi,dico=dict_broad.value)).toDF(schema)
features.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[323]:

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
print "Computing the chi vector"
t0 = time()
labeledPoints = features.map(lambda row : LabeledPoint(row.label, row.bigramVectors))
chi = Statistics.chiSqTest(labeledPoints)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[324]:

print "Starting bigram selection,broadcasting the newly created bigram dictionary"
t0 = time()
biSelect = [revDict_broad.value[i] for i,bigram in enumerate(chi) if bigram.pValue <=0.3]
dictSelect = {}
for i,bigram in enumerate(biSelect):
    dictSelect[bigram]=i
dictSel_broad = sc.broadcast(dictSelect)
tt = time() - t0
print "Done in {} second".format(round(tt,3))
Example #13
from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingExample")

    # $example on$
    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3,
                        0.25)  # a vector composed of the frequencies of events

    # compute the goodness of fit. If a second vector to test against
    # is not supplied as a parameter, the test runs against a uniform distribution.
    goodnessOfFitTestResult = Statistics.chiSqTest(vec)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % goodnessOfFitTestResult)

    mat = Matrices.dense(
        3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix

    # conduct Pearson's independence test on the input contingency matrix
    independenceTestResult = Statistics.chiSqTest(mat)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % independenceTestResult)
Example #14
print(summary.variance())
print(summary.numNonzeros())
print(summary.max())
print(summary.min())
print(summary.count())
print(summary.normL1())
print(summary.normL2())

#correlation
x = sc.parallelize(np.random.randn(4, 1))
y = sc.parallelize(np.random.randn(4, 1))
print("Correlation :", str(Statistics.corr(x, y)))

#Chi-square
#For Vector
x = Vectors.dense(np.random.random_sample((5)))
y = Vectors.dense(np.random.random_sample((5)))
chisqr = Statistics.chiSqTest(x, y)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)

# For Matrices
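# Note: when the first argument is a Matrix, chiSqTest runs Pearson's
# independence test on that contingency matrix; the second matrix passed
# below is not used.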
x = Matrices.dense(4, 2, np.random.random_sample((8)))
y = Matrices.dense(4, 2, np.random.random_sample((8)))
chisqr = Statistics.chiSqTest(x, y)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)
Example #15
allFeatures = dfTrain.map(partial(vectorizeBi, dico=dict_broad.value)).toDF(['bigramVectors', 'label']).cache()

tt = time() - t0
print("Done in {} seconds".format(round(tt, 3)))


# In[6]:

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

print "Starting chi square test"
t0 = time()

labeledPoints = allFeatures.map(lambda row : LabeledPoint(row.label, row.bigramVectors)).cache()
chi = Statistics.chiSqTest(labeledPoints)

tt = time() - t0
print "Done in {} second".format(round(tt,3))


## In[20]:
#
#import pandas as pd
#pd.set_option('display.max_colwidth', 30)
#
#records = [(result.statistic, result.pValue) for result in chi]
#index=[revDict_broad.value[i] for i in range(len(revDict_broad.value))]
#chi_df = pd.DataFrame(data=records, index=index, columns=["Statistic","p-value"])
#
#chi_df.sort_values("p-value")
Example #16
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics


sc = SparkContext("local", "Rubbish")

"""
# RDD of Vectors
data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
                       Vectors.dense([4, 5, 0,  3]),
                       Vectors.dense([6, 7, 0,  8])])
"""

# Sample vector composed of the frequencies of events
vect = Vectors.dense([4, 5, 0, 3])

# Summary of the test including the p-value, degrees of freedom,
goodnessOfFitTestResult = Statistics.chiSqTest(vect)

sampleData = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
matrix = Matrices.dense(3,4, sampleData)
# Conduct Pearson's independence test on the input contingency matrix
independenceTestResult = Statistics.chiSqTest(matrix)


# Test statistic, the method used, and the null hypothesis.
print "SINGLE VECTOR FIT: "
print goodnessOfFitTestResult 
## Summary of the test including the p-value, degrees of freedom.
print "INDEPENDENCE TEST RESULT: "
print independenceTestResult
Example #17
from pyspark.mllib.stat import Statistics
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local", "samp")
sqlContext = SQLContext(sc)
data = sqlContext.createDataFrame(
    [(7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
     (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
     (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
    ["id", "features", "clicked"])
# ChiSqSelector is not available in Statistics; chiSqTest on an RDD of
# LabeledPoint gives a per-feature chi-square result instead.
labeledData = data.rdd.map(lambda row: LabeledPoint(row.clicked, row.features))
featureTestResults = Statistics.chiSqTest(labeledData)
Example #18
      .map(lambda x: (x[0], x[1]/daly_date))\
      .sortBy(lambda x: x[0])\
      .map(lambda x: x[1])

    # get Emanuel dataset
    emanuel = lines.filter(lambda x: datetime.strptime(x[2][0:10], '%m/%d/%Y')
                           >= datetime.strptime("5/16/2011", '%m/%d/%Y'))
    emanuel_date = len(
        emanuel.map(lambda x: (x[2][0:2] + x[2][5:10])).distinct().collect())
    emanuel_final = emanuel.map(lambda x: (x[10][0:2], x[2][0:2] + x[2][5:10]))\
      .map(lambda x: (x,1))\
      .reduceByKey(lambda x, y: x+y)\
      .map(lambda x: (x[0][0], x[1]))\
      .reduceByKey(lambda x, y: (x+y))\
      .map(lambda x: (x[0], x[1]/emanuel_date))\
      .sortBy(lambda x: x[0])\
      .map(lambda x: x[1])

    # Convert to vector for Pyspark Chi Squared Formatting
    daly_vec = Vectors.dense(daly_final.collect())
    emanuel_vec = Vectors.dense(emanuel_final.collect())

    # Calculate Chi Squared Stat
    pearson = Statistics.chiSqTest(daly_vec, emanuel_vec)
    output = str(pearson)

    text_file = open("exercise2c.txt", "w")
    text_file.write(output)
    text_file.close()

    sc.stop()
Example #19
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

sc = SparkContext("local", "Rubbish")
"""
# RDD of Vectors
data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
                       Vectors.dense([4, 5, 0,  3]),
                       Vectors.dense([6, 7, 0,  8])])
"""

# Sample vector composed of the frequencies of events
vect = Vectors.dense([4, 5, 0, 3])

# Summary of the test including the p-value, degrees of freedom,
goodnessOfFitTestResult = Statistics.chiSqTest(vect)

sampleData = [
    40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0
]
matrix = Matrices.dense(3, 4, sampleData)
# Conduct Pearson's independence test on the input contingency matrix
independenceTestResult = Statistics.chiSqTest(matrix)

# Test statistic, the method used, and the null hypothesis.
print "SINGLE VECTOR FIT: "
print goodnessOfFitTestResult
## Summary of the test including the p-value, degrees of freedom.
print "INDEPENDENCE TEST RESULT: "
print independenceTestResult
Example #20
f = urllib.request.urlretrieve(url, localfile)

raw_data = sc.textFile('file:///tmp/kddcup.data_10_percent.gz')
csv = raw_data.map(lambda x: x.split(','))
duration = csv.map(lambda x: [int(x[0])])

from pyspark.mllib.stat import Statistics

summary = Statistics.colStats(duration)
summary.mean()[0]
summary.count()

metrics = csv.map(lambda x: [x[0], x[4], x[5]])
metrics.take(2)

Statistics.corr(metrics, method="spearman")

Statistics.corr(metrics, method="pearson")

from pyspark.mllib.linalg import Vectors

visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 0.3)
print(Statistics.chiSqTest(visitors_freq))

visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 8)
print(Statistics.chiSqTest(visitors_freq))

print(Statistics.chiSqTest(duration.collect()))

spark.stop()
Example #21
def findFeatures(inputFileName, outputFileName):

    inpFile = sc.textFile(inputFileName)

    numRows = inpFile.count()

    print('\nRead ', numRows, ' rows from ', inputFileName, '\n')

    print('Print out a few rows read from file')

    print('\n', inpFile.take(5), '\n')

    # Rectangularize the RDD before vectorizing

    # Filter elements to remove quotes to prevent (quote) embedded commas

    countFields = inpFile.map(lambda s: removeEmbeddedCommas(s)).map(
        lambda s: len(s.split(','))).collect()

    print('number of fields in each row (first few): ', countFields[0:4])

    RectangularizationNeeded = False
    maxCount = 0
    maxCountAt = 0

    for i in range(len(countFields)):
        if (countFields[i] > maxCount):
            maxCount = countFields[i]
            maxCountAt = i
        if (i > 0) and (RectangularizationNeeded == False):
            if (countFields[i] != countFields[i - 1]):
                RectangularizationNeeded = True

    if (RectangularizationNeeded == True):
        print('Identified jagged data set; Rectangularization needed')
    else:
        print('Identified rectangular data set')

    print('Inferring longest row(s) has ', maxCount, ' fields at row ',
          maxCountAt)

    inpFileRe = inpFile.map(lambda s: removeEmbeddedCommas(s)).map(
        lambda s: s + ',No Data')
    # remove short rows
    shortFile = inpFileRe.filter(
        lambda row: len(row.split(',')) < maxCount + 1)
    print("Short rows will be filtered out")
    print('\n', shortFile.take(10), '\n')
    # truncate to maxCount+1 columns
    inpFileTr = inpFileRe.filter(
        lambda row: len(row.split(',')) == maxCount + 1)
    print('\n', inpFileTr.take(5), '\n')

    header = inpFileTr.first()
    hL = header.split(',')

    inpFileNh = inpFileTr.filter(lambda row: row != header)

    print('Removed the First row as Header')
    numRows = inpFileNh.count()
    print('number of rows = ', numRows)

    from pyspark.mllib.linalg import Matrix, Matrices
    from pyspark.mllib.linalg import Vector, Vectors

    # parsedData will be org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector]
    parsedData = inpFileNh.map(
        lambda s: Vectors.dense([with0Str(t) for t in s.split(',')]))
    print('\nprint out a few vectors after converting from strings\n')
    print(parsedData.take(5))

    from pyspark.mllib.stat import MultivariateStatisticalSummary, Statistics

    summary = Statistics.colStats(parsedData)

    print('\nprint out summary statistics, for each column\n')

    print('summary.mean')
    print(summary.mean())
    print('summary.variance')
    print(summary.variance())
    print('summary.count')
    print(summary.count())
    print('summary.max')
    print(summary.max())
    print('summary.min')
    print(summary.min())
    print('summary.normL1')
    print(summary.normL1())
    print('summary.normL2')
    print(summary.normL2())
    print('summary.numnonZeros')
    print(summary.numNonzeros())
    print()

    numCols = len(summary.mean())

    typeStrings = [' '] * numCols

    # infer columns where normL1, normL2, mean, variance, max and mean are 0 as non-numeric

    print('Inferring column data types:')

    import math

    for j in range(numCols):
        if ((summary.normL1()[j] == 0.0) and (summary.normL2()[j] == 0.0)
                and (summary.mean()[j] == 0.0)
                and (summary.variance()[j] == 0.0)
                and (summary.max()[j] == 0.0) and (summary.min()[j] == 0.0)):
            typeStrings[j] = 'String'
        else:
            if ((math.trunc(summary.normL1()[j]) == summary.normL1()[j])
                    and (math.trunc(summary.max()[j]) == summary.max()[j])
                    and (math.trunc(summary.min()[j]) == summary.min()[j])):
                typeStrings[j] = 'Int'
            else:
                typeStrings[j] = 'Float'

        print(typeStrings[j], end=',')

    print('\n\n')

    #******************************************************************************
    # take out the 'String' columns before calling Statistics.corr()

    numNumericCols = 0
    for j in range(numCols):
        if (typeStrings[j] != 'String'):
            numNumericCols = numNumericCols + 1

    noStrings = inpFileNh.map(
        lambda s: Vectors.dense(removeStrings(s, numNumericCols)))
    print(noStrings.take(5))

    correlMatrix = Statistics.corr(noStrings, method='pearson')

    print('Computing Correlation Matrix on all columns')
    print(
        'Printing out column names that have correlation coefficient > 0.5 or < -0.5'
    )

    for i in range(numNumericCols):
        for j in range(i):
            if (((correlMatrix[i][j] >= 0.5) or (correlMatrix[i][j] <= -0.5))
                    and (i != j)):
                print(hL[i], hL[j], correlMatrix[i][j])

#******************************************************************************
#******************************************************************************

# create a contingency matrix

    LoLoF = [[0.0 for x in range(numNumericCols)] for y in range(numRows)]

    LoLoF = noStrings.collect()

    pdLinArr = [0.0 for x in range(numNumericCols * numRows)]

    for i in range(numRows):
        for j in range(numNumericCols):
            pdLinArr[i * numNumericCols + j] = abs(LoLoF[i][j])

    mat = Matrices.dense(numRows, numNumericCols, pdLinArr)

    # conduct Pearson's independence test on the input contingency matrix
    print(
        "Computing Pearson's independence test on the input contingency matrix using chi-square test"
    )

    independenceTestResult = Statistics.chiSqTest(mat)

    # summary of the test including the p-value, degrees of freedom
    print('%s\n' % independenceTestResult)

    #*******************************************************************************

    stdDev = [0.0] * numCols

    for j in range(numCols):
        stdDev[j] = math.sqrt(summary.variance()[j])

#*******************************************************************************
#   test for normal distribution using Kolmogorov-Smirnov test
#
    colVec = [0.0] * numRows

    #vecRDD = sc.parallelize(colVec)
    #testResult = Statistics.kolmogorovSmirnovTest(vecRDD, 'norm', 0, 1)
    #print(testResult)

    numericMean = [0.0] * numNumericCols
    numericSD = [0.0] * numNumericCols

    k = 0
    for j in range(numCols):
        if ((summary.mean()[j] != 0.0) and (summary.variance()[j] != 0.0)):
            numericMean[k] = summary.mean()[j]
            numericSD[k] = stdDev[j]
            k = k + 1

    print(
        'Checking if column data is normally distributed using Kolmogorov-Smirnov test'
    )

    for j in range(numNumericCols):
        for i in range(numRows):
            # see https://issues.apache.org/jira/browse/SPARK-20802
            # test fails if data is normally distributed
            # kolmogorovSmirnovTest in pyspark.mllib.stat.Statistics throws net.razorvine.pickle.PickleException
            # when input data is normally distributed (no error when data is not normally distributed)
            colVec[i] = float(i)  # LoLoF[i][j]
        vecRDD = sc.parallelize(colVec)
        print(colVec[0], colVec[numRows - 1], numericMean[j], numericSD[j])
        testResult = Statistics.kolmogorovSmirnovTest(vecRDD, 'norm',
                                                      numericMean[j],
                                                      numericSD[j])
        print(testResult)

#*******************************************************************************
#*******************************************************************************
#
#   estimate kernel densities
#
    from pyspark.mllib.stat import KernelDensity

    # colVec = [0.0]*numRows
    # vecRDD = sc.parallelize(colVec)

    print('Computing kernel densities on all columns using a Bandwidth of 3.0')

    kd = KernelDensity()
    kd.setSample(vecRDD)
    kd.setBandwidth(3.0)

    sAS = int(math.sqrt(numRows))  # sample array size
    samplePoints = [0.0] * sAS
    #samplePoints = [0.0]*numRows

    for i in range(sAS):
        samplePoints[i] = float(i * sAS)
    #for i in range(numRows):
    #   samplePoints[i] = float(i)

    densities = kd.estimate(samplePoints)

    print('Estimating kernel densities')

    print('Print kernel densities at sample points')
    #print('Print kernel densities > 0.01 at sample points')
    for j in range(numNumericCols):
        # print( hL[j])
        for i in range(numRows):
            # see https://issues.apache.org/jira/browse/SPARK-20803
            # KernelDensity.estimate in pyspark.mllib.stat.KernelDensity throws
            # net.razorvine.pickle.PickleException when input data is normally
            # distributed (no error when data is not normally distributed)
            colVec[i] = float(i)  # LoLoF[i][j]
        vecRDD = sc.parallelize(colVec)
        kd = KernelDensity()
        kd.setSample(vecRDD)
        kd.setBandwidth(3.0)
        # Find density estimates for the given values
        densities = kd.estimate(samplePoints)
        for i in range(sAS):
            print(densities[i], end=',')
        print()
        #for i in range(numRows):
        #   if (densities[i] >= 0.01):
        #       print(i, densities[i], end=',')
        print()

#*******************************************************************************

#*******************************************************************************
#
#  compute Skewness and Kurtosis for each numeric column
#
    skew = [0.0] * numNumericCols
    kurt = [0.0] * numNumericCols
    term = 0.0

    k = 0
    for j in range(numCols):
        if (typeStrings[j] != 'String'):
            skew[k] = 0.0
            kurt[k] = 0.0
            # extra work: find Ints
            typeStrings[j] = 'Int'
            meanj = summary.mean()[j]
            for i in range(numRows):
                if ((typeStrings[j] == 'Int')
                        and (math.trunc(LoLoF[i][k]) != LoLoF[i][k])):
                    typeStrings[j] = 'Float'
                term = (LoLoF[i][k] - meanj) / stdDev[j]
                skew[k] = skew[k] + (term * term * term)
                kurt[k] = kurt[k] + (term * term * term * term)
            skew[k] = skew[k] / numRows
            kurt[k] = (kurt[k] / numRows) - 3.0
            k = k + 1

    print('Skewness of columns')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            print(skew[k], end=',')
            k = k + 1
    print()

    print('Kurtosis of columns')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            print(kurt[k], end=',')
            k = k + 1
    print()

    print('Inferring column data types (Text string, Int, Float)')

    # numbers that are Int and non-negative and  "large" are likely to be numeric labels -- keep checking this heuristic
    # columns that are outside Kurtosis limits <-1.2, 3.0> may be numeric labels

    print('Attempting to infer if an Int column is a numeric label')
    print("If all Ints in a column are >= 0 and 'large', it may be numLabel")
    print(
        'If all Ints in a column are >= 0 and excess kurtosis is outside [-1.2, 3.0], it may be numLabel'
    )

    for j in range(numCols):
        if ((typeStrings[j] == 'Int') and (summary.min()[j] >= 0)
                and ((summary.max()[j] > 10000) or (kurt[j] < -1.2) or
                     (kurt[j] > 3.0))):
            print('column ' + str(j) + ' (' + hL[j] + ')' +
                  ' may be a numeric label')
            typeStrings[j] = 'NumLabel'


#******************************************************************************
#******************************************************************************
#
#   Normalize the dataset by shifting by mean and scaling by stdDev
#
    normData = [[0.0 for x in range(numNumericCols)] for y in range(numRows)]
    rowMaxs = [0.0] * numRows
    rowMins = [0.0] * numRows
    rowNormL1s = [0.0] * numRows
    rowNormL2s = [0.0] * numRows
    rowNumZeros = [0] * numRows
    means = [0.0] * numCols

    for j in range(numCols):
        means[j] = summary.mean()[j]

    for i in range(numRows):
        rowMaxs[i] = -999999.0
        rowMins[i] = 999999.0
        rowNumZeros[i] = 0
        rowNormL1s[i] = 0.0
        rowNormL2s[i] = 0.0

        k = 0
        for j in range(numCols):
            if ((typeStrings[j] == 'Int') or (typeStrings[j] == 'Float')):
                normData[i][k] = (LoLoF[i][k] - means[j]) / stdDev[j]
                if (normData[i][k] > rowMaxs[i]):
                    rowMaxs[i] = normData[i][k]
                if (normData[i][k] < rowMins[i]):
                    rowMins[i] = normData[i][k]
                if (normData[i][k] == 0.0):
                    rowNumZeros[i] = rowNumZeros[i] + 1
                if (abs(normData[i][k]) < 100.0):
                    rowNormL1s[i] = rowNormL1s[i] + abs(normData[i][k])
                    rowNormL2s[
                        i] = rowNormL2s[i] + normData[i][k] * normData[i][k]
            # print(i,j,k, LoLoF[i][k], means[j], stdDev[j], normData[i][k], rowNormL1s[i], rowNormL2s[i])
                k = k + 1

    input = open(inputFileName, 'r')
    fileHandle = open('/home/bsrsharma/work/python/rowNormL1L2.csv', 'w')

    # Keep upto 6 columns of identifying info
    if (numCols > 1):
        for j in range(min(5, numCols)):
            fileHandle.write(hL[j])
            fileHandle.write(',')
    fileHandle.write('L1-Norm')
    fileHandle.write(",")
    fileHandle.write('L2-Norm\n')

    s = input.readline()  # don't repeat header

    for i in range(numRows):
        # copy input to output
        s = input.readline()
        LoS = s.split(',')
        for j in range(min(5, numCols)):
            fileHandle.write(LoS[j])
            fileHandle.write(',')
        fileHandle.write('%s' % rowNormL1s[i])
        fileHandle.write(',')
        fileHandle.write('%s' % math.sqrt(rowNormL2s[i]))
        fileHandle.write('\n')

    fileHandle.close()
    input.close()

    print('Wrote ', 'rowNormL1L2.csv')

    input = open(inputFileName, 'r')
    fileHandle = open(outputFileName, 'w')

    # output normalized data
    numCols = numCols - 1
    # write header row
    if (numCols > 1):
        for j in range(numCols - 1):
            fileHandle.write(hL[j])
            fileHandle.write(',')
    fileHandle.write(hL[numCols - 1])
    fileHandle.write('\n')

    s = input.readline()  # don't repeat header

    for i in range(numRows):
        # copy input to output
        s = input.readline()
        LoS = s.split(',')
        k = 0
        for j in range(numCols - 1):
            if (typeStrings[j] == 'String'):
                fileHandle.write(LoS[j])
            else:
                fileHandle.write('%s' % normData[i][k])
                k = k + 1
            fileHandle.write(',')
        if (typeStrings[numCols - 1] == 'String'):
            fileHandle.write(LoS[numCols - 1])
        else:
            fileHandle.write('%s' % normData[i][k])
        fileHandle.write('\n')

    fileHandle.close()
    input.close()

    print('Wrote ', outputFileName, '\n')

    #******************************************************************************

    # compute median for each column

    medians = [0.0] * numNumericCols
    aCol = [0.0] * numRows

    for j in range(numNumericCols):
        for i in range(numRows):
            aCol[i] = LoLoF[i][j]
        aCol.sort()

        medians[j] = aCol[numRows // 2]

    print('medians:')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            print(medians[k], end=',')
            k = k + 1
    print('\n\n')

    # compute histograms for each column

    numBins = int(math.sqrt(numRows))
    histogram = [0] * (numBins + 1)
    binWidth = 0
    mins = [0.0] * numCols
    maxs = [0.0] * numCols

    print('Computing histograms for numeric columns')
    print('choosing ', numBins, ' bins')

    k = 0

    for j in range(numCols):
        mins[j] = summary.min()[j]
        maxs[j] = summary.max()[j]
        if (typeStrings[j] == 'String'):
            print('column ', j, '( ', hL[j], ' ): Text')
        else:
            binWidth = (maxs[j] - mins[j]) / numBins
            for i in range(numBins):
                histogram[i] = 0
            for i in range(numRows):
                histogram[int((LoLoF[i][k] - mins[j]) / binWidth)] += 1
            print('column ', j, '( ', hL[j], ' ):')
            if (typeStrings[j] == 'NumLabel'):
                print('NumLabel')
            for i in range(numBins):
                print(histogram[i], end=',')
            print()
            k = k + 1
    print('\n\n')

    # compute modes

    modes = [0.0] * numNumericCols
    largestBin = 0
    binIndex = 0

    print('modes:')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            largestBin = 0
            binIndex = 0
            for i in range(numBins):
                # pick the bin with the most items
                if (histogram[i] > largestBin):
                    largestBin = histogram[i]
                    binIndex = i
            modes[k] = mins[j] + (maxs[j] - mins[j]) * binIndex / numBins
            print(modes[k], end=',')
            k = k + 1
    print('\n\n')

    return 0
sc = SparkContext(conf=conf)

#sentence = "hello hello world"
#words = sentence.split() # Split sentence into a list of terms
#tf = HashingTF(10000) # Create vectors of size S = 10,000
#tf.transform(words)
#SparseVector(10000, {3065: 1.0, 6861: 2.0})


rdd = sc.wholeTextFiles("dracula.txt").map(lambda name_text: name_text[1].split())
tf = HashingTF(10000)
tfVectors = tf.transform(rdd).cache()


# Compute the IDF, then the TF-IDF vectors
idf = IDF(10000)
idfModel = idf.fit(tfVectors)

tfIdfVectors = idfModel.transform(tfVectors)


Statistics.colStats(tfIdfVectors)

Statistics.corr(tfIdfVectors)
Statistics.corr(tfIdfVectors, method="spearman")

Statistics.corr(tfVectors,tfIdfVectors)

Statistics.chiSqTest(tfIdfVectors)

from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingExample")

    # $example on$
    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)  # a vector composed of the frequencies of events

    # compute the goodness of fit. If a second vector to test against
    # is not supplied as a parameter, the test runs against a uniform distribution.
    goodnessOfFitTestResult = Statistics.chiSqTest(vec)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % goodnessOfFitTestResult)

    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix

    # conduct Pearson's independence test on the input contingency matrix
    independenceTestResult = Statistics.chiSqTest(mat)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % independenceTestResult)

    obs = sc.parallelize(
Example #24
from pyspark.mllib.stat import Statistics
mat = sc.parallelize([
    np.array([10.1, 12.4, 14.5, 16.8, 21]),
    np.array([21.3, 24.2, 35.4, 36.4, 31.7]),
    np.array([21.1, 23., 54., 65., 71.])
])
summary = Statistics.colStats(mat)
summary.mean()
summary.variance()
summary.numNonzeros()
X = sc.parallelize([10.1,12.4,14.5,16.8,21])
Y = sc.parallelize([21.3,24.2,35.4,36.4,31.7])
corr = Statistics.corr(X,Y,method='pearson')
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
vec = Vectors.dense(10.1, 12.4, 14.5, 16.8, 21, 21.3, 24.2, 35.4, 36.4, 31.7)
goodnessTest = Statistics.chiSqTest(vec)
print(goodnessTest)
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",header=None, sep=',')
from pyspark.mllib.regression import LabeledPoint, LinearRegressionModel, LinearRegressionWithSGD
sc.stop()
sc = SparkContext(appName='MLAlgo')
data = sc.textFile("C:\Users\Dell\Documents\winequality.csv") \
       .map(lambda line: line.split(",")) \
    .filter(lambda line: len(line)>1)\
    .map(lambda line: (line[0],line[3],line[2]))\
    .collect()
print (data)
parsed_data = [LabeledPoint(0.0,[14.23,1.71,2.43,15.6]),
              LabeledPoint(0.0,[13.2,1.78,2.14,11.2]),
              LabeledPoint(1.0,[21.3,32.4,3.5,21.4]),
              LabeledPoint(1.0,[12.4,21.4,21.7,32.8]),
Example #25
import sys

from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

def mapper(x):
    return 0, abs(float(x))

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: spark-submit lr.py <input residuals file> <output file>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="Chi Squared residuals")

    lines = sc.textFile(sys.argv[1], 1)

    resid = lines.map(mapper)

    resid = resid.collect()

    residuals = []

    for r in resid:
        residuals.append(r[1])

    vec = Vectors.dense(residuals)

    gft = Statistics.chiSqTest(vec)

    print("%s\n" % gft)

    sc.stop()
Example #26
# now run chi-square
def bin_data(true, pred, num_bins):
    min_val = np.array([true.min(), pred.min()]).min()
    max_val = np.array([true.max(), pred.max()]).max()
    true_binned = np.histogram(true, num_bins, (min_val, max_val))[0]
    pred_binned = np.histogram(pred, num_bins, (min_val, max_val))[0]
    return true_binned, pred_binned


from pyspark.mllib.linalg import Vectors

v, p = bin_data(valuesAndPreds_df['_1'].values, valuesAndPreds_df['_2'], 200)
v = Vectors.dense(v)
p = Vectors.dense(p)
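# Goodness-of-fit test: observed counts are the binned true values (v),
# expected counts are the binned predictions (p).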
pearson = Statistics.chiSqTest(v, p)
pearson.pValue
pearson.statistic

# ================================================================================
#                              Ridge regression
# ================================================================================

# fit a ridge regression model
lr_ridge = LinearRegressionWithSGD.train(temp,
                                         1000,
                                         .2,
                                         intercept=True,
                                         regType='l2')

# evaluate the model on training data
Example #27
]


def parse_interaction_categorical(line):
    line_split = line.split(",")
    clean_line_split = line_split[6:41]
    attack = 1.0
    if line_split[41] == 'normal.':
        attack = 0.0
    return LabeledPoint(attack, np.array([float(x) for x in clean_line_split]))


training_data_categorical = raw_data.map(parse_interaction_categorical)

from pyspark.mllib.stat import Statistics
chi = Statistics.chiSqTest(training_data_categorical)

import pandas as pd
pd.set_option('display.max_colwidth', 30)
records = [(result.statistic, result.pValue) for result in chi]
chi_df = pd.DataFrame(data=records,
                      index=feature_names,
                      columns=["Statistic", "p-value"])
print(chi_df)


def parse_interaction_chi(line):
    line_split = line.split(",")
    # leave_out = [1,2,3,19,20.41]
    clean_line_split = line_split[0:1] + line_split[4:19] + line_split[21:41]
    attack = 1.0