def test_dimension(self, targetDimension, testDimension):
    if targetDimension not in self._dataframe_helper.get_string_columns():
        raise BIException.non_string_column(targetDimension)
    chisquare_result = ChiSquareResult()
    pivot_table = self._data_frame.stat.crosstab("{}".format(targetDimension), testDimension)
    # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect()
    # Transpose the collected crosstab rows and flatten, so the counts are
    # column-major, which is the layout Matrices.dense expects
    rdd = list(chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
    data_matrix = Matrices.dense(pivot_table.count(), len(pivot_table.columns) - 1, rdd)
    result = Statistics.chiSqTest(data_matrix)
    chisquare_result.set_params(result)
    freq_table = self._get_contingency_table_of_freq(pivot_table, need_sorting=True)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)
    # Cramers V Calculation; note this uses min(r, c) rather than the
    # conventional min(r, c) - 1 in the denominator
    stat_value = result.statistic
    n = freq_table.get_total()
    t = min(len(freq_table.column_one_values), len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    self._dataframe_helper.add_chisquare_significant_dimension(testDimension, v_value)
    return chisquare_result
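# A minimal standalone sketch of the textbook Cramer's V (with the usual
# min(r, c) - 1 correction that the method above omits), for comparison.
# The 3x4 table is arbitrary sample data and cramers_v is a hypothetical helper.
import math

from pyspark import SparkContext
from pyspark.mllib.linalg import Matrices
from pyspark.mllib.stat import Statistics

sc = SparkContext("local", "CramersVSketch")


def cramers_v(counts, num_rows, num_cols):
    """Textbook Cramer's V for an r x c contingency table (column-major counts)."""
    result = Statistics.chiSqTest(Matrices.dense(num_rows, num_cols, counts))
    n = sum(counts)
    k = min(num_rows, num_cols) - 1  # the conventional correction
    return math.sqrt(result.statistic / (n * k))


print(cramers_v([40.0, 24.0, 29.0, 56.0, 32.0, 42.0,
                 31.0, 10.0, 0.0, 30.0, 15.0, 12.0], 3, 4))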
def test_matrix_independence(self):
    data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
    chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
    # Results validated against R command
    # `chisq.test(rbind(c(40, 56, 31, 30), c(24, 32, 10, 15), c(29, 42, 0, 12)))`
    self.assertAlmostEqual(chi.statistic, 21.9958, 4)
    self.assertEqual(chi.degreesOfFreedom, 6)
    self.assertAlmostEqual(chi.pValue, 0.001213, 4)

    # Negative counts
    neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])
    self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, neg_counts)

    # Row sum = 0.0
    row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])
    self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, row_zero)

    # Column sum = 0.0
    col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])
    self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, col_zero)
def test_right_number_of_results(self):
    num_cols = 1001
    sparse_data = [
        LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
        LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
    ]
    chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
    self.assertEqual(len(chi), num_cols)
    self.assertIsNotNone(chi[1000])
def test_goodness_of_fit(self):
    from numpy import inf

    observed = Vectors.dense([4, 6, 5])
    pearson = Statistics.chiSqTest(observed)

    # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
    self.assertEqual(pearson.statistic, 0.4)
    self.assertEqual(pearson.degreesOfFreedom, 2)
    self.assertAlmostEqual(pearson.pValue, 0.8187, 4)

    # Different expected and observed sum
    observed1 = Vectors.dense([21, 38, 43, 80])
    expected1 = Vectors.dense([3, 5, 7, 20])
    pearson1 = Statistics.chiSqTest(observed1, expected1)

    # Results validated against the R command
    # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
    self.assertAlmostEqual(pearson1.statistic, 14.1429, 4)
    self.assertEqual(pearson1.degreesOfFreedom, 3)
    self.assertAlmostEqual(pearson1.pValue, 0.002717, 4)

    # Vectors with different sizes
    observed3 = Vectors.dense([1.0, 2.0, 3.0])
    expected3 = Vectors.dense([1.0, 2.0, 3.0, 4.0])
    self.assertRaises(ValueError, Statistics.chiSqTest, observed3, expected3)

    # Negative counts in observed
    neg_obs = Vectors.dense([1.0, 2.0, 3.0, -4.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, neg_obs, expected1)

    # Count = 0.0 in expected but not observed
    zero_expected = Vectors.dense([1.0, 0.0, 3.0])
    pearson_inf = Statistics.chiSqTest(observed, zero_expected)
    self.assertEqual(pearson_inf.statistic, inf)
    self.assertEqual(pearson_inf.degreesOfFreedom, 2)
    self.assertEqual(pearson_inf.pValue, 0.0)

    # 0.0 in expected and observed simultaneously
    zero_observed = Vectors.dense([2.0, 0.0, 1.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, zero_observed, zero_expected)
def test_goodness_of_fit(self):
    from numpy import inf

    observed = Vectors.dense([4, 6, 5])
    pearson = Statistics.chiSqTest(observed)

    # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
    self.assertEqual(pearson.statistic, 0.4)
    self.assertEqual(pearson.degreesOfFreedom, 2)
    self.assertAlmostEqual(pearson.pValue, 0.8187, 4)

    # Different expected and observed sum
    observed1 = Vectors.dense([21, 38, 43, 80])
    expected1 = Vectors.dense([3, 5, 7, 20])
    pearson1 = Statistics.chiSqTest(observed1, expected1)

    # Results validated against the R command
    # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
    self.assertAlmostEqual(pearson1.statistic, 14.1429, 4)
    self.assertEqual(pearson1.degreesOfFreedom, 3)
    self.assertAlmostEqual(pearson1.pValue, 0.002717, 4)

    # Vectors with different sizes
    observed3 = Vectors.dense([1.0, 2.0, 3.0])
    expected3 = Vectors.dense([1.0, 2.0, 3.0, 4.0])
    self.assertRaises(ValueError, Statistics.chiSqTest, observed3, expected3)

    # Negative counts in observed
    neg_obs = Vectors.dense([1.0, 2.0, 3.0, -4.0])
    self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, neg_obs, expected1)

    # Count = 0.0 in expected but not observed
    zero_expected = Vectors.dense([1.0, 0.0, 3.0])
    pearson_inf = Statistics.chiSqTest(observed, zero_expected)
    self.assertEqual(pearson_inf.statistic, inf)
    self.assertEqual(pearson_inf.degreesOfFreedom, 2)
    self.assertEqual(pearson_inf.pValue, 0.0)

    # 0.0 in expected and observed simultaneously
    zero_observed = Vectors.dense([2.0, 0.0, 1.0])
    self.assertRaises(
        IllegalArgumentException, Statistics.chiSqTest, zero_observed, zero_expected)
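# A quick hand check of the first assertion above, in plain Python (no Spark needed):
# against a uniform expectation, each expected cell is 15/3 = 5, so
# chi2 = (4-5)^2/5 + (6-5)^2/5 + (5-5)^2/5 = 0.4.
observed = [4, 6, 5]
expected = [sum(observed) / float(len(observed))] * len(observed)  # [5.0, 5.0, 5.0]
chi_sq = sum((o - e) ** 2 / e for o, e in zip(observed, expected))
print(chi_sq)  # 0.4, matching pearson.statistic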
def corrFilter(df, cols, excludeCols, target):
    usefulCols = []
    corrScores = []
    for col in df.select(cols).columns:
        if col not in excludeCols:
            # Treat the column's values as observed frequencies for a
            # goodness-of-fit test; keep the column only if it is significant
            observed = Vectors.dense([row[0] for row in df.select(col).collect()])
            if Statistics.chiSqTest(observed).pValue < 0.05:
                colCorr = round(df.stat.corr(col, target), 3)
                if colCorr > 0.03 or colCorr < -0.03:
                    usefulCols.append(col)
                    corrScores.append(colCorr)
    pearsonTable = pd.DataFrame({'colName': usefulCols, 'pearson': corrScores})
    pearsonTable.sort_values(by='pearson', ascending=False, inplace=True)
    return pearsonTable
def test_measures(self, targetDimension, testMeasure):
    chisquare_result = ChiSquareResult()
    df = self._data_frame.withColumn(testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
    measureSummaryDict = dict(df.describe([testMeasure]).toPandas().values)
    splits = None
    if float(measureSummaryDict["count"]) > 10:
        maxval = float(measureSummaryDict["max"])
        minval = float(measureSummaryDict["min"])
        step = (maxval - minval) / 5.0
        splits = [math.floor(minval), minval + step, minval + (step * 2),
                  minval + (step * 3), minval + (step * 4), math.ceil(maxval)]
        bucketizer = Bucketizer(splits=splits, inputCol=testMeasure, outputCol="bucketedColumn")
        # bucketedData = bucketizer.transform(df)
        bucketedData = bucketizer.transform(df.na.drop(subset=testMeasure))
        pivot_table = bucketedData.stat.crosstab("{}".format(targetDimension), 'bucketedColumn')
    else:
        pivot_table = df.stat.crosstab("{}".format(targetDimension), testMeasure)
    rdd = list(chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
    data_matrix = Matrices.dense(pivot_table.count(), len(pivot_table.columns) - 1, rdd)
    result = Statistics.chiSqTest(data_matrix)
    chisquare_result.set_params(result)
    freq_table = self._get_contingency_table_of_freq(pivot_table)
    if splits is not None:
        # splits only exists when the measure was bucketed above
        freq_table.update_col2_names(splits)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)
    # Cramers V Calculation
    stat_value = result.statistic
    n = freq_table.get_total()
    t = min(len(freq_table.column_one_values), len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    if splits is not None:
        chisquare_result.set_split_values([float(x) for x in splits])
    # chisquare_result.set_buckeddata(bucketedData)
    return chisquare_result
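# A minimal sketch of how the Bucketizer above maps a numeric column onto
# bucket indices; the splits and column names here are hypothetical.
from pyspark.ml.feature import Bucketizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName("BucketizerSketch").getOrCreate()
df = spark.createDataFrame([(1.0,), (7.5,), (19.9,)], ["value"])
splits = [0.0, 4.0, 8.0, 12.0, 16.0, 20.0]  # five equal-width bins over [0, 20]
bucketizer = Bucketizer(splits=splits, inputCol="value", outputCol="bucketedColumn")
bucketizer.transform(df).show()
# value=1.0 -> bucket 0.0, value=7.5 -> bucket 1.0, value=19.9 -> bucket 4.0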
def test_chi_sq_pearson(self):
    data = [
        LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
        LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
        LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
        LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
        LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
        LabeledPoint(1.0, Vectors.dense([3.5, 40.0]))
    ]

    for numParts in [2, 4, 6, 8]:
        chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts))
        feature1 = chi[0]
        self.assertEqual(feature1.statistic, 0.75)
        self.assertEqual(feature1.degreesOfFreedom, 2)
        self.assertAlmostEqual(feature1.pValue, 0.6873, 4)

        feature2 = chi[1]
        self.assertEqual(feature2.statistic, 1.5)
        self.assertEqual(feature2.degreesOfFreedom, 3)
        self.assertAlmostEqual(feature2.pValue, 0.6823, 4)
def test_matrix_independence(self):
    data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
    chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
    # Results validated against R command
    # `chisq.test(rbind(c(40, 56, 31, 30), c(24, 32, 10, 15), c(29, 42, 0, 12)))`
    self.assertAlmostEqual(chi.statistic, 21.9958, 4)
    self.assertEqual(chi.degreesOfFreedom, 6)
    self.assertAlmostEqual(chi.pValue, 0.001213, 4)

    # Negative counts
    neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, neg_counts)

    # Row sum = 0.0
    row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, row_zero)

    # Column sum = 0.0
    col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, col_zero)
print "Converting bigrams to sparse vectors in a dataframe for the train set" t0 = time() features=dfTrain.map(partial(vectorizeBi,dico=dict_broad.value)).toDF(schema) features.take(1) tt = time() - t0 print "Done in {} second".format(round(tt,3)) # In[323]: from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.stat import Statistics print "Computing the chi vector" t0 = time() labeledPoints = features.map(lambda row : LabeledPoint(row.label, row.bigramVectors)) chi = Statistics.chiSqTest(labeledPoints) tt = time() - t0 print "Done in {} second".format(round(tt,3)) # In[324]: print "Starting bigram selection,broadcasting the newly created bigram dictionary" t0 = time() biSelect = [revDict_broad.value[i] for i,bigram in enumerate(chi) if bigram.pValue <=0.3] dictSelect = {} for i,bigram in enumerate(biSelect): dictSelect[bigram]=i dictSel_broad = sc.broadcast(dictSelect) tt = time() - t0 print "Done in {} second".format(round(tt,3))
print(summary.variance())
print(summary.numNonzeros())
print(summary.max())
print(summary.min())
print(summary.count())
print(summary.normL1())
print(summary.normL2())

# correlation
x = sc.parallelize(np.random.randn(4, 1))
y = sc.parallelize(np.random.randn(4, 1))
print("Correlation :", str(Statistics.corr(x, y)))

# Chi-square
# For a Vector: chiSqTest(observed, expected) is a goodness-of-fit test
x = Vectors.dense(np.random.random_sample((5)))
y = Vectors.dense(np.random.random_sample((5)))
chisqr = Statistics.chiSqTest(x, y)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)

# For a Matrix: chiSqTest takes a single contingency matrix (there is no
# expected-matrix argument) and runs Pearson's independence test on it
x = Matrices.dense(4, 2, np.random.random_sample((8)))
chisqr = Statistics.chiSqTest(x)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)
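# For completeness, a minimal sketch of the third input form chiSqTest accepts:
# an RDD of LabeledPoint, which tests each feature against the label.
# The data below is made up.
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

sc = SparkContext("local", "ChiSqRDDSketch")
obs = sc.parallelize([LabeledPoint(1.0, Vectors.dense([1.0, 0.0, 3.0])),
                      LabeledPoint(1.0, Vectors.dense([1.0, 2.0, 0.0])),
                      LabeledPoint(0.0, Vectors.dense([-1.0, 0.0, -0.5]))])
for i, result in enumerate(Statistics.chiSqTest(obs)):
    print("Column %d p-value: %g" % (i + 1, result.pValue))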
allFeatures = dfTrain.map(partial(vectorizeBi, dico=dict_broad.value)).toDF(['bigramVectors', 'label']).cache()
tt = time() - t0
print "Done in {} second".format(round(tt, 3))

# In[6]:

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

print "Starting chi square test"
t0 = time()
labeledPoints = allFeatures.map(lambda row: LabeledPoint(row.label, row.bigramVectors)).cache()
chi = Statistics.chiSqTest(labeledPoints)
tt = time() - t0
print "Done in {} second".format(round(tt, 3))

## In[20]:
#
#import pandas as pd
#pd.set_option('display.max_colwidth', 30)
#
#records = [(result.statistic, result.pValue) for result in chi]
#index = [revDict_broad.value[i] for i in range(len(revDict_broad.value))]
#chi_df = pd.DataFrame(data=records, index=index, columns=["Statistic", "p-value"])
#
#chi_df.sort_values("p-value")
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

sc = SparkContext("local", "samp")
sqlContext = SQLContext(sc)
data = sqlContext.createDataFrame(
    [(7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
     (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
     (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
    ["id", "features", "clicked"])

# ChiSqSelector is not available in Statistics; chiSqTest is used instead.
# chiSqTest expects an RDD of LabeledPoint rather than a DataFrame, so the
# rows are converted before running the test.
labeled = data.rdd.map(lambda row: LabeledPoint(row.clicked, row.features))
selector = Statistics.chiSqTest(labeled)
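# As the comment above notes, mllib's Statistics has no ChiSqSelector. If a
# DataFrame-native chi-square test is wanted instead of converting to an RDD of
# LabeledPoint, Spark 2.2+ also offers pyspark.ml.stat.ChiSquareTest; a sketch
# with the same toy rows (assumes Spark >= 2.2):
from pyspark.ml.linalg import Vectors as MLVectors
from pyspark.ml.stat import ChiSquareTest
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName("ChiSquareTestSketch").getOrCreate()
df = spark.createDataFrame(
    [(MLVectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
     (MLVectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
     (MLVectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
    ["features", "clicked"])
result = ChiSquareTest.test(df, "features", "clicked").head()
print("pValues: " + str(result.pValues))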
    .map(lambda x: (x[0], x[1]/daly_date))\
    .sortBy(lambda x: x[0])\
    .map(lambda x: x[1])

# get Emanuel dataset
emanuel = lines.filter(lambda x: datetime.strptime(x[2][0:10], '%m/%d/%Y') >=
                       datetime.strptime("5/16/2011", '%m/%d/%Y'))
emanuel_date = len(emanuel.map(lambda x: (x[2][0:2] + x[2][5:10])).distinct().collect())
emanuel_final = emanuel.map(lambda x: (x[10][0:2], x[2][0:2] + x[2][5:10]))\
    .map(lambda x: (x, 1))\
    .reduceByKey(lambda x, y: x + y)\
    .map(lambda x: (x[0][0], x[1]))\
    .reduceByKey(lambda x, y: (x + y))\
    .map(lambda x: (x[0], x[1]/emanuel_date))\
    .sortBy(lambda x: x[0])\
    .map(lambda x: x[1])

# Convert to vector for Pyspark Chi Squared Formatting
daly_vec = Vectors.dense(daly_final.collect())
emanuel_vec = Vectors.dense(emanuel_final.collect())

# Calculate Chi Squared Stat
pearson = Statistics.chiSqTest(daly_vec, emanuel_vec)

output = str(pearson)
text_file = open("exercise2c.txt", "w")
text_file.write(output)

sc.stop()
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

sc = SparkContext("local", "Rubbish")

"""
# RDD of Vectors
data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
                       Vectors.dense([4, 5, 0, 3]),
                       Vectors.dense([6, 7, 0, 8])])
"""

# Sample vector composed of frequencies of events
vect = Vectors.dense([4, 5, 0, 3])
# Summary of the test including the p-value, degrees of freedom
goodnessOfFitTestResult = Statistics.chiSqTest(vect)

sampleData = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
matrix = Matrices.dense(3, 4, sampleData)
# Conduct Pearson's independence test on the input contingency matrix
independenceTestResult = Statistics.chiSqTest(matrix)

# Test statistic, the method used, and the null hypothesis.
print "SINGLE VECTOR FIT: "
print goodnessOfFitTestResult

## Summary of the test including the p-value, degrees of freedom.
print "INDEPENDENCE TEST RESULT: "
print independenceTestResult
f = urllib.request.urlretrieve(url, localfile)

raw_data = sc.textFile('file:///tmp/kddcup.data_10_percent.gz')
csv = raw_data.map(lambda x: x.split(','))
# duration is the first field of each parsed row, kept as a one-element list
duration = csv.map(lambda x: [int(x[0])])

from pyspark.mllib.stat import Statistics

summary = Statistics.colStats(duration)
summary.mean()[0]
summary.count()

metrics = csv.map(lambda x: [float(x[0]), float(x[4]), float(x[5])])
metrics.take(2)
Statistics.corr(metrics, method="spearman")
Statistics.corr(metrics, method="pearson")

from pyspark.mllib.linalg import Vectors

visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 0.3)
print(Statistics.chiSqTest(visitors_freq))
visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 8)
print(Statistics.chiSqTest(visitors_freq))
# Flatten the collected one-element lists into a plain vector of counts
print(Statistics.chiSqTest(Vectors.dense([x[0] for x in duration.collect()])))

sc.stop()
def findFeatures(inputFileName, outputFileName):
    inpFile = sc.textFile(inputFileName)
    numRows = inpFile.count()
    print('\nRead ', numRows, ' rows from ', inputFileName, '\n')
    print('Print out a few rows read from file')
    print('\n', inpFile.take(5), '\n')

    # Rectangularize the RDD before vectorizing
    # Filter elements to remove quotes to prevent (quote) embedded commas
    countFields = inpFile.map(lambda s: removeEmbeddedCommas(s)).map(
        lambda s: len(s.split(','))).collect()
    print('number of fields in each row (first few): ', countFields[0:4])
    RectangularizationNeeded = False
    maxCount = 0
    maxCountAt = 0
    for i in range(len(countFields)):
        if (countFields[i] > maxCount):
            maxCount = countFields[i]
            maxCountAt = i
        if (i > 0) and (RectangularizationNeeded == False):
            if (countFields[i] != countFields[i - 1]):
                RectangularizationNeeded = True
    if (RectangularizationNeeded == True):
        print('Identified jagged data set; Rectangularization needed')
    else:
        print('Identified rectangular data set')
    print('Inferring longest row(s) has ', maxCount, ' fields at row ', maxCountAt)

    inpFileRe = inpFile.map(lambda s: removeEmbeddedCommas(s)).map(
        lambda s: s + ',No Data')
    # remove short rows
    shortFile = inpFileRe.filter(lambda row: len(row.split(',')) < maxCount + 1)
    print("Short rows will be filtered out")
    print('\n', shortFile.take(10), '\n')
    # truncate to maxCount+1 columns
    inpFileTr = inpFileRe.filter(lambda row: len(row.split(',')) == maxCount + 1)
    print('\n', inpFileTr.take(5), '\n')

    header = inpFileTr.first()
    hL = header.split(',')
    inpFileNh = inpFileTr.filter(lambda row: row != header)
    print('Removed the First row as Header')
    numRows = inpFileNh.count()
    print('number of rows = ', numRows)

    from pyspark.mllib.linalg import Matrix, Matrices
    from pyspark.mllib.linalg import Vector, Vectors

    # parsedData will be org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector]
    parsedData = inpFileNh.map(
        lambda s: Vectors.dense([with0Str(t) for t in s.split(',')]))
    print('\nprint out a few vectors after converting from strings\n')
    print(parsedData.take(5))

    from pyspark.mllib.stat import MultivariateStatisticalSummary, Statistics
    summary = Statistics.colStats(parsedData)
    print('\nprint out summary statistics, for each column\n')
    print('summary.mean')
    print(summary.mean())
    print('summary.variance')
    print(summary.variance())
    print('summary.count')
    print(summary.count())
    print('summary.max')
    print(summary.max())
    print('summary.min')
    print(summary.min())
    print('summary.normL1')
    print(summary.normL1())
    print('summary.normL2')
    print(summary.normL2())
    print('summary.numNonzeros')
    print(summary.numNonzeros())
    print()

    numCols = len(summary.mean())
    typeStrings = [' '] * numCols
    # infer columns where normL1, normL2, mean, variance, max and min are 0 as non-numeric
    print('Inferring column data types:')
    import math
    for j in range(numCols):
        if ((summary.normL1()[j] == 0.0) and (summary.normL2()[j] == 0.0)
                and (summary.mean()[j] == 0.0) and (summary.variance()[j] == 0.0)
                and (summary.max()[j] == 0.0) and (summary.min()[j] == 0.0)):
            typeStrings[j] = 'String'
        else:
            if ((math.trunc(summary.normL1()[j]) == summary.normL1()[j])
                    and (math.trunc(summary.max()[j]) == summary.max()[j])
                    and (math.trunc(summary.min()[j]) == summary.min()[j])):
                typeStrings[j] = 'Int'
            else:
                typeStrings[j] = 'Float'
        print(typeStrings[j], end=',')
    print('\n\n')

    #******************************************************************************
    # take out the 'String' columns before calling Statistics.corr()
    numNumericCols = 0
    for j in range(numCols):
        if (typeStrings[j] != 'String'):
            numNumericCols = numNumericCols + 1
    noStrings = inpFileNh.map(
        lambda s: Vectors.dense(removeStrings(s, numNumericCols)))
    print(noStrings.take(5))
    correlMatrix = Statistics.corr(noStrings, method='pearson')
    print('Computing Correlation Matrix on all columns')
    print('Printing out column names that have correlation coefficient > 0.5 or < -0.5')
    for i in range(numNumericCols):
        for j in range(i):
            if (((correlMatrix[i][j] >= 0.5) or (correlMatrix[i][j] <= -0.5))
                    and (i != j)):
                print(hL[i], hL[j], correlMatrix[i][j])
    #******************************************************************************

    #******************************************************************************
    # create a contingency matrix
    LoLoF = [[0.0 for x in range(numNumericCols)] for y in range(numRows)]
    LoLoF = noStrings.collect()
    pdLinArr = [0.0 for x in range(numNumericCols * numRows)]
    for i in range(numRows):
        for j in range(numNumericCols):
            # Matrices.dense expects column-major data
            pdLinArr[j * numRows + i] = abs(LoLoF[i][j])
    mat = Matrices.dense(numRows, numNumericCols, pdLinArr)
    # conduct Pearson's independence test on the input contingency matrix
    print("Computing Pearson's independence test on the input contingency matrix using chi-square test")
    independenceTestResult = Statistics.chiSqTest(mat)
    # summary of the test including the p-value, degrees of freedom
    print('%s\n' % independenceTestResult)
    #*******************************************************************************

    stdDev = [0.0] * numCols
    for j in range(numCols):
        stdDev[j] = math.sqrt(summary.variance()[j])

    #*******************************************************************************
    # test for normal distribution using Kolmogorov-Smirnov test
    colVec = [0.0] * numRows
    #vecRDD = sc.parallelize(colVec)
    #testResult = Statistics.kolmogorovSmirnovTest(vecRDD, 'norm', 0, 1)
    #print(testResult)
    numericMean = [0.0] * numNumericCols
    numericSD = [0.0] * numNumericCols
    k = 0
    for j in range(numCols):
        if ((summary.mean()[j] != 0.0) and (summary.variance()[j] != 0.0)):
            numericMean[k] = summary.mean()[j]
            numericSD[k] = stdDev[j]
            k = k + 1
    print('Checking if column data is normally distributed using Kolmogorov-Smirnov test')
    for j in range(numNumericCols):
        for i in range(numRows):
            # see https://issues.apache.org/jira/browse/SPARK-20802
            # test fails if data is normally distributed:
            # kolmogorovSmirnovTest in pyspark.mllib.stat.Statistics throws
            # net.razorvine.pickle.PickleException when input data is normally
            # distributed (no error when data is not normally distributed)
            colVec[i] = float(i)  # LoLoF[i][j]
        vecRDD = sc.parallelize(colVec)
        print(colVec[0], colVec[numRows - 1], numericMean[j], numericSD[j])
        testResult = Statistics.kolmogorovSmirnovTest(vecRDD, 'norm',
                                                      numericMean[j], numericSD[j])
        print(testResult)
    #*******************************************************************************

    #*******************************************************************************
    # estimate kernel densities
    from pyspark.mllib.stat import KernelDensity
    # colVec = [0.0]*numRows
    # vecRDD = sc.parallelize(colVec)
    print('Computing kernel densities on all columns using a Bandwidth of 3.0')
    kd = KernelDensity()
    kd.setSample(vecRDD)
    kd.setBandwidth(3.0)
    sAS = int(math.sqrt(numRows))  # sample array size
    samplePoints = [0.0] * sAS
    #samplePoints = [0.0]*numRows
    for i in range(sAS):
        samplePoints[i] = float(i * sAS)
    #for i in range(numRows):
    #    samplePoints[i] = float(i)
    densities = kd.estimate(samplePoints)
    print('Estimating kernel densities')
    print('Print kernel densities at sample points')
    #print('Print kernel densities > 0.01 at sample points')
    for j in range(numNumericCols):
        # print(hL[j])
        for i in range(numRows):
            # see https://issues.apache.org/jira/browse/SPARK-20803
            # KernelDensity.estimate in pyspark.mllib.stat.KernelDensity throws
            # net.razorvine.pickle.PickleException when input data is normally
            # distributed (no error when data is not normally distributed)
            colVec[i] = float(i)  # LoLoF[i][j]
        vecRDD = sc.parallelize(colVec)
        kd = KernelDensity()
        kd.setSample(vecRDD)
        kd.setBandwidth(3.0)
        # Find density estimates for the given values
        densities = kd.estimate(samplePoints)
        for i in range(sAS):
            print(densities[i], end=',')
        print()
        #for i in range(numRows):
        #    if (densities[i] >= 0.01):
        #        print(i, densities[i], end=',')
    print()
    #*******************************************************************************

    #*******************************************************************************
    # compute Skewness and Kurtosis for each numeric column
    skew = [0.0] * numNumericCols
    kurt = [0.0] * numNumericCols
    term = 0.0
    k = 0
    for j in range(numCols):
        if (typeStrings[j] != 'String'):
            skew[k] = 0.0
            kurt[k] = 0.0
            # extra work: find Ints
            typeStrings[j] = 'Int'
            meanj = summary.mean()[j]
            for i in range(numRows):
                if ((typeStrings[j] == 'Int')
                        and (math.trunc(LoLoF[i][k]) != LoLoF[i][k])):
                    typeStrings[j] = 'Float'
                term = (LoLoF[i][k] - meanj) / stdDev[j]
                skew[k] = skew[k] + (term * term * term)
                kurt[k] = kurt[k] + (term * term * term * term)
            skew[k] = skew[k] / numRows
            kurt[k] = (kurt[k] / numRows) - 3.0
            k = k + 1
    print('Skewness of columns')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            print(skew[k], end=',')
            k = k + 1
    print()
    print('Kurtosis of columns')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            print(kurt[k], end=',')
            k = k + 1
    print()

    print('Inferring column data types (Text string, Int, Float)')
    # numbers that are Int and non-negative and "large" are likely to be
    # numeric labels; keep checking this heuristic
    # columns that are outside Kurtosis limits <-1.2, 3.0> may be numeric labels
    print('Attempting to infer if an Int column is a numeric label')
    print("If all Ints in a column are >= 0 and 'large', it may be numLabel")
    print('If all Ints in a column are >= 0 and excess kurtosis is outside [-1.2, 3.0], it may be numLabel')
    k = 0
    for j in range(numCols):
        # kurt is indexed by the numeric-column counter k, not the full-column index j
        if ((typeStrings[j] == 'Int') and (summary.min()[j] >= 0)
                and ((summary.max()[j] > 10000) or (kurt[k] < -1.2) or (kurt[k] > 3.0))):
            print('column ' + str(j) + ' (' + hL[j] + ') may be a numeric label')
            typeStrings[j] = 'NumLabel'
        if (typeStrings[j] != 'String'):
            k = k + 1
    #******************************************************************************

    #******************************************************************************
    # Normalize the dataset by shifting by mean and scaling by stdDev
    normData = [[0.0 for x in range(numNumericCols)] for y in range(numRows)]
    rowMaxs = [0.0] * numRows
    rowMins = [0.0] * numRows
    rowNormL1s = [0.0] * numRows
    rowNormL2s = [0.0] * numRows
    rowNumZeros = [0] * numRows
    means = [0.0] * numCols
    for j in range(numCols):
        means[j] = summary.mean()[j]
    for i in range(numRows):
        rowMaxs[i] = -999999.0
        rowMins[i] = 999999.0
        rowNumZeros[i] = 0
        rowNormL1s[i] = 0.0
        rowNormL2s[i] = 0.0
        k = 0
        for j in range(numCols):
            if ((typeStrings[j] == 'Int') or (typeStrings[j] == 'Float')):
                normData[i][k] = (LoLoF[i][k] - means[j]) / stdDev[j]
                if (normData[i][k] > rowMaxs[i]):
                    rowMaxs[i] = normData[i][k]
                if (normData[i][k] < rowMins[i]):
                    rowMins[i] = normData[i][k]
                if (normData[i][k] == 0.0):
                    rowNumZeros[i] = rowNumZeros[i] + 1
                if (abs(normData[i][k]) < 100.0):
                    rowNormL1s[i] = rowNormL1s[i] + abs(normData[i][k])
                    rowNormL2s[i] = rowNormL2s[i] + normData[i][k] * normData[i][k]
                # print(i, j, k, LoLoF[i][k], means[j], stdDev[j], normData[i][k], rowNormL1s[i], rowNormL2s[i])
                k = k + 1

    input = open(inputFileName, 'r')
    fileHandle = open('/home/bsrsharma/work/python/rowNormL1L2.csv', 'w')
    # Keep up to 5 columns of identifying info
    if (numCols > 1):
        for j in range(min(5, numCols)):
            fileHandle.write(hL[j])
            fileHandle.write(',')
    fileHandle.write('L1-Norm')
    fileHandle.write(",")
    fileHandle.write('L2-Norm\n')
    s = input.readline()  # don't repeat header
    for i in range(numRows):
        # copy input to output
        s = input.readline()
        LoS = s.split(',')
        for j in range(min(5, numCols)):
            fileHandle.write(LoS[j])
            fileHandle.write(',')
        fileHandle.write('%s' % rowNormL1s[i])
        fileHandle.write(',')
        fileHandle.write('%s' % math.sqrt(rowNormL2s[i]))
        fileHandle.write('\n')
    fileHandle.close()
    input.close()
    print('Wrote ', 'rowNormL1L2.csv')

    input = open(inputFileName, 'r')
    fileHandle = open(outputFileName, 'w')
    # output normalized data
    numCols = numCols - 1
    # write header row
    if (numCols > 1):
        for j in range(numCols - 1):
            fileHandle.write(hL[j])
            fileHandle.write(',')
        fileHandle.write(hL[numCols - 1])
        fileHandle.write('\n')
    s = input.readline()  # don't repeat header
    for i in range(numRows):
        # copy input to output
        s = input.readline()
        LoS = s.split(',')
        k = 0
        for j in range(numCols - 1):
            if (typeStrings[j] == 'String'):
                fileHandle.write(LoS[j])
            else:
                fileHandle.write('%s' % normData[i][k])
                k = k + 1
            fileHandle.write(',')
        if (typeStrings[numCols - 1] == 'String'):
            fileHandle.write(LoS[numCols - 1])
        else:
            fileHandle.write('%s' % normData[i][k])
        fileHandle.write('\n')
    fileHandle.close()
    input.close()
    print('Wrote ', outputFileName, '\n')
    #******************************************************************************

    # compute median for each column
    medians = [0.0] * numNumericCols
    aCol = [0.0] * numRows
    for j in range(numNumericCols):
        for i in range(numRows):
            aCol[i] = LoLoF[i][j]
        aCol.sort()
        medians[j] = aCol[numRows // 2]
    print('medians:')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            print(medians[k], end=',')
            k = k + 1
    print('\n\n')

    # compute histograms for each column
    numBins = int(math.sqrt(numRows))
    histogram = [0] * (numBins + 1)
    binWidth = 0
    mins = [0.0] * numCols
    maxs = [0.0] * numCols
    print('Computing histograms for numeric columns')
    print('choosing ', numBins, ' bins')
    k = 0
    for j in range(numCols):
        mins[j] = summary.min()[j]
        maxs[j] = summary.max()[j]
        if (typeStrings[j] == 'String'):
            print('column ', j, '( ', hL[j], ' ): Text')
        else:
            binWidth = (maxs[j] - mins[j]) / numBins
            for i in range(numBins):
                histogram[i] = 0
            for i in range(numRows):
                histogram[int((LoLoF[i][k] - mins[j]) / binWidth)] += 1
            print('column ', j, '( ', hL[j], ' ):')
            if (typeStrings[j] == 'NumLabel'):
                print('NumLabel')
            for i in range(numBins):
                print(histogram[i], end=',')
            print()
            k = k + 1
    print('\n\n')

    # compute modes
    modes = [0.0] * numNumericCols
    largestBin = 0
    binIndex = 0
    print('modes:')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            largestBin = 0
            binIndex = 0
            for i in range(numBins):
                # pick the bin with most items
                if (histogram[i] > largestBin):
                    largestBin = histogram[i]
                    binIndex = i
            modes[k] = mins[j] + (maxs[j] - mins[j]) * binIndex / numBins
            print(modes[k], end=',')
            k = k + 1
    print('\n\n')
    return 0
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.stat import Statistics

sc = SparkContext(conf=conf)

#sentence = "hello hello world"
#words = sentence.split()  # Split sentence into a list of terms
#tf = HashingTF(10000)  # Create vectors of size S = 10,000
#tf.transform(words)
#SparseVector(10000, {3065: 1.0, 6861: 2.0})

rdd = sc.wholeTextFiles("dracula.txt").map(lambda (name, text): text.split())
tf = HashingTF(10000)
tfVectors = tf.transform(rdd).cache()

# Compute the IDF, then the TF-IDF vectors
idf = IDF()  # IDF() takes a minimum document frequency, not a vector size
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)

Statistics.colStats(tfIdfVectors)
Statistics.corr(tfIdfVectors)                     # correlation matrix over an RDD of Vectors
Statistics.corr(tfIdfVectors, method="spearman")  # method must be passed by keyword
# corr(x, y) expects two RDDs of doubles, and chiSqTest over an RDD expects
# LabeledPoint records, so the following fail as written on RDDs of Vectors:
#Statistics.corr(tfVectors, tfIdfVectors)
#Statistics.chiSqTest(tfIdfVectors)
from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingExample")

    # $example on$
    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)  # a vector composed of the frequencies of events

    # compute the goodness of fit. If a second vector to test against
    # is not supplied as a parameter, the test runs against a uniform distribution.
    goodnessOfFitTestResult = Statistics.chiSqTest(vec)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % goodnessOfFitTestResult)

    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix

    # conduct Pearson's independence test on the input contingency matrix
    independenceTestResult = Statistics.chiSqTest(mat)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % independenceTestResult)

    obs = sc.parallelize(
        [LabeledPoint(1.0, [1.0, 0.0, 3.0]),
         LabeledPoint(1.0, [1.0, 2.0, 0.0]),
         LabeledPoint(-1.0, [-1.0, 0.0, -0.5])]
    )  # LabeledPoint(label, feature)

    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
    # the independence test. Returns an array containing the ChiSquaredTestResult for every
    # feature against the label.
    featureTestResults = Statistics.chiSqTest(obs)
    for i, result in enumerate(featureTestResults):
        print("Column %d:\n%s" % (i + 1, result))
    # $example off$

    sc.stop()
import numpy as np
import pandas as pd

from pyspark.mllib.stat import Statistics

mat = sc.parallelize([np.array([10.1, 12.4, 14.5, 16.8, 21]),
                      np.array([21.3, 24.2, 35.4, 36.4, 31.7]),
                      np.array([21.1, 23., 54., 65., 71.])])
summary = Statistics.colStats(mat)
summary.mean()
summary.variance()
summary.numNonzeros()

X = sc.parallelize([10.1, 12.4, 14.5, 16.8, 21])
Y = sc.parallelize([21.3, 24.2, 35.4, 36.4, 31.7])
corr = Statistics.corr(X, Y, method='pearson')

from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

vec = Vectors.dense(10.1, 12.4, 14.5, 16.8, 21, 21.3, 24.2, 35.4, 36.4, 31.7)
goodnestest = Statistics.chiSqTest(vec)
print(goodnestest)

data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
                   header=None, sep=',')

from pyspark.mllib.regression import LabeledPoint, LinearRegressionModel, LinearRegressionWithSGD

sc.stop()
sc = SparkContext(appName='MLAlgo')
data = sc.textFile(r"C:\Users\Dell\Documents\winequality.csv") \
    .map(lambda line: line.split(",")) \
    .filter(lambda line: len(line) > 1) \
    .map(lambda line: (line[0], line[3], line[2])) \
    .collect()
print(data)

parsed_data = [LabeledPoint(0.0, [14.23, 1.71, 2.43, 15.6]),
               LabeledPoint(0.0, [13.2, 1.78, 2.14, 11.2]),
               LabeledPoint(1.0, [21.3, 32.4, 3.5, 21.4]),
               LabeledPoint(1.0, [12.4, 21.4, 21.7, 32.8]),
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics


def mapper(x):
    return 0, abs(float(x))


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: spark-submit lr.py <input residuals file> <output file>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="Chi Squared residuals")
    lines = sc.textFile(sys.argv[1], 1)
    resid = lines.map(mapper)
    resid = resid.collect()
    residuals = []
    for r in resid:
        residuals.append(r[1])

    # Goodness-of-fit test of the absolute residuals against a uniform distribution
    vec = Vectors.dense(residuals)
    gft = Statistics.chiSqTest(vec)
    print("%s\n" % gft)
    sc.stop()
# now run chi-square
def bin_data(true, pred, num_bins):
    min_val = np.array([true.min(), pred.min()]).min()
    max_val = np.array([true.max(), pred.max()]).max()
    true_binned = np.histogram(true, num_bins, (min_val, max_val))[0]
    pred_binned = np.histogram(pred, num_bins, (min_val, max_val))[0]
    return true_binned, pred_binned

from pyspark.mllib.linalg import Vectors

v, p = bin_data(valuesAndPreds_df['_1'].values, valuesAndPreds_df['_2'].values, 200)
v = Vectors.dense(v)
p = Vectors.dense(p)
pearson = Statistics.chiSqTest(v, p)
pearson.pValue
pearson.statistic

# ================================================================================
# Ridge regression
# ================================================================================

# fit a ridge regression model
lr_ridge = LinearRegressionWithSGD.train(temp, 1000, .2, intercept=True, regType='l2')

# evaluate the model on training data
]


def parse_interaction_categorical(line):
    line_split = line.split(",")
    clean_line_split = line_split[6:41]
    attack = 1.0
    if line_split[41] == 'normal.':
        attack = 0.0
    return LabeledPoint(attack, np.array([float(x) for x in clean_line_split]))


training_data_categorical = raw_data.map(parse_interaction_categorical)

from pyspark.mllib.stat import Statistics

chi = Statistics.chiSqTest(training_data_categorical)

import pandas as pd

pd.set_option('display.max_colwidth', 30)

records = [(result.statistic, result.pValue) for result in chi]

chi_df = pd.DataFrame(data=records, index=feature_names, columns=["Statistic", "p-value"])
print(chi_df)


def parse_interaction_chi(line):
    line_split = line.split(",")
    # leave_out = [1, 2, 3, 19, 20, 41]
    clean_line_split = line_split[0:1] + line_split[4:19] + line_split[21:41]
    attack = 1.0