def to_matrix(np_array):
    ''' Convert numpy array to MLlib Matrix '''
    if len(np_array.shape) == 2:
        return Matrices.dense(np_array.shape[0], np_array.shape[1],
                              np_array.ravel())
    else:
        raise Exception("""An MLLib Matrix can only be created
                        from a two-dimensional numpy array""")
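A quick sketch (not part of the original helper) showing the layout convention involved: Matrices.dense reads its values column by column, while numpy's ravel() defaults to row-major order, so the round trip only matches when the flattening order is Fortran-style.

import numpy as np
from pyspark.mllib.linalg import Matrices

arr = np.arange(6.0).reshape(2, 3)                    # [[0. 1. 2.] [3. 4. 5.]]
m_row = Matrices.dense(2, 3, arr.ravel())             # row-major flattening
m_col = Matrices.dense(2, 3, arr.ravel(order='F'))    # column-major flattening
print(np.array_equal(m_row.toArray(), arr))  # False: values were read column by column
print(np.array_equal(m_col.toArray(), arr))  # True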
def test_ml_mllib_matrix_conversion(self):
    # to ml
    # dense
    mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
    mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
    mlDM2 = mllibDM.asML()
    self.assertEqual(mlDM2, mlDM1)
    # transposed
    mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
    mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
    mlDMt2 = mllibDMt.asML()
    self.assertEqual(mlDMt2, mlDMt1)
    # sparse
    mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mlSM2 = mllibSM.asML()
    self.assertEqual(mlSM2, mlSM1)
    # transposed
    mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mlSMt2 = mllibSMt.asML()
    self.assertEqual(mlSMt2, mlSMt1)

    # from ml
    # dense
    mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
    mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
    mllibDM2 = Matrices.fromML(mlDM)
    self.assertEqual(mllibDM1, mllibDM2)
    # transposed
    mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
    mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
    mllibDMt2 = Matrices.fromML(mlDMt)
    self.assertEqual(mllibDMt1, mllibDMt2)
    # sparse
    mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mllibSM2 = Matrices.fromML(mlSM)
    self.assertEqual(mllibSM1, mllibSM2)
    # transposed
    mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mllibSMt2 = Matrices.fromML(mlSMt)
    self.assertEqual(mllibSMt1, mllibSMt2)
def g(block):
    i, j = block[0]
    mat = block[1].toArray()
    n, m = mat.shape
    col0 = colsPerBlock.value * j
    blockCenter = cb.value if np.isscalar(cb.value) else cb.value[col0:(col0 + m)]
    blockScale = sb.value if np.isscalar(sb.value) else sb.value[col0:(col0 + m)]
    newmat = (mat - blockCenter) / blockScale
    newmat = OldMatrices.dense(n, m, newmat.ravel(order='F'))
    return ((i, j), newmat)
def CreateInputs(input_case):
    data_file = '/u/vparames/TESTS/3/test-commute-dist-' + str(input_case) + '.mat'
    inp = loadmat(data_file)
    adj_mat = inp['G']
    edge_list = inp['elist']
    n = adj_mat.shape[0]
    sm = Matrices.dense(n, n, adj_mat.transpose().flatten())
    adjacency_mat = BlockMatrix(sc.parallelize([((0, 0), sm)]),
                                SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE)
    return adjacency_mat, edge_list
def test_matrix_independence(self):
    data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
    chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
    # Results validated against R command
    # `chisq.test(rbind(c(40, 56, 31, 30), c(24, 32, 10, 15), c(29, 42, 0, 12)))`
    self.assertAlmostEqual(chi.statistic, 21.9958, 4)
    self.assertEqual(chi.degreesOfFreedom, 6)
    self.assertAlmostEqual(chi.pValue, 0.001213, 4)

    # Negative counts
    neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, neg_counts)

    # Row sum = 0.0
    row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, row_zero)

    # Column sum = 0.0
    col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, col_zero)
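For reference, a small sketch (using only the values above) showing how the column-major Matrices.dense call lays out the same contingency table as the R rbind in the comment:

from pyspark.mllib.linalg import Matrices

data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
print(Matrices.dense(3, 4, data).toArray())
# [[40. 56. 31. 30.]
#  [24. 32. 10. 15.]
#  [29. 42.  0. 12.]]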
def main():
    if len(sys.argv) < 2:
        print('USAGE: matrix_mult.py <dim of matrix>')
        return
    n = int(sys.argv[1])
    dm2 = Matrices.dense(n, n, np.random.randint(1, n * n, n * n).tolist())
    blocks1 = sc.parallelize([((0, 0), dm2)])
    m2 = BlockMatrix(blocks1, n, n)
    m3 = BlockMatrix(blocks1, n, n)
    ret = m3.multiply(m2).toIndexedRowMatrix().toRowMatrix().rows.collect()
    print('****************n:', n)
def createAdjMatToy(graphNodes, year, sparseG, blockSize, sc):
    index = (year > 12) + 1  # 12 maps to 1, 16 maps to 2
    path = 'BASE_PATH/toy_example/'
    A = loadmat(path + 'toy_A' + str(index) + '.mat')['G']
    n = A.shape[0]
    p = n // 2  # integer half-size so the slices below remain valid indices
    subMatrices = [A[:p, :p], A[:p, p:], A[p:, :p], A[p:, p:]]
    ids, blocks = [(0, 0), (0, 1), (1, 0), (1, 1)], []
    for i, id in enumerate(ids):
        adj = subMatrices[i]
        G = Matrices.dense(p, p, adj.transpose().flatten())
        blocks.append((id, G))
    blocksRdd = sc.parallelize(blocks, len(ids))
    return blocksRdd
def _colVectorToBlockMatrix(vec, rowsPerBlock, numSlices=None):
    sc = SparkContext.getOrCreate()
    remainder = len(vec) % rowsPerBlock
    if rowsPerBlock >= len(vec):
        splits = [vec]
    elif remainder == 0:
        splits = np.split(vec, len(vec) // rowsPerBlock)
    else:
        head = vec[:-remainder]
        splits = np.split(head, len(head) // rowsPerBlock)
        splits.append(vec[-remainder:])
    blocks = sc.parallelize(
        [((i, 0), OldMatrices.dense(len(split), 1, split))
         for i, split in zip(range(len(splits)), splits)],
        numSlices=numSlices)
    return BlockMatrix(blocks, rowsPerBlock, 1, len(vec), 1)
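A hedged usage sketch for the helper above (assuming the BlockMatrix and Matrices-as-OldMatrices imports the function relies on are in scope); it just checks the block layout of a short column vector:

import numpy as np

vec = np.arange(10, dtype=float)
bm = _colVectorToBlockMatrix(vec, rowsPerBlock=4)   # blocks of 4, 4 and 2 rows
print(bm.numRows(), bm.numCols())                   # 10 1
print(bm.toLocalMatrix().toArray().ravel())         # the original 10 values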
def radialBasisBlock(pairData):
    I, J = int(pairData[0][0]), int(pairData[1][0])
    dataI, dataJ = pairData[0][1], pairData[1][1]
    n = len(dataI)
    allCombinations = itertools.product(dataI, dataJ)
    allCombsEdges = [radialBasisKernel(p[0], p[1]) for p in allCombinations]
    print('allCombsEdges ', len(allCombsEdges), (n * n))
    if len(allCombsEdges) == (n * n):
        adj = np.reshape(allCombsEdges, (n, n))
    else:
        adj = np.zeros((n, n))
    if I == J:
        adj[range(n), range(n)] = 0
    G = Matrices.dense(n, n, adj.transpose().flatten())
    return ((I, J), G)
def test_measures(self, targetDimension, testMeasure):
    chisquare_result = ChiSquareResult()
    df = self._data_frame.withColumn(
        testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
    measureSummaryDict = dict(df.describe([testMeasure]).toPandas().values)
    if float(measureSummaryDict["count"]) > 10:
        maxval = float(measureSummaryDict["max"])
        minval = float(measureSummaryDict["min"])
        step = (maxval - minval) / 5.0
        splits = [
            math.floor(minval), minval + step, minval + (step * 2),
            minval + (step * 3), minval + (step * 4), math.ceil(maxval)
        ]
        bucketizer = Bucketizer(splits=splits,
                                inputCol=testMeasure,
                                outputCol="bucketedColumn")
        # bucketedData = bucketizer.transform(df)
        bucketedData = bucketizer.transform(df.na.drop(subset=testMeasure))
        pivot_table = bucketedData.stat.crosstab(
            "{}".format(targetDimension), 'bucketedColumn')
    else:
        pivot_table = df.stat.crosstab("{}".format(targetDimension), testMeasure)
    rdd = list(
        chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
    data_matrix = Matrices.dense(pivot_table.count(),
                                 len(pivot_table.columns) - 1, rdd)
    result = Statistics.chiSqTest(data_matrix)
    chisquare_result.set_params(result)
    freq_table = self._get_contingency_table_of_freq(pivot_table)
    freq_table.update_col2_names(splits)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)
    # Cramers V Calculation
    stat_value = result.statistic
    n = freq_table.get_total()
    t = min(len(freq_table.column_one_values),
            len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    chisquare_result.set_split_values([float(x) for x in splits])
    # chisquare_result.set_buckeddata(bucketedData)
    return chisquare_result
def MapperLoadBlocksFromMatFile(filename):
    logging.warn('MapperLoadBlocksFromMatFile started %s ', filename)
    data = loadmat(filename)
    logging.warn('Loaded data')
    name = re.search('(\d+_\d+).mat$', filename, re.IGNORECASE).group(1)
    G = data[name]
    id = name.split('_')
    n = G.shape[0]
    logging.warn('Before sparse conversion')
    if (not (isinstance(G, sparse.csc_matrix))):
        sub_matrix = Matrices.dense(n, n, G.transpose().flatten())
    else:
        # sub_matrix = Matrices.dense(n, n, np.array(G.todense()).transpose().flatten())
        # SPARSE
        sub_matrix = Matrices.sparse(n, n, G.indptr, G.indices, G.data)
    logging.warn('MapperLoadBlocksFromMatFile Ended')
    return ((id[0], id[1]), sub_matrix)
def create_design_mat(tuple):
    twtname = tuple[0]
    word_tfs_str = tuple[1]
    i = 0
    twt_word_ind_dict = {}
    for spl in word_tfs_str.split(inter_tweet_delim):
        word = spl.split(in_tweet_delim)[0]
        tfidf = float(spl.split(in_tweet_delim)[1].strip())
        ind = word_index_dict_bcast.value[word]
        twt_word_ind_dict[ind] = tfidf
    design_row = []
    while i < total_tweet_words_bcast.value:
        design_row.append(twt_word_ind_dict.get(i, float(0)))
        i = i + 1
    sparse_des_row = Matrices.dense(1, total_tweet_words_bcast.value,
                                    design_row).toSparse()
    # return the float design-matrix row for this tweet name
    return (twtname, sparse_des_row)
def test_dimension(self, targetDimension, testDimension):
    if not targetDimension in self._dataframe_helper.get_string_columns():
        raise BIException.non_string_column(testDimension)
    chisquare_result = ChiSquareResult()
    if self._pandas_flag:
        pivot_table = pd.crosstab([self._data_frame[targetDimension]],
                                  self._data_frame[testDimension])
        try:
            data_matrix = np.array(
                pivot_table.as_matrix(columns=None)).astype(np.int)
        except:
            data_matrix = np.array(pivot_table.values).astype(np.int)
    else:
        pivot_table = self._data_frame.stat.crosstab(
            "{}".format(targetDimension), testDimension)
        # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect()
        rdd = list(
            chain(*list(
                zip(*pivot_table.drop(pivot_table.columns[0]).collect()))))
        data_matrix = Matrices.dense(pivot_table.count(),
                                     len(pivot_table.columns) - 1, rdd)
        data_matrix = data_matrix.toArray().tolist()
    result = chi2_contingency(data_matrix)
    chisquare_result.set_params(result)
    freq_table = self._get_contingency_table_of_freq(pivot_table,
                                                     need_sorting=True)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)
    # Cramers V Calculation
    stat_value = result[0]
    n = freq_table.get_total()
    t = min(len(freq_table.column_one_values),
            len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    self._dataframe_helper.add_chisquare_significant_dimension(
        testDimension, v_value)
    return chisquare_result
def constructElectionBlock(pairDonations):
    I = int(pairDonations[0][0])
    J = int(pairDonations[1][0])
    donationsI = pairDonations[0][1]
    donationsJ = pairDonations[1][1]
    n = donationsI.shape[0]
    allCombinations = itertools.product(donationsI, donationsJ)
    allCombsEdges = [edgeDefinitionElection(p[0], p[1]) for p in allCombinations]
    if len(allCombsEdges) == (n * n):
        adj = np.reshape(allCombsEdges, (n, n))
    else:
        adj = np.zeros((n, n))
    if I == J:
        adj[range(n), range(n)] = 0
    if GENERATE_SPARSE:
        G = sparse.csc_matrix(adj)
        subMatrixSparse = Matrices.sparse(n, n, G.indptr, G.indices, G.data)
        return ((I, J), subMatrixSparse)
    else:
        G = Matrices.dense(n, n, adj.transpose().flatten())
        return ((I, J), G)
from pyspark.mllib.linalg import Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint

vector = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
spark_vector = Vectors.dense(vector)
label = 45.0
labeled_point = LabeledPoint(label, vector)
spark_matrix = Matrices.dense(3, 2, vector)
if __name__ == "__main__": sc = SparkContext(appName="HypothesisTestingExample") # $example on$ vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events # compute the goodness of fit. If a second vector to test against # is not supplied as a parameter, the test runs against a uniform distribution. goodnessOfFitTestResult = Statistics.chiSqTest(vec) # summary of the test including the p-value, degrees of freedom, # test statistic, the method used, and the null hypothesis. print("%s\n" % goodnessOfFitTestResult) mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix # conduct Pearson's independence test on the input contingency matrix independenceTestResult = Statistics.chiSqTest(mat) # summary of the test including the p-value, degrees of freedom, # test statistic, the method used, and the null hypothesis. print("%s\n" % independenceTestResult) obs = sc.parallelize( [LabeledPoint(1.0, [1.0, 0.0, 3.0]), LabeledPoint(1.0, [1.0, 2.0, 0.0]), LabeledPoint(1.0, [-1.0, 0.0, -0.5])] ) # LabeledPoint(label, feature) # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Labelled point with a positive label and a dense feature vector.
lp_pos = LabeledPoint(1.0, [5.0, 0.0, 1.0, 7.0])

# Labelled point with a negative label and a sparse feature vector.
lp_neg = LabeledPoint(0.0, SparseVector(4, [0, 2, 3], [5.0, 1.0, 7.0]))

#
# Local Matrix
#
from pyspark.mllib.linalg import Matrix, Matrices

# Dense matrix ((1.0, 3.0, 5.0), (2.0, 4.0, 6.0)): values are read column by column
dMatrix = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])

# Sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
sMatrix = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])

#
# Code Plan
#
# 1- Combine all tweet files into a single data frame
# 2- Parse the tweets: remove stopwords, extract emoticons, extract URLs,
#    normalize the words (e.g., map them to lowercase and remove punctuation and numbers)
# 3- Feature extraction
#    3a- Tokenisation
#    3b- TF-IDF
#    3c- Hash TF-IDF
# 4- Run K-Means clustering
def findFeatures(inputFileName, outputFileName): inpFile = sc.textFile(inputFileName) numRows = inpFile.count() print('\nRead ', numRows, ' rows from ', inputFileName, '\n') print('Print out a few rows read from file') print('\n', inpFile.take(5), '\n') # Rectangularize the RDD before vectorizing # Filter elements to remove quotes to prevent (quote) embedded commas countFields = inpFile.map(lambda s: removeEmbeddedCommas(s)).map( lambda s: len(s.split(','))).collect() print('number of fields in each row (first few): ', countFields[0:4]) RectangularizationNeeded = False maxCount = 0 maxCountAt = 0 for i in range(len(countFields)): if (countFields[i] > maxCount): maxCount = countFields[i] maxCountAt = i if (i > 0) and (RectangularizationNeeded == False): if (countFields[i] != countFields[i - 1]): RectangularizationNeeded = True if (RectangularizationNeeded == True): print('Identified jagged data set; Rectangularization needed') else: print('Identified rectangular data set') print('Inferring longest row(s) has ', maxCount, ' fields at row ', maxCountAt) inpFileRe = inpFile.map(lambda s: removeEmbeddedCommas(s)).map( lambda s: s + ',No Data') # remove short rows shortFile = inpFileRe.filter( lambda row: len(row.split(',')) < maxCount + 1) print("Short rows will be filtered out") print('\n', shortFile.take(10), '\n') # truncate to maxCount+1 columns inpFileTr = inpFileRe.filter( lambda row: len(row.split(',')) == maxCount + 1) print('\n', inpFileTr.take(5), '\n') header = inpFileTr.first() hL = header.split(',') inpFileNh = inpFileTr.filter(lambda row: row != header) print('Removed the First row as Header') numRows = inpFileNh.count() print('number of rows = ', numRows) from pyspark.mllib.linalg import Matrix, Matrices from pyspark.mllib.linalg import Vector, Vectors # parsedData will be org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] parsedData = inpFileNh.map( lambda s: Vectors.dense([with0Str(t) for t in s.split(',')])) print('\nprint out a few vectors after converting from strings\n') print(parsedData.take(5)) from pyspark.mllib.stat import MultivariateStatisticalSummary, Statistics summary = Statistics.colStats(parsedData) print('\nprint out summary statistics, for each column\n') print('summary.mean') print(summary.mean()) print('summary.variance') print(summary.variance()) print('summary.count') print(summary.count()) print('summary.max') print(summary.max()) print('summary.min') print(summary.min()) print('summary.normL1') print(summary.normL1()) print('summary.normL2') print(summary.normL2()) print('summary.numnonZeros') print(summary.numNonzeros()) print() numCols = len(summary.mean()) typeStrings = [' '] * numCols # infer columns where normL1, normL2, mean, variance, max and mean are 0 as non-numeric print('Inferring column data types:') import math for j in range(numCols): if ((summary.normL1()[j] == 0.0) and (summary.normL2()[j] == 0.0) and (summary.mean()[j] == 0.0) and (summary.variance()[j] == 0.0) and (summary.max()[j] == 0.0) and (summary.min()[j] == 0.0)): typeStrings[j] = 'String' else: if ((math.trunc(summary.normL1()[j]) == summary.normL1()[j]) and (math.trunc(summary.max()[j]) == summary.max()[j]) and (math.trunc(summary.min()[j]) == summary.min()[j])): typeStrings[j] = 'Int' else: typeStrings[j] = 'Float' print(typeStrings[j], end=',') print('\n\n') #****************************************************************************** # take out the 'String' columns before calling Statistics.corr() numNumericCols = 0 for j in range(numCols): if (typeStrings[j] != 
'String'): numNumericCols = numNumericCols + 1 noStrings = inpFileNh.map( lambda s: Vectors.dense(removeStrings(s, numNumericCols))) print(noStrings.take(5)) correlMatrix = Statistics.corr(noStrings, method='pearson') print('Computing Correlation Matrix on all columns') print( 'Printing out column names that have correlation coefficient > 0.5 or < -0.5' ) for i in range(numNumericCols): for j in range(i): if (((correlMatrix[i][j] >= 0.5) or (correlMatrix[i][j] <= -0.5)) and (i != j)): print(hA[i], hA[j], correlMatrix[i][j]) #****************************************************************************** #****************************************************************************** # create a contingency matrix LoLoF = [[0.0 for x in range(numNumericCols)] for y in range(numRows)] LoLoF = noStrings.collect() pdLinArr = [0.0 for x in range(numNumericCols * numRows)] for i in range(numRows): for j in range(numNumericCols): pdLinArr[i * numNumericCols + j] = abs(LoLoF[i][j]) mat = Matrices.dense(numRows, numNumericCols, pdLinArr) # conduct Pearson's independence test on the input contingency matrix print( "Computing Pearson's independence test on the input contingency matrix using chi-square test" ) independenceTestResult = Statistics.chiSqTest(mat) # summary of the test including the p-value, degrees of freedom print('%s\n' % independenceTestResult) #******************************************************************************* stdDev = [0.0] * numCols for j in range(numCols): stdDev[j] = math.sqrt(summary.variance()[j]) #******************************************************************************* # test for normal distribution using Kolmogorov-Smirnov test # colVec = [0.0] * numRows #vecRDD = sc.parallelize(colVec) #testResult = Statistics.kolmogorovSmirnovTest(vecRDD, 'norm', 0, 1) #print(testResult) numericMean = [0.0] * numNumericCols numericSD = [0.0] * numNumericCols k = 0 for j in range(numCols): if ((summary.mean()[j] != 0.0) and (summary.variance()[j] != 0.0)): numericMean[k] = summary.mean()[j] numericSD[k] = stdDev[j] k = k + 1 print( 'Checking if column data is normally distributed using Kolmogorov-Smirnov test' ) for j in range(numNumericCols): for i in range(numRows): # see https://issues.apache.org/jira/browse/SPARK-20802 # test fails if data is normally distributed # kolmogorovSmirnovTest in pyspark.mllib.stat.Statistics throws net.razorvine.pickle.PickleException # when input data is normally distributed (no error when data is not normally distributed) colVec[i] = float(i) # LoLoF[i][j] vecRDD = sc.parallelize(colVec) print(colVec[0], colVec[numRows - 1], numericMean[j], numericSD[j]) testResult = Statistics.kolmogorovSmirnovTest(vecRDD, 'norm', numericMean[j], numericSD[j]) print(testResult) #******************************************************************************* #******************************************************************************* # # estimate kernel densities # from pyspark.mllib.stat import KernelDensity # colVec = [0.0]*numRows # vecRDD = sc.parallelize(colVec) print('Computing kernel densities on all columns using a Bandwidth of 3.0') kd = KernelDensity() kd.setSample(vecRDD) kd.setBandwidth(3.0) sAS = int(math.sqrt(numRows)) # sample array size samplePoints = [0.0] * sAS #samplePoints = [0.0]*numRows for i in range(sAS): samplePoints[i] = float(i * sAS) #for i in range(numRows): # samplePoints[i] = float(i) densities = kd.estimate(samplePoints) print('Estimating kernel densities') print('Print kernel densities at sample points') #print('Print 
kernel densities > 0.01 at sample points') for j in range(numNumericCols): # print( hL[j]) for i in range(numRows): # see https://issues.apache.org/jira/browse/SPARK-20803 # KernelDensity.estimate in pyspark.mllib.stat.KernelDensity throws # net.razorvine.pickle.PickleException when input data is normally # distributed (no error when data is not normally distributed) colVec[i] = float(i) # LoLoF[i][j] vecRDD = sc.parallelize(colVec) kd = KernelDensity() kd.setSample(vecRDD) kd.setBandwidth(3.0) # Find density estimates for the given values densities = kd.estimate(samplePoints) for i in range(sAS): print(densities[i], end=',') print() #for i in range(numRows): # if (densities[i] >= 0.01): # print(i, densities[i], end=',') print() #******************************************************************************* #******************************************************************************* # # compute Skewness and Kurtosis for each numeric column # skew = [0.0] * numNumericCols kurt = [0.0] * numNumericCols term = 0.0 k = 0 for j in range(numCols): if (typeStrings[j] != 'String'): skew[k] = 0.0 kurt[k] = 0.0 # extra work: find Ints typeStrings[j] = 'Int' meanj = summary.mean()[j] for i in range(numRows): if ((typeStrings[j] == 'Int') and (math.trunc(LoLoF[i][k]) != LoLoF[i][k])): typeStrings[j] = 'Float' term = (LoLoF[i][k] - meanj) / stdDev[j] skew[k] = skew[k] + (term * term * term) kurt[k] = kurt[k] + (term * term * term * term) skew[k] = skew[k] / numRows kurt[k] = (kurt[k] / numRows) - 3.0 k = k + 1 print('Skewness of columns') k = 0 for j in range(numCols): if (typeStrings[j] == 'String'): print('Text', end=',') else: print(skew[k], end=',') k = k + 1 print() print('Kurtosis of columns') k = 0 for j in range(numCols): if (typeStrings[j] == 'String'): print('Text', end=',') else: print(kurt[k], end=',') k = k + 1 print() print('Inferring column data types (Text string, Int, Float)') # numbers that are Int and non-negative and "large" are likely to be numeric labels -- keep checking this heuristic # columns that are outside Kurtosis limits <-1.2, 3.0> may be numeric labels print('Attempting to infer if an Int column is a numeric label') print("If all Ints in a column are >= 0 and 'large', it may be numLabel") print( 'If all Ints in a column are >= 0 and excess kurtosis is outside [-1.2, 3.0], it may be numLabel' ) for j in range(numCols): if ((typeStrings[j] == 'Int') and (summary.min()[j] >= 0) and ((summary.max()[j] > 10000) or (kurt[j] < -1.2) or (kurt[j] > 3.0))): print('column ' + j + ' (' + hA[j] + ') ' + ' may be a numeric label') typeStrings[j] = 'NumLabel' #****************************************************************************** #****************************************************************************** # # Normalize the dataset by shifting by mean and scaling by stdDev # normData = [[0.0 for x in range(numNumericCols)] for y in range(numRows)] rowMaxs = [0.0] * numRows rowMins = [0.0] * numRows rowNormL1s = [0.0] * numRows rowNormL2s = [0.0] * numRows rowNumZeros = [0] * numRows means = [0.0] * numCols for j in range(numCols): means[j] = summary.mean()[j] for i in range(numRows): rowMaxs[i] = -999999.0 rowMins[i] = 999999.0 rowNumZeros[i] = 0 rowNormL1s[i] = 0.0 rowNormL2s[i] = 0.0 k = 0 for j in range(numCols): if ((typeStrings[j] == 'Int') or (typeStrings[j] == 'Float')): normData[i][k] = (LoLoF[i][k] - means[j]) / stdDev[j] if (normData[i][k] > rowMaxs[i]): rowMaxs[i] = normData[i][k] if (normData[i][k] < rowMins[i]): rowMins[i] = normData[i][k] if (normData[i][k] 
== 0.0): rowNumZeros[i] = rowNumZeros if (abs(normData[i][k]) < 100.0): rowNormL1s[i] = rowNormL1s[i] + abs(normData[i][k]) rowNormL2s[ i] = rowNormL2s[i] + normData[i][k] * normData[i][k] # print(i,j,k, LoLoF[i][k], means[j], stdDev[j], normData[i][k], rowNormL1s[i], rowNormL2s[i]) k = k + 1 input = open(inputFileName, 'r') fileHandle = open('/home/bsrsharma/work/python/rowNormL1L2.csv', 'w') # Keep upto 6 columns of identifying info if (numCols > 1): for j in range(min(5, numCols)): fileHandle.write(hL[j]) fileHandle.write(',') fileHandle.write('L1-Norm') fileHandle.write(",") fileHandle.write('L2-Norm\n') s = input.readline() # don't repeat header for i in range(numRows): # copy input to output s = input.readline() LoS = s.split(',') for j in range(min(5, numCols)): fileHandle.write(LoS[j]) fileHandle.write(',') fileHandle.write('%s' % rowNormL1s[i]) fileHandle.write(',') fileHandle.write('%s' % math.sqrt(rowNormL2s[i])) fileHandle.write('\n') fileHandle.close() input.close() print('Wrote ', 'rowNormL1L2.csv') input = open(inputFileName, 'r') fileHandle = open(outputFileName, 'w') # output normalized data numCols = numCols - 1 # write header row if (numCols > 1): for j in range(numCols - 1): fileHandle.write(hL[j]) fileHandle.write(',') fileHandle.write(hL[numCols - 1]) fileHandle.write('\n') s = input.readline() # don't repeat header for i in range(numRows): # copy input to output s = input.readline() LoS = s.split(',') k = 0 for j in range(numCols - 1): if (typeStrings[j] == 'String'): fileHandle.write(LoS[j]) else: fileHandle.write('%s' % normData[i][k]) k = k + 1 fileHandle.write(',') if (typeStrings[numCols - 1] == 'String'): fileHandle.write(LoS[numCols - 1]) else: fileHandle.write('%s' % normData[i][k]) fileHandle.write('\n') fileHandle.close() input.close() print('Wrote ', outputFileName, '\n') #****************************************************************************** # compute median for each column medians = [0.0] * numNumericCols aCol = [0.0] * numRows for j in range(numNumericCols): for i in range(numRows): aCol[i] = LoLoF[i][j] aCol.sort() medians[j] = aCol[numRows / 2] print('medians:') k = 0 for j in range(numCols): if (typeStrings[j] == 'String'): print('Text', end=',') else: print(medians[k], end=',') k = k + 1 print('\n\n') # compute histograms for each column numBins = int(math.sqrt(numRows)) histogram = [0] * (numBins + 1) binWidth = 0 mins = [0.0] * numCols maxs = [0.0] * numCols print('Computing histograms for numeric columns') print('choosing ', numBins, ' bins') k = 0 for j in range(numCols): mins[j] = summary.min()[j] maxs[j] = summary.max()[j] if (typeStrings[j] == 'String'): print('column ', j, '( ', hL[j], ' ): Text') else: binWidth = (maxs[j] - mins[j]) / numBins for i in range(numBins): histogram[i] = 0 for i in range(numRows): histogram[int((LoLoF[i][k] - mins[j]) / binWidth)] += 1 print('column ', j, '( ', hL[j], ' ):') if (typeStrings[j] == 'NumLabel'): print('NumLabel') for i in range(numBins): print(histogram[i], end=',') print() k = k + 1 print('\n\n') # compute modes modes = [0.0] * numNumericCols largestBin = 0 binIndex = 0 print('modes:') k = 0 for j in range(numCols): if (typeStrings[j] == 'String'): print('Text', end=',') else: largestBin = 0 binIndex = 0 for i in range(numBins): # pick the bin with most items if (histogram[i] > largestBin): binIndex = i modes[k] = mins[j] + (maxs[j] - mins[j]) * binIndex / numBins print(modes[k], end=',') k = k + 1 print('\n\n') return 0
def eye(n):
    m = np.eye(n, n)
    m = Matrices.dense(n, n, m.flatten().tolist())
    return m
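A tiny check (a sketch, assuming numpy and Matrices are imported as in the helper) that the result matches numpy's identity matrix:

import numpy as np

ident = eye(3)
print(np.array_equal(ident.toArray(), np.eye(3)))  # True: the identity is symmetric,
                                                   # so the column-major layout does not matter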
    from pyspark.sql import Row
    from pyspark.sql import SQLContext

    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

sc = SparkContext('local')
sqlContext = SQLContext(sc)

#############################################BASIC_DATA_TYPES###########################################################
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))
dm2 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])

print('*' * 50, 'BASIC_DATA_TYPES', '*' * 50)
print(pos)
print(neg)
print(dm2)
print(sm)

##############################################MODELS_TRAIN##############################################################
def accuracy(data):
    y = [el[1] for el in data.collect()]
    y_pred = [el[0] for el in data.collect()]
    print('Accuracy:', accuracy_score(y, y_pred=y_pred))
print(summary.variance())
print(summary.numNonzeros())
print(summary.max())
print(summary.min())
print(summary.count())
print(summary.normL1())
print(summary.normL2())

# correlation
x = sc.parallelize(np.random.randn(4, 1))
y = sc.parallelize(np.random.randn(4, 1))
print("Correlation :", str(Statistics.corr(x, y)))

# Chi-square
# For Vector
x = Vectors.dense(np.random.random_sample((5)))
y = Vectors.dense(np.random.random_sample((5)))
chisqr = Statistics.chiSqTest(x, y)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)

# For Matrices
x = Matrices.dense(4, 2, np.random.random_sample((8)))
y = Matrices.dense(4, 2, np.random.random_sample((8)))
chisqr = Statistics.chiSqTest(x, y)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)
def test_from_matrix():
    mat = Matrices.dense(1, 2, [13, 37])
    x = from_matrix(mat)
    assert x.shape == (1, 2)
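The from_matrix helper under test is not shown here; a minimal counterpart consistent with this assertion (an assumption, not necessarily the project's actual code) would simply return the underlying numpy array:

def from_matrix(matrix):
    """Convert an MLlib Matrix back to a numpy array (hypothetical sketch)."""
    return matrix.toArray()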
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Create a labeled point with a positive label and a dense feature vector.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])  # builds the label and features fields

neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))

from pyspark.mllib.linalg import Matrix, Matrices

# Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
dm2 = Matrices.dense(3, 2, [1, 3, 5, 2, 4, 6])

# Sparse matrix: dimensions, column pointers (length = numCols + 1), row indices, values
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
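To make the CSC arguments above concrete, a short sketch (reusing dm2 and sm as defined just above) that expands both matrices to dense numpy arrays:

print(dm2.toArray())
# [[1. 2.]
#  [3. 4.]
#  [5. 6.]]
print(sm.toDense().toArray())
# [[9. 0.]
#  [0. 8.]
#  [0. 6.]]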
# matrix = Matrices.dense(nrows, ncols, rdd)
print("ncol: %d, nrow %d" % (ncols, nrows))
coord_mat = CoordinateMatrix(rdd.map(tuple))
print("num rows in matrix %d" % coord_mat.numRows())
print("finished using pyspark")

#________________________________________________-
print("now use SparkSession")
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df_2 = spark.read.option("delimiter", " ").csv('./data/lpi_ceria3d_b.mtx',
                                               header=False, inferSchema=True)
df_2.printSchema()

# coord_mat_2 = CoordinateMatrix(df_2.rdd.map(tuple))
row_mat = RowMatrix(df_2.rdd.map(tuple))
print("num rows in row matrix %d, num_cols %d" % (row_mat.numRows(), row_mat.numCols()))
print("print covariance")
print(row_mat.computeCovariance())

dm = Matrices.dense(3, 1, [4, 5, 6])
print("multiply row Matrix")
result = row_mat.multiply(dm)
def denseBlock(x):
    adj = x[1].toarray()
    G_dm = Matrices.dense(block_size, block_size, adj.transpose().flatten())
    return (x[0], G_dm)
def revised_simplex(h, f, b, m, n, basis, nonbasis): k = True B = f[:, basis] B = linalg.inv(B) Pxbb = B.dot(b).flatten() counter = 0 while k: counter = counter + 1 print counter cD = h[nonbasis] cB = h[basis] # print f # print basis B = f[:, basis] #print 'Basis of A with transpose' #print B.transpose() B = linalg.inv(B) #Pxbb = B.dot(b).flatten() #print 'inverse of B' #print B D = f[:, nonbasis] bs = Matrices.dense(m, 1, b.flatten().tolist()) blocks0 = sc.parallelize([((0, 0), bs)]) mat0 = BlockMatrix(blocks0, m, 1) dm1 = Matrices.dense(m, m, f[:, basis].flatten().tolist()) # A matrix basis indices chosen blocks1 = sc.parallelize([((0, 0), dm1)]) mat1 = BlockMatrix(blocks1, m, m) mat1 = mat1.transpose() mat1.toLocalMatrix() # print mat1.toLocalMatrix() mat2 = IndexedRowMatrix(sc.parallelize(enumerate(f[:, nonbasis]))).toBlockMatrix(rowsPerBlock=m, colsPerBlock=n) # print (mat2.toLocalMatrix()) G = mat1.toLocalMatrix() # G is basis stored K = mat2.toLocalMatrix() # print (G) # It will display Basis Matrix # print (K) dm2 = Matrices.dense(m, m, B.flatten().tolist()) # Inverse stored in dm2 blocks2 = sc.parallelize([((0, 0), dm2)]) # Inverse B converted to blocks mat3 = BlockMatrix(blocks2, m, m) mat3 = mat3.transpose() L = mat3.toLocalMatrix() dm3 = Matrices.dense(1, m, h[basis].tolist()) # Cost vector C, basis stored in dm3 blocks4 = sc.parallelize([((0, 0), dm3)]) # 'c' basis is stored in blocks4 mat4 = BlockMatrix(blocks4, 1, m) # 'c' stored as BlockMatrix S = mat4.toLocalMatrix() # print (S) dm4 = Matrices.dense(1, n, h[nonbasis].tolist()) # Cost vector C, non-basis stored in dm5 blocks5 = sc.parallelize([((0, 0), dm4)]) # 'c' non-basis is stored in blocks5 mat6 = BlockMatrix(blocks5, 1, n) # 'c' stored as BlockMatrix R = mat6.toLocalMatrix() # print (R) La = mat4.multiply(mat3).toLocalMatrix() # c is basis matrix, multiply by matrix B inverse. In main program it is "l = cB.dot(B)" # print (La) blocks6 = sc.parallelize([((0, 0), La)]) # this step is done to store La in mat variable so that it would be easy to use it for further multiplication mat7 = BlockMatrix(blocks6, 1, m) # from main program "l = cB.dot(B)" is stored in "mat 7" for future multiplication Sa = mat7.toLocalMatrix() # print (Sa) ga = mat7.multiply( mat2).toLocalMatrix() # multiply "l = cB.dot(B)" by 'D' where 'D' is Matrix A's non basis. 
Here 'mat3' # print (ga) blocks7 = sc.parallelize([((0, 0), ga)]) # this step is done to store 'ga' in mat8 mat8 = BlockMatrix(blocks7, 1, n) Cd = mat6.subtract(mat8).toLocalMatrix() #print 'mat6=' #print mat6.toLocalMatrix() #print 'mat7' #print mat7.toLocalMatrix() #print 'mat2' #print mat2.transpose().toLocalMatrix() #print 'mat4' #print mat4.toLocalMatrix() #print 'mat3' #print mat3.toLocalMatrix() #print 'mat8=' #print mat8.toLocalMatrix() ma = Cd.toArray() # maa = np.around(ma, decimals= 10) print 'ma =' print ma # print "printing Cd" minrD = np.argmin(ma) #print 'minimum index of maa is' print (minrD) do = minrD # We get value 0 Dxx = D[:, do] Dx = Matrices.dense(m, 1, Dxx.tolist()) # the index of minimum of minrD is used to call matrix D's elements which we will parallelize blocks8 = sc.parallelize([((0, 0), Dx)]) # store Dx it in blocks8 mat9 = BlockMatrix(blocks8, m, 1) # Convert to blockmatrix and store in mat9 Aa = mat9.toLocalMatrix() Pa = mat3.multiply( mat9).toLocalMatrix() # Inverse of B multiply by Dx( where Dx = D[:, n] where D = A[:, nonbasis] Pxb = mat3.multiply(mat0).toLocalMatrix() #print (Pa) #print (Pxb) Paa = B.dot(Dxx) # Pxbb = B.dot(b) # Paaa = np.around(Paa, decimals= 16) # Pxbbb = np.around(Pxbb, decimals=16) print 'This is Paa' print Paa # abc = np.divide(Pxbb, Paa) # print (abc) # with np.errstate(divide='ignore'): abc = inf * np.ones(len(Pxbb)) abcd = inf * np.ones(len(Pxbb)) # print 'len(Paa) is' # print len(Paa) - 1 for idx in range(0, len(Paa)): # print idx if Paa[idx] > 1e-12: abc[idx] = Pxbb[idx] / Paa[idx] print 'this is Pxbb before update ' print Pxbb Qa = np.argmin(abc) #Qa = np.argmin(abc[np.nonzero(abc)]) Pxbb = Pxbb - np.multiply(np.amin(abc), Paa).transpose() print np.multiply(np.amin(abc), Paa) Pxbb[Qa] = np.amin(abc) #for idx in range(0, len(Paa)): #if Paa[idx] > 0: #abcd[idx] = Pxbb[idx] / Paa[idx] print 'this is Paa after update' print Paa print 'this is Pxbb after updating' print Pxbb print 'abc with updated Pxbb' print abc #Qc = np.argmin(abcd[np.nonzero(abcd)]) #print 'do = The leaving variable index' #print do #print 'np.argmin(abc) is the entering variable index' #print Qa #print 'printing nonbasis do' #print nonbasis[do] object = h[basis] print 'printing Qa=' print Qa final = basis k = np.any(ma < -0.00000000001) if k == False: break temp = basis[Qa] basis[Qa] = nonbasis[do] nonbasis[do] = temp #print 'Cd =' #print (Cd) print 'nonbasis =' print nonbasis print 'basis =' print basis # print shape(basis) #ma = Cd.toArray() #print 'ma =' #print ma # print k # print 'Pxbb =' #print type(Pxbb) zzz = np.inner(h[basis], Pxbb) solution = [zzz, basis, Pxbb] return solution
if __name__ == "__main__": global rows global mat rows = 3 sc = SparkContext("local", "Determinant") # accumulator variable to accumulate final determinant value det = sc.accumulator(0) # dense matrix returns matrix in column major format hence # the entered values itself is fiven in column major so that # we can finally have a row major matrix to operate on dm2 = Matrices.dense(rows, rows, [2, 7, 3, 3, 7, 8, 5, 8, 5]) print "\n\nEntered matrix:\n", dm2.toArray() #here we are trying to divide work between workers. we divide first row # between them (calculate partial determinant for each item in first row) cols = sc.parallelize([i for i in range(0, rows)]) mat = dm2.toArray() cols.foreach(dist_deter) # print "determinant", determinant(3,dm2) #to check correctness print "\n The determinant is:", det.value
fractions = {1: 0.1, 2: 0.6, 3: 0.3}
approxSample = data.sampleByKey(False, fractions)

## hypothesis testing
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
# compute goodness of fit. either compare two vectors to each other
# or compare one vector to a uniform distribution
goodnessOfFitTestResults = Statistics.chiSqTest(vec)
print(goodnessOfFitTestResults)

# pearson's independence test on a matrix
mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])
independenceTestResults = Statistics.chiSqTest(mat)
print(independenceTestResults)

# a contingency table can be constructed from an RDD of LabeledPoint/vector pairs. The resulting test returns
# a Chi-squared test results for every feature against the label
obs = sc.parallelize([
    LabeledPoint(1.0, [1.0, 0.0, 3.0]),
    LabeledPoint(1.0, [1.0, 2.0, 0.0]),
    LabeledPoint(1.0, [-1.0, 0.0, -0.5])
])
featureTestResults = Statistics.chiSqTest(obs)

for i, result in enumerate(featureTestResults):
    print('column {0}: \n {1}'.format(i, result))
def npToDenseMat(ndArr):
    m, n = ndArr.shape
    return Matrices.dense(m, n, ndArr.transpose().flatten())
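A minimal round-trip check for the converter above (a sketch, assuming numpy is imported as np): because the input is flattened in column-major order via transpose().flatten(), toArray() reproduces the original array exactly.

a = np.arange(6.0).reshape(2, 3)
print(np.array_equal(npToDenseMat(a).toArray(), a))  # True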
def test_measures(self, targetDimension, testMeasure):
    chisquare_result = ChiSquareResult()
    if self._pandas_flag:
        if self._data_frame[testMeasure].dtypes == 'int64':
            measureSummaryDict = dict(
                self._data_frame[testMeasure].describe())
            if float(measureSummaryDict["count"]) > 10:
                maxval = int(measureSummaryDict["max"])
                minval = int(measureSummaryDict["min"])
                step = (maxval - minval) / 5.0
                splits = [
                    round(math.floor(minval)),
                    round(minval + step),
                    round(minval + (step * 2)),
                    round(minval + (step * 3)),
                    round(minval + (step * 4)),
                    round(math.ceil(maxval))
                ]
                splits = list(set(splits))
                splits.sort()
                self._data_frame['bucketedColumn'] = pd.cut(
                    self._data_frame[testMeasure],
                    bins=splits,
                    labels=list(range(len(splits) - 1)),
                    retbins=True,
                    right=False)[0]
                self._data_frame = self._data_frame.dropna()
                pivot_table = pd.crosstab(
                    [self._data_frame[targetDimension]],
                    self._data_frame['bucketedColumn'])
            else:
                pivot_table = pd.crosstab(
                    [self._data_frame[targetDimension]],
                    self._data_frame[testMeasure])
        else:
            df = self._data_frame
            if [df[testMeasure].dtypes == 'float64']:
                measureSummaryDict = dict(df[testMeasure].describe())
                if float(measureSummaryDict["count"]) > 10:
                    maxval = float(measureSummaryDict["max"])
                    minval = float(measureSummaryDict["min"])
                    step = (maxval - minval) / 5.0
                    splits = [
                        math.floor(minval), minval + step,
                        minval + (step * 2), minval + (step * 3),
                        minval + (step * 4), math.ceil(maxval)
                    ]
                    df['bucketedColumn'] = pd.cut(
                        df[testMeasure],
                        bins=splits,
                        labels=list(range(len(splits) - 1)),
                        retbins=True,
                        right=False)[0]
                    df = df.dropna()
                    pivot_table = pd.crosstab([df[targetDimension]],
                                              df['bucketedColumn'])
                else:
                    pivot_table = pd.crosstab([df[targetDimension]],
                                              df[testMeasure])
    else:
        dtype = self._data_frame.schema[testMeasure].dataType
        if dtype is IntegerType():
            # df = self._data_frame.withColumn(testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
            measureSummaryDict = dict(
                self._data_frame.describe([testMeasure]).toPandas().values)
            if float(measureSummaryDict["count"]) > 10:
                maxval = int(measureSummaryDict["max"])
                minval = int(measureSummaryDict["min"])
                step = (maxval - minval) / 5.0
                splits = [
                    round(math.floor(minval)),
                    round(minval + step),
                    round(minval + (step * 2)),
                    round(minval + (step * 3)),
                    round(minval + (step * 4)),
                    round(math.ceil(maxval))
                ]
                splits = list(set(splits))
                splits.sort()
                bucketizer = Bucketizer(splits=splits,
                                        inputCol=testMeasure,
                                        outputCol="bucketedColumn")
                # bucketedData = bucketizer.transform(df)
                bucketedData = bucketizer.transform(
                    self._data_frame.na.drop(subset=testMeasure))
                pivot_table = bucketedData.stat.crosstab(
                    "{}".format(targetDimension), 'bucketedColumn')
                keshav = pivot_table.toPandas()
            else:
                pivot_table = self._data_frame.stat.crosstab(
                    "{}".format(targetDimension), testMeasure)
        else:
            df = self._data_frame.withColumn(
                testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
            measureSummaryDict = dict(
                df.describe([testMeasure]).toPandas().values)
            if float(measureSummaryDict["count"]) > 10:
                maxval = float(measureSummaryDict["max"])
                minval = float(measureSummaryDict["min"])
                step = (maxval - minval) / 5.0
                splits = [
                    math.floor(minval), minval + step, minval + (step * 2),
                    minval + (step * 3), minval + (step * 4),
                    math.ceil(maxval)
                ]
                bucketizer = Bucketizer(splits=splits,
                                        inputCol=testMeasure,
                                        outputCol="bucketedColumn")
                # bucketedData = bucketizer.transform(df)
                bucketedData = bucketizer.transform(
                    df.na.drop(subset=testMeasure))
                pivot_table = bucketedData.stat.crosstab(
                    "{}".format(targetDimension), 'bucketedColumn')
            else:
                pivot_table = df.stat.crosstab(
                    "{}".format(targetDimension), testMeasure)
    if self._pandas_flag:
        try:
            data_matrix = np.array(
                pivot_table.as_matrix(columns=None)).astype(np.int)
        except:
            data_matrix = np.array(pivot_table.values).astype(np.int)
    else:
        rdd = list(
            chain(*list(
                zip(*pivot_table.drop(pivot_table.columns[0]).collect()))))
        data_matrix = Matrices.dense(pivot_table.count(),
                                     len(pivot_table.columns) - 1, rdd)
        data_matrix = data_matrix.toArray().tolist()
    result = chi2_contingency(data_matrix)
    chisquare_result.set_params(result)
    freq_table = self._get_contingency_table_of_freq(pivot_table)
    freq_table.update_col2_names(splits)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)
    # Cramers V Calculation
    stat_value = result[0]
    n = freq_table.get_total()
    t = min(len(freq_table.column_one_values),
            len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    chisquare_result.set_split_values([float(x) for x in splits])
    # chisquare_result.set_buckeddata(bucketedData)
    return chisquare_result
if __name__ == "__main__": sc = SparkContext(appName="HypothesisTestingExample") # $example on$ vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events # compute the goodness of fit. If a second vector to test against # is not supplied as a parameter, the test runs against a uniform distribution. goodnessOfFitTestResult = Statistics.chiSqTest(vec) # summary of the test including the p-value, degrees of freedom, # test statistic, the method used, and the null hypothesis. print("%s\n" % goodnessOfFitTestResult) mat = Matrices.dense( 3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix # conduct Pearson's independence test on the input contingency matrix independenceTestResult = Statistics.chiSqTest(mat) # summary of the test including the p-value, degrees of freedom, # test statistic, the method used, and the null hypothesis. print("%s\n" % independenceTestResult) obs = sc.parallelize([ LabeledPoint(1.0, [1.0, 0.0, 3.0]), LabeledPoint(1.0, [1.0, 2.0, 0.0]), LabeledPoint(1.0, [-1.0, 0.0, -0.5]) ]) # LabeledPoint(label, feature) # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

sc = SparkContext("local", "Rubbish")

"""
# RDD of Vectors
data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
                       Vectors.dense([4, 5, 0, 3]),
                       Vectors.dense([6, 7, 0, 8])])
"""

# Sample vector composed of frequencies of events
vect = Vectors.dense([4, 5, 0, 3])

# Summary of the test including the p-value and degrees of freedom
goodnessOfFitTestResult = Statistics.chiSqTest(vect)

sampleData = [
    40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0
]
matrix = Matrices.dense(3, 4, sampleData)

# Conduct Pearson's independence test on the input contingency matrix
independenceTestResult = Statistics.chiSqTest(matrix)

# Test statistic, the method used, and the null hypothesis.
print("SINGLE VECTOR FIT: ")
print(goodnessOfFitTestResult)

## Summary of the test including the p-value and degrees of freedom.
print("INDEPENDENCE TEST RESULT: ")
print(independenceTestResult)
from __future__ import print_function

#Section 7.2.1
from pyspark.mllib.linalg import Vectors, Vector
dv1 = Vectors.dense(5.0, 6.0, 7.0, 8.0)
dv2 = Vectors.dense([5.0, 6.0, 7.0, 8.0])
sv = Vectors.sparse(4, [0, 1, 2, 3], [5.0, 6.0, 7.0, 8.0])
dv2[2]
dv1.size
dv2.toArray()

from pyspark.mllib.linalg import Matrices
dm = Matrices.dense(2, 3, [5.0, 0.0, 0.0, 3.0, 1.0, 4.0])
sm = Matrices.sparse(2, 3, [0, 1, 2, 4], [0, 1, 0, 1], [5.0, 3.0, 1.0, 4.0])
sm.toDense()
dm.toSparse()
dm[1, 1]

#Section 7.2.2
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow
rmind = IndexedRowMatrix(rm.rows.zipWithIndex().map(lambda x: IndexedRow(x[1], x[0])))

#Section 7.4
housingLines = sc.textFile("first-edition/ch07/housing.data", 6)
housingVals = housingLines.map(lambda x: Vectors.dense([float(v.strip()) for v in x.split(",")]))

#Section 7.4.1
from pyspark.mllib.linalg.distributed import RowMatrix
housingMat = RowMatrix(housingVals)

from pyspark.mllib.stat._statistics import Statistics
# coding=utf-8
from pyspark.mllib.linalg import Vectors, SparseVector, Matrix, Matrices

# Local matrices
dm = Matrices.dense(2, 2, [2, 3, 4, 5])
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
print('dm')
print(dm.toArray())
print('sm')
print(sm.toDense())
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

sc = SparkContext("local", "Rubbish")

"""
# RDD of Vectors
data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
                       Vectors.dense([4, 5, 0, 3]),
                       Vectors.dense([6, 7, 0, 8])])
"""

# Sample vector composed of frequencies of events
vect = Vectors.dense([4, 5, 0, 3])

# Summary of the test including the p-value and degrees of freedom
goodnessOfFitTestResult = Statistics.chiSqTest(vect)

sampleData = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0,
              31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
matrix = Matrices.dense(3, 4, sampleData)

# Conduct Pearson's independence test on the input contingency matrix
independenceTestResult = Statistics.chiSqTest(matrix)

# Test statistic, the method used, and the null hypothesis.
print("SINGLE VECTOR FIT: ")
print(goodnessOfFitTestResult)

## Summary of the test including the p-value and degrees of freedom.
print("INDEPENDENCE TEST RESULT: ")
print(independenceTestResult)
model_test_dfi = pipeline.fit(test_dfi)
result_test_dfi = model.transform(test_dfi)

# cv = CountVectorizer(inputCol="stopRemove", outputCol="features")
# model = cv.fit(dataset)
# result = model.transform(dataset)
# result.show(truncate=False)

array = [0] * 20
print(array)

mvv = result.select("movie_name").rdd.flatMap(lambda x: x).collect()
print(mvv)

from pyspark.mllib.linalg import Matrix, Matrices

# Create dense matrices; the values are read in column-major order
dm2 = Matrices.dense(3, 4, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
dm3 = Matrices.dense(1, 12, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
print(dm2)
print(dm3)
# dm4 = dm2 + dm3
# print(dm4)

from pyspark.sql.functions import col

mapping = {}
# train_data, test_data = result.randomSplit([1.0, 0.0], seed=100)
train_data = result
train_data.printSchema()

# df_new = mapping_df.rename(columns={'_c0': 'A'})
mapping_df1 = mapping_df.select(col("_c0").alias("indexes"), col("0").alias("genres"))
mapping_df1.show()
# print(range(mapping_df1.collect()))