def test_matrix_independence(self):
    data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
    chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
    # Results validated against R command
    # `chisq.test(rbind(c(40, 56, 31, 30), c(24, 32, 10, 15), c(29, 42, 0, 12)))`
    self.assertAlmostEqual(chi.statistic, 21.9958, 4)
    self.assertEqual(chi.degreesOfFreedom, 6)
    self.assertAlmostEqual(chi.pValue, 0.001213, 4)

    # Negative counts
    neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])
    self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, neg_counts)

    # Row sum = 0.0
    row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])
    self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, row_zero)

    # Column sum = 0.0
    col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])
    self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, col_zero)
def loadBlockFromMatFile(filename):
    data = loadmat(filename, squeeze_me=True)
    id, G = data['block_id'], data['G']
    p = G.shape[0]  # block size; inferred here since `p` was otherwise undefined
    if isinstance(G, sparse.csc_matrix):
        sub_matrix = Matrices.sparse(p, p, G.indptr, G.indices, G.data)
    else:
        sub_matrix = Matrices.dense(p, p, G.transpose().flatten())
    return ((id[0], id[1]), sub_matrix)
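# A hedged driver sketch for loadBlockFromMatFile (assumed names: an active
# SparkContext `sc`, a list `filenames` of block .mat paths, and a block
# size `p` matching the stored blocks; none of these appear in the original).
from pyspark.mllib.linalg.distributed import BlockMatrix

blocks = sc.parallelize(filenames).map(loadBlockFromMatFile)
A = BlockMatrix(blocks, p, p)  # the (block_id) pairs index the block grid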
def test_computeRowSums(self):
    dm1 = OldMatrices.dense(3, 2, [1, 2, 3, 4, 5, 6])
    dm2 = OldMatrices.dense(3, 2, [7, 8, 9, 10, 11, 12])
    dm3 = OldMatrices.dense(3, 2, [13, 14, 15, 16, 17, 18])
    dm4 = OldMatrices.dense(3, 2, [19, 20, 21, 22, 23, 24])
    blocks = self.sc.parallelize([((0, 0), dm1), ((0, 1), dm2),
                                  ((1, 0), dm3), ((1, 1), dm4)])
    mat = BlockMatrix(blocks, 3, 2)
    rowSums = sparkle.util._computeRowSums(mat)
    self.assertTrue(np.all(rowSums == [48, 66, 84, 102]))
def g(block):
    # Elementwise soft-thresholding of one block:
    # sign(x) * max(0, |x| - rho), with `rho` captured from the enclosing scope.
    blockArr = block[1].toArray().ravel()
    newmat = OldMatrices.dense(
        block[1].numRows, block[1].numCols,
        np.sign(blockArr) * np.maximum(0, np.abs(blockArr) - rho))
    return (block[0], newmat)
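# A hedged sketch of applying g across a distributed matrix (assumes a
# BlockMatrix `mat` and a threshold `rho` are already in scope; both are
# stand-ins, not part of the original snippet). Soft-thresholding is the
# proximal operator of rho * ||x||_1, the shrinkage step in e.g. ADMM/ISTA.
from pyspark.mllib.linalg.distributed import BlockMatrix

shrunk = BlockMatrix(mat.blocks.map(g), mat.rowsPerBlock, mat.colsPerBlock)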
def test_dimension(self, targetDimension, testDimension):
    if targetDimension not in self._dataframe_helper.get_string_columns():
        raise BIException.non_string_column(testDimension)
    chisquare_result = ChiSquareResult()
    pivot_table = self._data_frame.stat.crosstab(
        "{}".format(targetDimension), testDimension)
    # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect()
    rdd = list(
        chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
    data_matrix = Matrices.dense(pivot_table.count(),
                                 len(pivot_table.columns) - 1, rdd)
    result = Statistics.chiSqTest(data_matrix)
    chisquare_result.set_params(result)
    freq_table = self._get_contingency_table_of_freq(pivot_table,
                                                     need_sorting=True)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)
    # Cramer's V calculation
    stat_value = result.statistic
    n = freq_table.get_total()
    t = min(len(freq_table.column_one_values),
            len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    self._dataframe_helper.add_chisquare_significant_dimension(
        testDimension, v_value)
    return chisquare_result
def mult(A, B):
    # ------- LOG
    logging.warning("Multiplication started")
    logging.warning("A part count: %s", A.blocks.getNumPartitions())
    logging.warning("B part count: %s", B.blocks.getNumPartitions())
    # ------- LOG

    # If dense, just call the built-in function.
    if (isinstance(A.blocks.first()[1], DenseMatrix)
            or isinstance(B.blocks.first()[1], DenseMatrix)):
        return A.multiply(B)

    # Sparse? Then continue the madness.
    N = A.numRows()
    p = SQUARE_BLOCK_SIZE
    num_blocks = N // p  # integer division; N is a multiple of the block size
    aleft = A.blocks.flatMap(lambda x: affectLeft(x, num_blocks))
    bright = B.blocks.flatMap(lambda x: affectRight(x, num_blocks))
    both = aleft.union(bright)
    indi = both.reduceByKey(lambda a, b: prod(a, b))
    mapped = indi.map(lambda x: ((x[0][0], x[0][2]), x[1]))  # renamed to avoid shadowing builtin `map`
    pr = mapped.reduceByKey(add)
    brd = pr.map(lambda x: ((x[0][0], x[0][1]),
                            Matrices.sparse(p, p, x[1].indptr,
                                            x[1].indices, x[1].data)))
    C = BlockMatrix(brd, p, p, N, N)
    return C
def dist_deter(i):
    # Cofactor-expansion term for column i, accumulated into the global `det`.
    global mat
    global rows
    global det
    minor = []
    # Build the minor that drops row 0 and column i, in column-major order
    # (the order Matrices.dense expects).
    for j in range(0, rows):
        for k in range(1, rows):
            if j != i:
                minor.append(mat[k][j])
    minor = Matrices.dense(rows - 1, rows - 1, minor)
    term = mat[0][i] * determinant(rows - 1, minor)
    if i % 2 == 0:
        det += term
    else:
        det -= term
def determinant(rows, mat):
    # Recursive cofactor (Laplace) expansion along the first row.
    if rows <= 0:
        raise ValueError("invalid matrix size")
    if rows == 1:
        return mat[0]
    if rows == 2:
        mat = mat.toArray()
        return mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1]
    total = 0
    mat = mat.toArray()
    for i in range(0, rows):
        # Minor that drops row 0 and column i, in column-major order.
        minor = []
        for j in range(0, rows):
            for k in range(1, rows):
                if j != i:
                    minor.append(mat[k][j])
        minor = Matrices.dense(rows - 1, rows - 1, minor)
        if i % 2 == 0:
            total += mat[0][i] * determinant(rows - 1, minor)
        else:
            total -= mat[0][i] * determinant(rows - 1, minor)
    return total
def to_matrix(np_array):
    '''Convert numpy array to MLlib Matrix'''
    if len(np_array.shape) == 2:
        # Note: DenseMatrix stores values column-major, while ravel() on a
        # C-ordered array is row-major, so this effectively stores the
        # transpose; use ravel(order='F') if the exact layout matters.
        return Matrices.dense(np_array.shape[0], np_array.shape[1],
                              np_array.ravel())
    else:
        raise Exception("""An MLLib Matrix can only be created
                        from a two-dimensional numpy array""")
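# A minimal usage sketch for to_matrix (illustrative only; assumes numpy
# and pyspark are importable). It also demonstrates the column-major
# caveat noted above: toArray() re-reads the values column by column.
import numpy as np

arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
m = to_matrix(arr)
print(m.numRows, m.numCols)  # 2 3
print(m.toArray())           # [[1. 3. 5.] [2. 4. 6.]] -- not arr itself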
def MapperLoadBlocksFromMatFile(filename):
    logging.warning('MapperLoadBlocksFromMatFile started %s', filename)
    data = loadmat(filename)
    logging.warning('Loaded data')
    # Raw string and escaped dot so the regex matches literal ".mat".
    name = re.search(r'(\d+_\d+)\.mat$', filename, re.IGNORECASE).group(1)
    G = data[name]
    id = name.split('_')
    n = G.shape[0]
    logging.warning('Before sparse conversion')
    if not isinstance(G, sparse.csc_matrix):
        sub_matrix = Matrices.dense(n, n, G.transpose().flatten())
    else:
        # sub_matrix = Matrices.dense(n, n, np.array(G.todense()).transpose().flatten())
        # SPARSE
        sub_matrix = Matrices.sparse(n, n, G.indptr, G.indices, G.data)
    logging.warning('MapperLoadBlocksFromMatFile Ended')
    return ((id[0], id[1]), sub_matrix)
def difun(x, vect):
    if x[0] == x[1]:
        # Diagonal block: a p x p sparse diagonal holding this block's slice of vect.
        sm = SparseMatrix(p, p,
                          np.linspace(0, p, num=(p + 1)),
                          np.linspace(0, p - 1, num=p),
                          vect[(x[0] * p):((x[0] + 1) * p)])
        return (x, sm)
    else:
        # Off-diagonal block: all zeros.
        h = sparse.csc_matrix((p, p))
        return (x, Matrices.sparse(p, p, h.indptr, h.indices, h.data))
def to_matrix(np_array):
    if len(np_array.shape) == 2:
        return Matrices.dense(np_array.shape[0], np_array.shape[1],
                              np_array.ravel())
    else:
        raise Exception(
            'An MLLib Matrix can only be created from a two-dimensional '
            'numpy array')
def normalizeLaplacian(block, d1):
    I, J = block[0]
    mat, p = block[1].toArray(), SQUARE_BLOCK_SIZE
    L = np.zeros((p, p))
    for i in range(p):
        for j in range(p):
            L[i, j] = mat[i, j] * d1[I * p + i] * d1[J * p + j]
    normalizedL = Matrices.dense(p, p, L.transpose().flatten())
    return (block[0], normalizedL)
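# A hedged driver for normalizeLaplacian (assumed names: a BlockMatrix `L`
# of the unnormalized Laplacian and a degree vector `deg`; neither appears
# in the original). d1 is the usual D^{-1/2} scaling, which yields the
# symmetric normalization D^{-1/2} L D^{-1/2}.
import numpy as np
from pyspark.mllib.linalg.distributed import BlockMatrix

d1 = 1.0 / np.sqrt(deg)
normalized = BlockMatrix(L.blocks.map(lambda b: normalizeLaplacian(b, d1)),
                         SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE)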
def to_matrix(np_array): """Convert numpy array to MLlib Matrix """ if len(np_array.shape) == 2: return Matrices.dense(np_array.shape[0], np_array.shape[1], np_array.ravel()) else: raise Exception( "An MLLib Matrix can only be created from a two-dimensional " + "numpy array, got {}".format(len(np_array.shape)))
def main():
    if len(sys.argv) < 2:
        print('USAGE: matrix_mult.py <dim of matrix>')
        return
    n = int(sys.argv[1])
    dm2 = Matrices.dense(n, n, np.random.randint(1, n * n, n * n).tolist())
    blocks1 = sc.parallelize([((0, 0), dm2)])
    m2 = BlockMatrix(blocks1, n, n)
    m3 = BlockMatrix(blocks1, n, n)
    ret = m3.multiply(m2).toIndexedRowMatrix().toRowMatrix().rows.collect()
    print('****************n:', n)
def convertMahoutToSparkMatrix(mahoutMatrix):
    """
    For compatible use
    :param mahoutMatrix:
    :return:
    """
    rows, cols = mahoutMatrix.shape
    # Remember to take the transpose, since DenseMatrix is column-major.
    ret = Matrices.dense(rows, cols,
                         mahoutMatrix.transpose().flatten().tolist())
    return ret
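# Usage sketch (a plain numpy array stands in for a Mahout matrix here;
# illustrative only). The transpose inside the helper compensates for
# DenseMatrix being column-major, so the round-trip preserves the layout.
import numpy as np

m = np.arange(6.0).reshape(2, 3)
spark_m = convertMahoutToSparkMatrix(m)
assert np.array_equal(spark_m.toArray(), m)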
def g(block):
    # Center and scale one block, slicing the broadcast center/scale
    # vectors down to this block's column range.
    i, j = block[0]
    mat = block[1].toArray()
    n, m = mat.shape
    col0 = colsPerBlock.value * j
    blockCenter = cb.value if np.isscalar(cb.value) else cb.value[col0:(col0 + m)]
    blockScale = sb.value if np.isscalar(sb.value) else sb.value[col0:(col0 + m)]
    newmat = (mat - blockCenter) / blockScale
    newmat = OldMatrices.dense(n, m, newmat.ravel(order='F'))
    return ((i, j), newmat)
def test_matrix_independence(self):
    data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
    chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
    # Results validated against R command
    # `chisq.test(rbind(c(40, 56, 31, 30), c(24, 32, 10, 15), c(29, 42, 0, 12)))`
    self.assertAlmostEqual(chi.statistic, 21.9958, 4)
    self.assertEqual(chi.degreesOfFreedom, 6)
    self.assertAlmostEqual(chi.pValue, 0.001213, 4)

    # Negative counts
    neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, neg_counts)

    # Row sum = 0.0
    row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, row_zero)

    # Column sum = 0.0
    col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, col_zero)
def eucledianDistances(ZblockPair):
    I, J = int(ZblockPair[0][0]), int(ZblockPair[1][0])
    blockI, blockJ = ZblockPair[0][1], ZblockPair[1][1]
    allCombinations = itertools.product(blockI, blockJ)
    allCombsEdges = [np.linalg.norm(p[0] - p[1]) for p in allCombinations]
    n = blockI.shape[0]
    if len(allCombsEdges) == (n * n):
        adj = np.reshape(allCombsEdges, (n, n))
    else:
        adj = np.zeros((n, n))
    G = Matrices.dense(n, n, adj.transpose().flatten())
    return ((I, J), G)
def CreateInputs(input_case):
    data_file = ('/u/vparames/TESTS/3/test-commute-dist-'
                 + str(input_case) + '.mat')
    inp = loadmat(data_file)
    adj_mat = inp['G']
    edge_list = inp['elist']
    n = adj_mat.shape[0]
    sm = Matrices.dense(n, n, adj_mat.transpose().flatten())
    adjacency_mat = BlockMatrix(sc.parallelize([((0, 0), sm)]),
                                SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE)
    return adjacency_mat, edge_list
def difun(x, vect):
    if x[0] == x[1]:
        sm = SparseMatrix(
            SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE,
            np.linspace(0, SQUARE_BLOCK_SIZE, num=(SQUARE_BLOCK_SIZE + 1)),
            np.linspace(0, SQUARE_BLOCK_SIZE - 1, num=SQUARE_BLOCK_SIZE),
            vect[(x[0] * SQUARE_BLOCK_SIZE):((x[0] + 1) * SQUARE_BLOCK_SIZE)])
        return (x, sm)
    else:
        h = sparse.csc_matrix((SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE))
        return (x, Matrices.sparse(SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE,
                                   h.indptr, h.indices, h.data))
def difun(self, x, vect):
    squareBlockSize = copy.deepcopy(self.squareBlockSize)
    if x[0] == x[1]:
        sm = SparseMatrix(
            squareBlockSize, squareBlockSize,
            np.linspace(0, squareBlockSize, num=(squareBlockSize + 1)),
            np.linspace(0, squareBlockSize - 1, num=squareBlockSize),
            vect[(x[0] * squareBlockSize):((x[0] + 1) * squareBlockSize)])
        return (x, sm)
    else:
        h = sparse.csc_matrix((squareBlockSize, squareBlockSize))
        return (x, Matrices.sparse(squareBlockSize, squareBlockSize,
                                   h.indptr, h.indices, h.data))
def constructElectionBlock(pairDonations):
    I = int(pairDonations[0][0])
    J = int(pairDonations[1][0])
    donationsI = pairDonations[0][1]
    donationsJ = pairDonations[1][1]
    n = donationsI.shape[0]
    allCombinations = itertools.product(donationsI, donationsJ)
    allCombsEdges = [edgeDefinitionElection(p[0], p[1])
                     for p in allCombinations]
    if len(allCombsEdges) == (n * n):
        adj = np.reshape(allCombsEdges, (n, n))
    else:
        adj = np.zeros((n, n))
    if I == J:
        adj[range(n), range(n)] = 0  # zero out the diagonal of diagonal blocks
    if GENERATE_SPARSE:
        G = sparse.csc_matrix(adj)
        subMatrixSparse = Matrices.sparse(n, n, G.indptr, G.indices, G.data)
        return ((I, J), subMatrixSparse)
    else:
        G = Matrices.dense(n, n, adj.transpose().flatten())
        return ((I, J), G)
def createAdjMatToy(graphNodes, year, sparseG, blockSize, sc):
    index = (year > 12) + 1  # 12 maps to 1, 16 maps to 2
    path = 'BASE_PATH/toy_example/'
    A = loadmat(path + 'toy_A' + str(index) + '.mat')['G']
    n = A.shape[0]
    p = n // 2  # integer division so the slice indices stay ints
    subMatrices = [A[:p, :p], A[:p, p:], A[p:, :p], A[p:, p:]]
    ids, blocks = [(0, 0), (0, 1), (1, 0), (1, 1)], []
    for i, id in enumerate(ids):
        adj = subMatrices[i]
        G = Matrices.dense(p, p, adj.transpose().flatten())
        blocks.append((id, G))
    blocksRdd = sc.parallelize(blocks, len(ids))
    return blocksRdd
def radialBasisBlock(pairData):
    I, J = int(pairData[0][0]), int(pairData[1][0])
    dataI, dataJ = pairData[0][1], pairData[1][1]
    n = len(dataI)
    allCombinations = itertools.product(dataI, dataJ)
    allCombsEdges = [radialBasisKernel(p[0], p[1]) for p in allCombinations]
    print('allCombsEdges ', len(allCombsEdges), (n * n))
    if len(allCombsEdges) == (n * n):
        adj = np.reshape(allCombsEdges, (n, n))
    else:
        adj = np.zeros((n, n))
    if I == J:
        adj[range(n), range(n)] = 0
    G = Matrices.dense(n, n, adj.transpose().flatten())
    return ((I, J), G)
def _colVectorToBlockMatrix(vec, rowsPerBlock, numSlices=None):
    sc = SparkContext.getOrCreate()
    remainder = len(vec) % rowsPerBlock
    if rowsPerBlock >= len(vec):
        splits = [vec]
    elif remainder == 0:
        splits = np.split(vec, len(vec) // rowsPerBlock)
    else:
        head = vec[:-remainder]
        splits = np.split(head, len(head) // rowsPerBlock)
        splits.append(vec[-remainder:])
    blocks = sc.parallelize(
        [((i, 0), OldMatrices.dense(len(split), 1, split))
         for i, split in enumerate(splits)],
        numSlices=numSlices)
    return BlockMatrix(blocks, rowsPerBlock, 1, len(vec), 1)
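# Usage sketch for _colVectorToBlockMatrix (runs against an active Spark
# session, which the helper itself obtains via SparkContext.getOrCreate):
import numpy as np

bm = _colVectorToBlockMatrix(np.arange(7.0), rowsPerBlock=3)
# 7 rows split as 3 + 3 + 1; the remainder becomes its own trailing block.
assert bm.numRows() == 7 and bm.numCols() == 1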
def test_ml_mllib_matrix_conversion(self):
    # to ml
    # dense
    mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
    mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
    mlDM2 = mllibDM.asML()
    self.assertEqual(mlDM2, mlDM1)
    # transposed
    mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
    mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
    mlDMt2 = mllibDMt.asML()
    self.assertEqual(mlDMt2, mlDMt1)
    # sparse
    mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mlSM2 = mllibSM.asML()
    self.assertEqual(mlSM2, mlSM1)
    # transposed
    mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mlSMt2 = mllibSMt.asML()
    self.assertEqual(mlSMt2, mlSMt1)

    # from ml
    # dense
    mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
    mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
    mllibDM2 = Matrices.fromML(mlDM)
    self.assertEqual(mllibDM1, mllibDM2)
    # transposed
    mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
    mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
    mllibDMt2 = Matrices.fromML(mlDMt)
    self.assertEqual(mllibDMt1, mllibDMt2)
    # sparse
    mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mllibSM2 = Matrices.fromML(mlSM)
    self.assertEqual(mllibSM1, mllibSM2)
    # transposed
    mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mllibSMt2 = Matrices.fromML(mlSMt)
    self.assertEqual(mllibSMt1, mllibSMt2)
def test_measures(self, targetDimension, testMeasure):
    chisquare_result = ChiSquareResult()
    df = self._data_frame.withColumn(
        testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
    measureSummaryDict = dict(df.describe([testMeasure]).toPandas().values)
    if float(measureSummaryDict["count"]) > 10:
        maxval = float(measureSummaryDict["max"])
        minval = float(measureSummaryDict["min"])
        step = (maxval - minval) / 5.0
        splits = [math.floor(minval), minval + step, minval + (step * 2),
                  minval + (step * 3), minval + (step * 4),
                  math.ceil(maxval)]
        bucketizer = Bucketizer(splits=splits, inputCol=testMeasure,
                                outputCol="bucketedColumn")
        # bucketedData = bucketizer.transform(df)
        bucketedData = bucketizer.transform(df.na.drop(subset=testMeasure))
        pivot_table = bucketedData.stat.crosstab(
            "{}".format(targetDimension), 'bucketedColumn')
    else:
        pivot_table = df.stat.crosstab("{}".format(targetDimension),
                                       testMeasure)
    rdd = list(
        chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
    data_matrix = Matrices.dense(pivot_table.count(),
                                 len(pivot_table.columns) - 1, rdd)
    result = Statistics.chiSqTest(data_matrix)
    chisquare_result.set_params(result)
    freq_table = self._get_contingency_table_of_freq(pivot_table)
    # NOTE: `splits` is only defined on the bucketized path above; the
    # small-count path would raise a NameError here.
    freq_table.update_col2_names(splits)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)
    # Cramer's V calculation
    stat_value = result.statistic
    n = freq_table.get_total()
    t = min(len(freq_table.column_one_values),
            len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    chisquare_result.set_split_values([float(x) for x in splits])
    # chisquare_result.set_buckeddata(bucketedData)
    return chisquare_result
def create_design_mat(tuple):
    twtname = tuple[0]
    word_tfs_str = tuple[1]
    i = 0
    twt_word_ind_dict = {}
    for spl in word_tfs_str.split(inter_tweet_delim):
        word = spl.split(in_tweet_delim)[0]
        tfidf = float(spl.split(in_tweet_delim)[1].strip())
        ind = word_index_dict_bcast.value[word]
        twt_word_ind_dict[ind] = tfidf
    design_row = []
    while i < total_tweet_words_bcast.value:
        design_row.append(twt_word_ind_dict.get(i, float(0)))
        i = i + 1
    sparse_des_row = Matrices.dense(1, total_tweet_words_bcast.value,
                                    design_row).toSparse()
    # Return the sparse design-matrix row for this tweet name.
    return (twtname, sparse_des_row)
def test_dimension(self, targetDimension, testDimension):
    if targetDimension not in self._dataframe_helper.get_string_columns():
        raise BIException.non_string_column(testDimension)
    chisquare_result = ChiSquareResult()
    if self._pandas_flag:
        pivot_table = pd.crosstab([self._data_frame[targetDimension]],
                                  self._data_frame[testDimension])
        try:
            data_matrix = np.array(
                pivot_table.as_matrix(columns=None)).astype(np.int)
        except Exception:
            data_matrix = np.array(pivot_table.values).astype(np.int)
    else:
        pivot_table = self._data_frame.stat.crosstab(
            "{}".format(targetDimension), testDimension)
        # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect()
        rdd = list(
            chain(*list(
                zip(*pivot_table.drop(pivot_table.columns[0]).collect()))))
        data_matrix = Matrices.dense(pivot_table.count(),
                                     len(pivot_table.columns) - 1, rdd)
        data_matrix = data_matrix.toArray().tolist()
    result = chi2_contingency(data_matrix)
    chisquare_result.set_params(result)
    freq_table = self._get_contingency_table_of_freq(pivot_table,
                                                     need_sorting=True)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)
    # Cramer's V calculation
    stat_value = result[0]
    n = freq_table.get_total()
    t = min(len(freq_table.column_one_values),
            len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    self._dataframe_helper.add_chisquare_significant_dimension(
        testDimension, v_value)
    return chisquare_result
def eye(n):
    m = np.eye(n, n)
    return Matrices.dense(n, n, m.flatten().tolist())
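# Quick sanity check for eye (assumes numpy and PySpark are importable).
# flatten() is row-major while DenseMatrix is column-major, but the
# identity matrix is symmetric, so the result still matches np.eye.
import numpy as np

assert np.array_equal(eye(3).toArray(), np.eye(3))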
from __future__ import print_function

# Section 7.2.1
from pyspark.mllib.linalg import Vectors, Vector
dv1 = Vectors.dense(5.0, 6.0, 7.0, 8.0)
dv2 = Vectors.dense([5.0, 6.0, 7.0, 8.0])
sv = Vectors.sparse(4, [0, 1, 2, 3], [5.0, 6.0, 7.0, 8.0])
dv2[2]
dv1.size
dv2.toArray()

from pyspark.mllib.linalg import Matrices
dm = Matrices.dense(2, 3, [5.0, 0.0, 0.0, 3.0, 1.0, 4.0])
sm = Matrices.sparse(2, 3, [0, 1, 2, 4], [0, 1, 0, 1], [5.0, 3.0, 1.0, 4.0])
sm.toDense()
dm.toSparse()
dm[1, 1]

# Section 7.2.2
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow
# `rm` is a RowMatrix defined earlier in the chapter; `rows` is a property.
rmind = IndexedRowMatrix(rm.rows.zipWithIndex().map(
    lambda x: IndexedRow(x[1], x[0])))

# Section 7.4
housingLines = sc.textFile("first-edition/ch07/housing.data", 6)
housingVals = housingLines.map(
    lambda x: Vectors.dense([float(v.strip()) for v in x.split(",")]))

# Section 7.4.1
from pyspark.mllib.linalg.distributed import RowMatrix
housingMat = RowMatrix(housingVals)
from pyspark.mllib.stat._statistics import Statistics
if __name__ == "__main__": sc = SparkContext(appName="HypothesisTestingExample") # $example on$ vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events # compute the goodness of fit. If a second vector to test against # is not supplied as a parameter, the test runs against a uniform distribution. goodnessOfFitTestResult = Statistics.chiSqTest(vec) # summary of the test including the p-value, degrees of freedom, # test statistic, the method used, and the null hypothesis. print("%s\n" % goodnessOfFitTestResult) mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix # conduct Pearson's independence test on the input contingency matrix independenceTestResult = Statistics.chiSqTest(mat) # summary of the test including the p-value, degrees of freedom, # test statistic, the method used, and the null hypothesis. print("%s\n" % independenceTestResult) obs = sc.parallelize( [LabeledPoint(1.0, [1.0, 0.0, 3.0]), LabeledPoint(1.0, [1.0, 2.0, 0.0]), LabeledPoint(1.0, [-1.0, 0.0, -0.5])] ) # LabeledPoint(label, feature) # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Labelled point with a positive label and a dense feature vector.
lp_pos = LabeledPoint(1.0, [5.0, 0.0, 1.0, 7.0])

# Labelled point with a negative label and a sparse feature vector.
lp_neg = LabeledPoint(0.0, SparseVector(4, [0, 2, 3], [5.0, 1.0, 7.0]))

#
# Local Matrix
#
from pyspark.mllib.linalg import Matrix, Matrices

# Dense matrix ((1.0, 3.0, 5.0), (2.0, 4.0, 6.0)) -- values are column-major
dMatrix = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])

# Sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
sMatrix = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])

#
# Code Plan
#
# 1 - Combine all tweet files into a single data frame
# 2 - Parse the tweets: remove stopwords, extract emoticons, extract URLs,
#     normalize the words (e.g., map to lowercase, remove punctuation and numbers)
# 3 - Feature extraction
#     3a - Tokenization
#     3b - TF-IDF
#     3c - Hash TF-IDF
# 4 - Run K-Means clustering
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

sc = SparkContext("local", "Rubbish")

"""
# RDD of Vectors
data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
                       Vectors.dense([4, 5, 0, 3]),
                       Vectors.dense([6, 7, 0, 8])])
"""

# Sample vector composed of frequencies of events
vect = Vectors.dense([4, 5, 0, 3])

# Goodness-of-fit test; the summary includes the p-value and degrees of freedom.
goodnessOfFitTestResult = Statistics.chiSqTest(vect)

sampleData = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
matrix = Matrices.dense(3, 4, sampleData)

# Conduct Pearson's independence test on the input contingency matrix.
independenceTestResult = Statistics.chiSqTest(matrix)

# Test statistic, the method used, and the null hypothesis.
print("SINGLE VECTOR FIT: ")
print(goodnessOfFitTestResult)

# Summary of the test including the p-value and degrees of freedom.
print("INDEPENDENCE TEST RESULT: ")
print(independenceTestResult)