Exemplo n.º 1
0
    def test_matrix_independence(self):
        """Chi-squared independence test on a 3x4 contingency matrix.

        Checks the statistic, degrees of freedom and p-value of a valid
        matrix, then checks that three invalid 2x2 matrices are rejected
        with IllegalArgumentException.
        """
        # Values are listed column by column (compare with the R command below).
        observed = Matrices.dense(3, 4, [
            40.0, 24.0, 29.0,
            56.0, 32.0, 42.0,
            31.0, 10.0, 0.0,
            30.0, 15.0, 12.0,
        ])
        outcome = Statistics.chiSqTest(observed)

        # Results validated against R command
        # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
        self.assertAlmostEqual(outcome.statistic, 21.9958, 4)
        self.assertEqual(outcome.degreesOfFreedom, 6)
        self.assertAlmostEqual(outcome.pValue, 0.001213, 4)

        rejected_inputs = [
            [4.0, 5.0, 3.0, -3.0],  # negative counts
            [0.0, 1.0, 0.0, 2.0],   # row sum = 0.0
            [0.0, 0.0, 2.0, 2.0],   # column sum = 0.0
        ]
        for values in rejected_inputs:
            # Build the matrix outside the assertion so only the test call
            # itself is expected to raise.
            bad_matrix = Matrices.dense(2, 2, values)
            with self.assertRaises(IllegalArgumentException):
                Statistics.chiSqTest(bad_matrix)
Exemplo n.º 2
0
 def loadBlockFromMatFile(filename):
     """Load one matrix block and its block id from a file read by loadmat.

     Reads the 'block_id' and 'G' entries and wraps G as a p x p local
     matrix (p comes from the enclosing scope): sparse when G is a scipy
     CSC matrix, dense otherwise.

     Returns a ((id[0], id[1]), matrix) pair.
     """
     contents = loadmat(filename, squeeze_me=True)
     block_id = contents['block_id']
     block = contents['G']
     if not isinstance(block, sparse.csc_matrix):
         # Flatten the transpose so the values are handed over column by column.
         local = Matrices.dense(p, p, block.transpose().flatten())
     else:
         local = Matrices.sparse(p, p, block.indptr, block.indices,
                                 block.data)
     return ((block_id[0], block_id[1]), local)
Exemplo n.º 3
0
 def test_computeRowSums(self):
     """Check _computeRowSums on a 2x2 grid of 3x2 dense blocks.

     The blocks hold the integers 1..24, six per block.  The expected
     values equal the per-column totals of the assembled matrix.
     """
     grid = [(0, 0), (0, 1), (1, 0), (1, 1)]
     keyed_blocks = []
     for position, coord in enumerate(grid):
         first = 6 * position + 1
         values = list(range(first, first + 6))
         keyed_blocks.append((coord, OldMatrices.dense(3, 2, values)))
     mat = BlockMatrix(self.sc.parallelize(keyed_blocks), 3, 2)
     totals = sparkle.util._computeRowSums(mat)
     self.assertTrue(np.all(totals == [48, 66, 84, 102]))
Exemplo n.º 4
0
 def g(block):
     """Soft-threshold every entry of one (key, matrix) block by rho.

     Each value v becomes sign(v) * max(0, |v| - rho); rho is taken from
     the enclosing scope.  The block key is passed through unchanged.

     Returns a (key, matrix) pair with the same dimensions as the input.
     """
     key, local = block[0], block[1]
     # OldMatrices.dense stores values column by column, so flatten in
     # Fortran order; a default (row-major) ravel would scramble the
     # entries of any non-symmetric block.
     values = local.toArray().ravel(order='F')
     shrunk = np.maximum(0, np.abs(values) - rho)
     newmat = OldMatrices.dense(local.numRows, local.numCols,
                                np.sign(values) * shrunk)
     return (key, newmat)
Exemplo n.º 5
0
 def test_dimension(self, targetDimension, testDimension):
     """Chi-square independence test between two dimension columns.

     Builds the crosstab of targetDimension against testDimension, runs
     Statistics.chiSqTest on its counts, and stores the test result, the
     frequency table and a Cramer's-V style effect size on a new
     ChiSquareResult, which is returned.  The effect size is also recorded
     on the dataframe helper.

     Raises the BIException built by non_string_column when
     targetDimension is not among the helper's string columns.
     """
     string_columns = self._dataframe_helper.get_string_columns()
     if targetDimension not in string_columns:
         # NOTE(review): the check is on targetDimension but the error is
         # built from testDimension - confirm which one is intended.
         raise BIException.non_string_column(testDimension)
     chisquare_result = ChiSquareResult()
     pivot_table = self._data_frame.stat.crosstab(
         "{}".format(targetDimension), testDimension)
     # Drop the label column, then transpose the collected rows so the
     # counts are listed column by column.
     count_rows = pivot_table.drop(pivot_table.columns[0]).collect()
     column_major_counts = list(chain(*zip(*count_rows)))
     n_rows = pivot_table.count()
     n_cols = len(pivot_table.columns) - 1
     data_matrix = Matrices.dense(n_rows, n_cols, column_major_counts)
     result = Statistics.chiSqTest(data_matrix)
     chisquare_result.set_params(result)
     freq_table = self._get_contingency_table_of_freq(pivot_table,
                                                      need_sorting=True)
     freq_table.set_tables()
     chisquare_result.set_table_result(freq_table)
     # Cramers V Calculation
     # NOTE(review): the textbook formula divides by min(r - 1, c - 1);
     # this uses the smaller of the two value counts as-is - confirm.
     chi_square = float(result.statistic)
     total = freq_table.get_total()
     smaller_side = min(len(freq_table.column_one_values),
                        len(freq_table.column_two_values))
     v_value = math.sqrt(chi_square / (total * float(smaller_side)))
     chisquare_result.set_v_value(v_value)
     self._dataframe_helper.add_chisquare_significant_dimension(
         testDimension, v_value)
     return chisquare_result
Exemplo n.º 6
0
def mult(A, B):
    """Multiply two block matrices, using a custom path for sparse blocks.

    If the first block of either operand is a DenseMatrix the built-in
    ``A.multiply(B)`` is returned.  Otherwise the blocks are re-keyed with
    affectLeft / affectRight, paired and combined with ``prod``, summed per
    output block with ``add``, and wrapped in a new BlockMatrix.

    :param A: left operand; ``A.numRows()`` is used as the size of both
        result dimensions
    :param B: right operand
    :return: BlockMatrix with SQUARE_BLOCK_SIZE x SQUARE_BLOCK_SIZE blocks
    """
    #-------LOG
    logging.warning("Multiplication started")
    logging.warning("A part count")
    logging.warning(A.blocks.getNumPartitions())
    logging.warning("B part count")
    logging.warning(B.blocks.getNumPartitions())
    #-----LOG

    # If dense, just call the inbuilt function.
    if (isinstance(A.blocks.first()[1], DenseMatrix)
            or isinstance(B.blocks.first()[1], DenseMatrix)):
        return A.multiply(B)

    N = A.numRows()
    p = SQUARE_BLOCK_SIZE
    # Floor division keeps the block count an int on Python 3 as well.
    num_blocks = N // p

    aleft = A.blocks.flatMap(lambda x: affectLeft(x, num_blocks))
    bright = B.blocks.flatMap(lambda x: affectRight(x, num_blocks))
    both = aleft.union(bright)
    # Entries sharing a three-part key are combined with prod.
    partial_products = both.reduceByKey(lambda a, b: prod(a, b))
    # Keep the first and third key components, then sum what collides.
    rekeyed = partial_products.map(lambda x: ((x[0][0], x[0][2]), x[1]))
    summed = rekeyed.reduceByKey(add)
    sparse_blocks = summed.map(lambda x: (
        (x[0][0], x[0][1]),
        Matrices.sparse(p, p, x[1].indptr, x[1].indices, x[1].data)))
    return BlockMatrix(sparse_blocks, p, p, N, N)
Exemplo n.º 7
0
def dist_deter(i):
    """Add the i-th first-row cofactor term of the global ``mat`` to ``det``.

    Builds the minor of ``mat`` without row 0 and column i, multiplies its
    determinant by ``mat[0][i]`` and adds the result to the global ``det``
    with a positive sign for even i and a negative sign for odd i.
    Nothing is returned.
    """
    global mat
    global rows
    global det
    # Minor entries, listed column by column.
    minor_values = [mat[k][j]
                    for j in range(0, rows) if j != i
                    for k in range(1, rows)]
    minor = Matrices.dense(rows - 1, rows - 1, minor_values)

    term = mat[0][i] * determinant(rows - 1, minor)
    det += term if i % 2 == 0 else (0 - term)
Exemplo n.º 8
0
def determinant(rows, mat):
    """Determinant of a square matrix by cofactor expansion along row 0.

    :param rows: matrix order; values <= 0 return the string "invalid"
    :param mat: for rows == 1 anything indexable, whose first element is
        returned; otherwise an object exposing ``toArray()``
    :return: the determinant, or "invalid" for a non-positive order
    """
    if rows <= 0:
        return "invalid"
    if rows == 1:
        return mat[0]

    dense = mat.toArray()
    if rows == 2:
        return dense[0][0] * dense[1][1] - dense[1][0] * dense[0][1]

    total = 0
    for i in range(rows):
        # Minor without row 0 and column i, listed column by column.
        minor_values = [dense[k][j]
                        for j in range(rows) if j != i
                        for k in range(1, rows)]
        minor = Matrices.dense(rows - 1, rows - 1, minor_values)
        term = dense[0][i] * determinant(rows - 1, minor)
        # Cofactor signs alternate along the first row.
        if i % 2 == 0:
            total = total + term
        else:
            total = total - term
    return total
Exemplo n.º 9
0
def to_matrix(np_array):
    ''' Convert a two-dimensional numpy array to an MLlib Matrix.

    :param np_array: numpy array; must have exactly two dimensions
    :return: result of ``Matrices.dense`` with the same shape and entries
    :raises ValueError: if ``np_array`` is not two-dimensional
    '''
    if len(np_array.shape) != 2:
        raise ValueError("An MLLib Matrix can only be created "
                         "from a two-dimensional numpy array")
    n_rows, n_cols = np_array.shape
    # Matrices.dense reads values column by column, so flatten in Fortran
    # order; a default (row-major) ravel would scramble the entries.
    return Matrices.dense(n_rows, n_cols, np_array.ravel(order='F'))
Exemplo n.º 10
0
def to_matrix(np_array):
    ''' Convert a two-dimensional numpy array to an MLlib Matrix.

    :param np_array: numpy array; must have exactly two dimensions
    :return: result of ``Matrices.dense`` with the same shape and entries
    :raises ValueError: if ``np_array`` is not two-dimensional
    '''
    if len(np_array.shape) != 2:
        raise ValueError("""An MLLib Matrix can only be created from a two-dimensional numpy array""")
    n_rows, n_cols = np_array.shape
    # Matrices.dense reads values column by column, so flatten in Fortran
    # order; a default (row-major) ravel would scramble the entries.
    return Matrices.dense(n_rows, n_cols, np_array.ravel(order='F'))
Exemplo n.º 11
0
def MapperLoadBlocksFromMatFile(filename):
    """Load the matrix block stored in a '<i>_<j>.mat' file.

    The block id is parsed from the end of the file name
    ('<digits>_<digits>.mat', case-insensitive) and the same '<i>_<j>'
    string is used as the key into the loaded data.  The block is wrapped
    as an n x n local matrix (n is its row count): sparse when it is a
    scipy CSC matrix, dense otherwise.

    :param filename: path whose name ends in '<i>_<j>.mat'
    :return: ((i, j), matrix), where i and j are the id parts as strings
    :raises AttributeError: if the file name does not match the pattern
    """
    logging.warning('MapperLoadBlocksFromMatFile started %s ', filename)
    data = loadmat(filename)
    logging.warning('Loaded data')
    # Raw string keeps \d a regex escape; the extension dot is literal.
    name = re.search(r'(\d+_\d+)\.mat$', filename, re.IGNORECASE).group(1)
    G = data[name]
    block_id = name.split('_')
    n = G.shape[0]
    logging.warning('Before sparse conversion')
    if isinstance(G, sparse.csc_matrix):
        sub_matrix = Matrices.sparse(n, n, G.indptr, G.indices, G.data)
    else:
        # Flatten the transpose so the values are handed over column by column.
        sub_matrix = Matrices.dense(n, n, G.transpose().flatten())
    logging.warning('MapperLoadBlocksFromMatFile Ended')
    return ((block_id[0], block_id[1]), sub_matrix)
Exemplo n.º 12
0
 def difun(x, vect):
     """Build the p x p sparse block at block coordinates x of a diagonal matrix.

     For a diagonal block (x[0] == x[1]) the block holds the slice of vect
     belonging to that block on its main diagonal; any other block is an
     empty p x p sparse matrix.  p comes from the enclosing scope.

     Returns an (x, matrix) pair.
     """
     if x[0] == x[1]:
         # One stored entry per column: pointers 0..p, row indices 0..p-1.
         col_ptrs = np.linspace(0, p, num = (p+1))
         row_idx = np.linspace(0, p-1, num = p)
         diag_values = vect[(x[0]*p):((x[0]+1)*p)]
         return (x, SparseMatrix(p, p, col_ptrs, row_idx, diag_values))
     empty = sparse.csc_matrix((p, p))
     return (x, Matrices.sparse(p, p, empty.indptr, empty.indices,
                                empty.data))
Exemplo n.º 13
0
def to_matrix(np_array):
    """Convert a two-dimensional numpy array to an MLlib Matrix.

    :param np_array: numpy array; must have exactly two dimensions
    :return: result of ``Matrices.dense`` with the same shape and entries
    :raises ValueError: if ``np_array`` is not two-dimensional
    """
    if len(np_array.shape) != 2:
        raise ValueError(
            'An MLLib Matrix can only be created from a two-dimensional numpy array'
        )
    n_rows, n_cols = np_array.shape
    # Matrices.dense reads values column by column, so flatten in Fortran
    # order; a default (row-major) ravel would scramble the entries.
    return Matrices.dense(n_rows, n_cols, np_array.ravel(order='F'))
Exemplo n.º 14
0
def normalizeLaplacian(block, d1):
    """Scale one ((I, J), matrix) block entrywise by two entries of d1.

    Entry (i, j) of the block is multiplied by d1[I * p + i] and
    d1[J * p + j], where p is SQUARE_BLOCK_SIZE.

    Returns (block key, p x p dense matrix).
    """
    (I, J), local = block[0], block[1]
    entries = local.toArray()
    size = SQUARE_BLOCK_SIZE
    row_base, col_base = I * size, J * size
    scaled = np.zeros((size, size))
    for r in range(size):
        for c in range(size):
            scaled[r, c] = (entries[r, c] * d1[row_base + r]
                            * d1[col_base + c])
    # Flatten the transpose so the values are handed over column by column.
    return (block[0], Matrices.dense(size, size,
                                     scaled.transpose().flatten()))
Exemplo n.º 15
0
def to_matrix(np_array):
    """Convert a two-dimensional numpy array to an MLlib Matrix.

    :param np_array: numpy array; must have exactly two dimensions
    :return: result of ``Matrices.dense`` with the same shape and entries
    :raises ValueError: if ``np_array`` is not two-dimensional
    """
    n_dims = len(np_array.shape)
    if n_dims != 2:
        raise ValueError(
            "An MLLib Matrix can only be created from a two-dimensional " +
            "numpy array, got {}".format(n_dims))
    n_rows, n_cols = np_array.shape
    # Matrices.dense reads values column by column, so flatten in Fortran
    # order; a default (row-major) ravel would scramble the entries.
    return Matrices.dense(n_rows, n_cols, np_array.ravel(order='F'))
Exemplo n.º 16
0
def main():
    """Multiply a random n x n single-block BlockMatrix by itself.

    n is read from the first command-line argument; without it a usage
    line is printed and the function returns.  The product is collected
    and then n is printed.
    """
    if len(sys.argv) < 2:
        print('USAGE: matrix_mult.py <dim of matrix>')
        return

    n = int(sys.argv[1])
    random_values = np.random.randint(1, n * n, n * n).tolist()
    single_block = sc.parallelize([((0, 0),
                                    Matrices.dense(n, n, random_values))])
    right = BlockMatrix(single_block, n, n)
    left = BlockMatrix(single_block, n, n)
    product = left.multiply(right)
    # Collecting forces the distributed multiplication to run.
    product.toIndexedRowMatrix().toRowMatrix().rows.collect()
    print('****************n:', n)
Exemplo n.º 17
0
    def convertMahoutToSparkMatrix(mahoutMatrix):
        """
        Wrap a two-dimensional array as a Spark dense matrix.

        :param mahoutMatrix: object exposing ``shape`` and ``transpose()``
        :return: result of ``Matrices.dense`` with the same shape
        """
        n_rows, n_cols = mahoutMatrix.shape
        # Dense matrices are column major, so flatten the transpose.
        column_major = mahoutMatrix.transpose().flatten().tolist()
        return Matrices.dense(n_rows, n_cols, column_major)
Exemplo n.º 18
0
 def g(block):
     """Center and scale the columns of one ((i, j), matrix) block.

     cb and sb (from the enclosing scope) hold the center and scale.  Each
     is either a scalar applied to every column or a sequence from which
     this block's columns are sliced, starting at colsPerBlock.value * j.

     Returns ((i, j), dense matrix) with the same shape as the input block.
     """
     (i, j), local = block[0], block[1]
     values = local.toArray()
     n, m = values.shape
     first_col = colsPerBlock.value * j
     last_col = first_col + m

     def pick(shared):
         # A scalar applies as-is; otherwise take this block's columns.
         if np.isscalar(shared):
             return shared
         return shared[first_col:last_col]

     standardized = (values - pick(cb.value)) / pick(sb.value)
     # Fortran order matches the column-major layout of dense matrices.
     wrapped = OldMatrices.dense(n, m, standardized.ravel(order='F'))
     return ((i, j), wrapped)
Exemplo n.º 19
0
    def test_matrix_independence(self):
        """Chi-squared independence test on a 3x4 contingency matrix.

        Checks the statistic, degrees of freedom and p-value of a valid
        matrix, then checks that three invalid 2x2 matrices make
        Statistics.chiSqTest raise Py4JJavaError.
        """
        # Values are listed column by column (compare with the R command below).
        observed = Matrices.dense(3, 4, [
            40.0, 24.0, 29.0,
            56.0, 32.0, 42.0,
            31.0, 10.0, 0.0,
            30.0, 15.0, 12.0,
        ])
        outcome = Statistics.chiSqTest(observed)

        # Results validated against R command
        # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
        self.assertAlmostEqual(outcome.statistic, 21.9958, 4)
        self.assertEqual(outcome.degreesOfFreedom, 6)
        self.assertAlmostEqual(outcome.pValue, 0.001213, 4)

        rejected_inputs = [
            [4.0, 5.0, 3.0, -3.0],  # negative counts
            [0.0, 1.0, 0.0, 2.0],   # row sum = 0.0
            [0.0, 0.0, 2.0, 2.0],   # column sum = 0.0
        ]
        for values in rejected_inputs:
            # Build the matrix outside the assertion so only the test call
            # itself is expected to raise.
            bad_matrix = Matrices.dense(2, 2, values)
            with self.assertRaises(Py4JJavaError):
                Statistics.chiSqTest(bad_matrix)
Exemplo n.º 20
0
def eucledianDistances(ZblockPair):
    """Pairwise Euclidean distances between the rows of two keyed blocks.

    ZblockPair is a pair of (id, block) entries.  The n x n distance table
    (n is the row count of the first block) is wrapped as a dense matrix;
    if the number of row pairs is not n * n an all-zero table is used.

    Returns ((I, J), matrix) with I and J the integer block ids.
    """
    first, second = ZblockPair[0], ZblockPair[1]
    I, J = int(first[0]), int(second[0])
    blockI, blockJ = first[1], second[1]
    distances = [np.linalg.norm(a - b)
                 for a, b in itertools.product(blockI, blockJ)]
    n = blockI.shape[0]
    if len(distances) != (n * n):
        adj = np.zeros((n, n))
    else:
        adj = np.reshape(distances, (n, n))
    # Flatten the transpose so the values are handed over column by column.
    return ((I, J), Matrices.dense(n, n, adj.transpose().flatten()))
Exemplo n.º 21
0
def CreateInputs(input_case):
    """Load one commute-distance test case as a BlockMatrix plus edge list.

    Reads the 'G' and 'elist' entries of the test file selected by
    input_case and wraps G as a single dense block at (0, 0) of a
    BlockMatrix with SQUARE_BLOCK_SIZE rows and columns per block.

    Returns (adjacency BlockMatrix, edge list).
    """
    prefix = '/u/vparames/TESTS/3/test-commute-dist-'
    contents = loadmat(prefix + str(input_case) + '.mat')
    adjacency = contents['G']
    edge_list = contents['elist']
    order = adjacency.shape[0]
    # Flatten the transpose so the values are handed over column by column.
    dense_block = Matrices.dense(order, order,
                                 adjacency.transpose().flatten())
    single_block_rdd = sc.parallelize([((0, 0), dense_block)])
    block_matrix = BlockMatrix(single_block_rdd, SQUARE_BLOCK_SIZE,
                               SQUARE_BLOCK_SIZE)
    return block_matrix, edge_list
Exemplo n.º 22
0
def difun(x, vect):
    """Build the sparse block at block coordinates x of a diagonal matrix.

    For a diagonal block (x[0] == x[1]) the block holds the slice of vect
    belonging to that block on its main diagonal; any other block is an
    empty sparse matrix.  Blocks are SQUARE_BLOCK_SIZE on each side.

    Returns an (x, matrix) pair.
    """
    size = SQUARE_BLOCK_SIZE
    if x[0] == x[1]:
        # One stored entry per column: pointers 0..size, rows 0..size-1.
        col_ptrs = np.linspace(0, size, num=(size + 1))
        row_idx = np.linspace(0, size - 1, num=size)
        diag_values = vect[(x[0] * size):((x[0] + 1) * size)]
        return (x, SparseMatrix(size, size, col_ptrs, row_idx, diag_values))
    empty = sparse.csc_matrix((size, size))
    empty_block = Matrices.sparse(size, size, empty.indptr, empty.indices,
                                  empty.data)
    return (x, empty_block)
Exemplo n.º 23
0
def main():
    """Multiply a random n x n single-block BlockMatrix by itself.

    n is read from the first command-line argument; without it a usage
    line is printed and the function returns.  The product is collected
    and then n is printed.
    """
    if len(sys.argv) < 2:
        print('USAGE: matrix_mult.py <dim of matrix>')
        return

    n = int(sys.argv[1])
    random_values = np.random.randint(1, n * n, n * n).tolist()
    single_block = sc.parallelize([((0, 0),
                                    Matrices.dense(n, n, random_values))])
    right = BlockMatrix(single_block, n, n)
    left = BlockMatrix(single_block, n, n)
    product = left.multiply(right)
    # Collecting forces the distributed multiplication to run.
    product.toIndexedRowMatrix().toRowMatrix().rows.collect()
    print('****************n:', n)
Exemplo n.º 24
0
 def difun(self, x, vect):
     """Build the sparse block at block coordinates x of a diagonal matrix.

     For a diagonal block (x[0] == x[1]) the block holds the slice of vect
     belonging to that block on its main diagonal; any other block is an
     empty sparse matrix.  The block side is a deep copy of
     self.squareBlockSize.

     Returns an (x, matrix) pair.
     """
     size = copy.deepcopy(self.squareBlockSize)
     if x[0] == x[1]:
         # One stored entry per column: pointers 0..size, rows 0..size-1.
         col_ptrs = np.linspace(0, size, num=(size + 1))
         row_idx = np.linspace(0, size - 1, num=size)
         diag_values = vect[(x[0] * size):((x[0] + 1) * size)]
         return (x, SparseMatrix(size, size, col_ptrs, row_idx,
                                 diag_values))
     empty = sparse.csc_matrix((size, size))
     empty_block = Matrices.sparse(size, size, empty.indptr, empty.indices,
                                   empty.data)
     return (x, empty_block)
Exemplo n.º 25
0
def constructElectionBlock(pairDonations):
    """Build one adjacency block from a pair of keyed donation arrays.

    pairDonations is a pair of (id, donations) entries.  Every row pair is
    scored with edgeDefinitionElection; the scores form an n x n table (n
    is the row count of the first array), or an all-zero table when the
    number of pairs is not n * n.  The diagonal is cleared when both ids
    are equal.

    Returns ((I, J), matrix); the matrix is sparse when GENERATE_SPARSE is
    truthy and dense otherwise.
    """
    first, second = pairDonations[0], pairDonations[1]
    I = int(first[0])
    J = int(second[0])
    donationsI, donationsJ = first[1], second[1]

    n = donationsI.shape[0]
    scores = [edgeDefinitionElection(a, b)
              for a, b in itertools.product(donationsI, donationsJ)]
    if len(scores) != (n*n):
        adj = np.zeros((n, n))
    else:
        adj = np.reshape(scores, (n, n))
    if I == J:
        diagonal = range(n)
        adj[diagonal, diagonal] = 0

    if not GENERATE_SPARSE:
        # Flatten the transpose so the values are handed over column by column.
        return ((I, J), Matrices.dense(n, n, adj.transpose().flatten()))
    csc = sparse.csc_matrix(adj)
    return ((I, J), Matrices.sparse(n, n, csc.indptr, csc.indices, csc.data))
Exemplo n.º 26
0
def createAdjMatToy(graphNodes, year, sparseG, blockSize, sc):
    """Load a toy adjacency matrix and split it into four dense blocks.

    The file 'toy_A<index>.mat' under the toy-example path is read, where
    index is 1 for year <= 12 and 2 otherwise.  Its 'G' entry is cut into
    four p x p quadrants (p is half the row count, rounded down) keyed
    (0,0), (0,1), (1,0), (1,1).

    :param graphNodes: unused here
    :param year: selects the input file
    :param sparseG: unused here
    :param blockSize: unused here
    :param sc: SparkContext used to parallelize the blocks
    :return: RDD of ((i, j), dense matrix) pairs, one partition per block
    """
    index = (year > 12) + 1 # 12 maps to 1, 16 maps to 2
    path = 'BASE_PATH/toy_example/'
    A = loadmat(path + 'toy_A' + str(index) + '.mat')['G']
    n = A.shape[0]
    # Floor division: a float p breaks slicing and Matrices.dense on Python 3.
    p = n // 2
    quadrants = [
        ((0, 0), A[:p, :p]),
        ((0, 1), A[:p, p:]),
        ((1, 0), A[p:, :p]),
        ((1, 1), A[p:, p:]),
    ]
    # Flatten each transpose so values are handed over column by column.
    blocks = [(block_id, Matrices.dense(p, p, adj.transpose().flatten()))
              for block_id, adj in quadrants]
    blocksRdd = sc.parallelize(blocks, len(blocks))
    return blocksRdd
Exemplo n.º 27
0
 def radialBasisBlock(pairData):
     """Build one dense kernel block from a pair of keyed data chunks.

     pairData is a pair of (id, data) entries.  Every element pair is
     scored with radialBasisKernel; the scores form an n x n table (n is
     the length of the first chunk), or an all-zero table when the number
     of pairs is not n * n.  The diagonal is cleared when both ids are
     equal.

     Returns ((I, J), dense matrix).
     """
     I, J = int(pairData[0][0]), int(pairData[1][0])
     dataI, dataJ = pairData[0][1], pairData[1][1]
     n = len(dataI)
     allCombinations = itertools.product(dataI, dataJ)
     allCombsEdges = [radialBasisKernel(p[0], p[1]) for p in allCombinations]
     # Single-argument print() emits the same text on Python 2 and 3.
     print('allCombsEdges  %s %s' % (len(allCombsEdges), n * n))
     if len(allCombsEdges) == (n*n):
         adj = np.reshape(allCombsEdges, (n,n))
     else:
         adj = np.zeros((n,n))
     if I==J:
         adj[range(n), range(n)] = 0
     # Flatten the transpose so the values are handed over column by column.
     G = Matrices.dense(n,n, adj.transpose().flatten())
     return ((I,J), G)
Exemplo n.º 28
0
def _colVectorToBlockMatrix(vec, rowsPerBlock, numSlices=None):
    """Turn a vector into a one-column BlockMatrix.

    The vector is cut into consecutive pieces of rowsPerBlock entries; a
    shorter final piece holds any remainder.  Piece i becomes the dense
    block at (i, 0).

    :param vec: sequence of values; sliced and passed to np.split
    :param rowsPerBlock: number of rows per block
    :param numSlices: forwarded to ``parallelize``
    :return: BlockMatrix with len(vec) rows and one column
    """
    sc = SparkContext.getOrCreate()
    total = len(vec)
    leftover = total % rowsPerBlock
    if rowsPerBlock >= total:
        pieces = [vec]
    elif leftover == 0:
        pieces = np.split(vec, total // rowsPerBlock)
    else:
        # Split the evenly divisible head, then append the short tail.
        evenly_divisible = vec[:-leftover]
        pieces = np.split(evenly_divisible,
                          len(evenly_divisible) // rowsPerBlock)
        pieces.append(vec[-leftover:])
    keyed = [((idx, 0), OldMatrices.dense(len(piece), 1, piece))
             for idx, piece in enumerate(pieces)]
    blocks = sc.parallelize(keyed, numSlices=numSlices)
    return BlockMatrix(blocks, rowsPerBlock, 1, total, 1)
Exemplo n.º 29
0
 def test_ml_mllib_matrix_conversion(self):
     """Convert local matrices between this module and ``newlinalg``.

     Covers dense and sparse matrices, each plain and transposed, in both
     directions: ``asML()`` and ``Matrices.fromML()``.
     """
     # to ml: (mllib input, expected ml matrix)
     to_ml_pairs = [
         # dense
         (Matrices.dense(2, 2, [0, 1, 2, 3]),
          newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])),
         # dense, transposed
         (DenseMatrix(2, 2, [0, 1, 2, 3], True),
          newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)),
         # sparse
         (Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]),
          newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])),
         # sparse, transposed
         (SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True),
          newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4],
                                 True)),
     ]
     for mllib_matrix, expected_ml in to_ml_pairs:
         self.assertEqual(mllib_matrix.asML(), expected_ml)

     # from ml: (expected mllib matrix, ml input)
     from_ml_pairs = [
         # dense
         (Matrices.dense(2, 2, [1, 2, 3, 4]),
          newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])),
         # dense, transposed
         (DenseMatrix(2, 2, [1, 2, 3, 4], True),
          newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)),
         # sparse
         (Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]),
          newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])),
         # sparse, transposed
         (SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True),
          newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4],
                                 True)),
     ]
     for expected_mllib, ml_matrix in from_ml_pairs:
         self.assertEqual(expected_mllib, Matrices.fromML(ml_matrix))
Exemplo n.º 30
0
    def test_measures(self, targetDimension, testMeasure):
        """Chi-square independence test between a dimension and a measure.

        The measure column is cast to double.  With more than 10 non-null
        values it is bucketed into five equal-width bins between the
        floored minimum and the ceiled maximum before the crosstab is
        built; otherwise the raw values are cross-tabulated.  The test
        result, the frequency table and a Cramer's-V style effect size are
        stored on a new ChiSquareResult, which is returned.

        Bucket boundaries are applied to the frequency table and recorded
        on the result only when bucketing took place.
        """
        chisquare_result = ChiSquareResult()
        df = self._data_frame.withColumn(
            testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
        measureSummaryDict = dict(df.describe([testMeasure]).toPandas().values)
        # Stays None on the un-bucketed path; previously the later uses of
        # ``splits`` raised UnboundLocalError there.
        splits = None
        if float(measureSummaryDict["count"]) > 10:
            maxval = float(measureSummaryDict["max"])
            minval = float(measureSummaryDict["min"])
            step = (maxval - minval) / 5.0
            splits = [
                math.floor(minval), minval + step, minval + (step * 2),
                minval + (step * 3), minval + (step * 4),
                math.ceil(maxval)
            ]
            bucketizer = Bucketizer(splits=splits,
                                    inputCol=testMeasure,
                                    outputCol="bucketedColumn")
            # Rows with a null measure are dropped before bucketing.
            bucketedData = bucketizer.transform(df.na.drop(subset=testMeasure))
            pivot_table = bucketedData.stat.crosstab(
                "{}".format(targetDimension), 'bucketedColumn')
        else:
            pivot_table = df.stat.crosstab("{}".format(targetDimension),
                                           testMeasure)

        # Drop the label column, then transpose the collected rows so the
        # counts are listed column by column.
        rdd = list(
            chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
        data_matrix = Matrices.dense(pivot_table.count(),
                                     len(pivot_table.columns) - 1, rdd)
        result = Statistics.chiSqTest(data_matrix)
        chisquare_result.set_params(result)
        freq_table = self._get_contingency_table_of_freq(pivot_table)
        if splits is not None:
            freq_table.update_col2_names(splits)
        freq_table.set_tables()
        chisquare_result.set_table_result(freq_table)
        # Cramers V Calculation
        # NOTE(review): the textbook formula divides by min(r - 1, c - 1);
        # this uses the smaller of the two value counts as-is - confirm.
        stat_value = result.statistic
        n = freq_table.get_total()
        t = min(len(freq_table.column_one_values),
                len(freq_table.column_two_values))

        v_value = math.sqrt(float(stat_value) / (n * float(t)))
        chisquare_result.set_v_value(v_value)
        if splits is not None:
            chisquare_result.set_split_values([float(x) for x in splits])
        return chisquare_result
Exemplo n.º 31
0
 def test_ml_mllib_matrix_conversion(self):
     """Convert local matrices between this module and ``newlinalg``.

     Covers dense and sparse matrices, each plain and transposed, in both
     directions: ``asML()`` and ``Matrices.fromML()``.
     """
     # to ml: (mllib input, expected ml matrix)
     to_ml_pairs = [
         # dense
         (Matrices.dense(2, 2, [0, 1, 2, 3]),
          newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])),
         # dense, transposed
         (DenseMatrix(2, 2, [0, 1, 2, 3], True),
          newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)),
         # sparse
         (Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]),
          newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])),
         # sparse, transposed
         (SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True),
          newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4],
                                 True)),
     ]
     for mllib_matrix, expected_ml in to_ml_pairs:
         self.assertEqual(mllib_matrix.asML(), expected_ml)

     # from ml: (expected mllib matrix, ml input)
     from_ml_pairs = [
         # dense
         (Matrices.dense(2, 2, [1, 2, 3, 4]),
          newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])),
         # dense, transposed
         (DenseMatrix(2, 2, [1, 2, 3, 4], True),
          newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)),
         # sparse
         (Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]),
          newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])),
         # sparse, transposed
         (SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True),
          newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4],
                                 True)),
     ]
     for expected_mllib, ml_matrix in from_ml_pairs:
         self.assertEqual(expected_mllib, Matrices.fromML(ml_matrix))
Exemplo n.º 32
0
def create_design_mat(tuple):
    """Build the sparse design-matrix row for one tweet.

    ``tuple`` is (tweet name, encoded word/tf-idf string).  The string is
    split on inter_tweet_delim into entries and each entry on
    in_tweet_delim into a word and its tf-idf value.  Words are mapped to
    column indices through word_index_dict_bcast; columns without a word
    hold 0.0.

    Returns (tweet name, 1 x total_tweet_words sparse matrix).
    """
    twtname, word_tfs_str = tuple[0], tuple[1]
    tfidf_by_index = {}
    for entry in word_tfs_str.split(inter_tweet_delim):
        fields = entry.split(in_tweet_delim)
        word = fields[0]
        tfidf = float(fields[1].strip())
        tfidf_by_index[word_index_dict_bcast.value[word]] = tfidf

    n_columns = total_tweet_words_bcast.value
    design_row = [tfidf_by_index.get(col, float(0))
                  for col in range(n_columns)]
    sparse_des_row = Matrices.dense(1, n_columns, design_row).toSparse()
    # return the float design matrix row for the tweet name
    return (twtname, sparse_des_row)
Exemplo n.º 33
0
 def test_dimension(self, targetDimension, testDimension):
     """Chi-square independence test between two dimension columns.

     Builds the crosstab of targetDimension against testDimension (with
     pandas when self._pandas_flag is set, with Spark otherwise), runs
     chi2_contingency on its counts, and stores the test result, the
     frequency table and a Cramer's-V style effect size on a new
     ChiSquareResult, which is returned.  The effect size is also recorded
     on the dataframe helper.

     Raises the BIException built by non_string_column when
     targetDimension is not among the helper's string columns.
     """
     if targetDimension not in self._dataframe_helper.get_string_columns():
         # NOTE(review): the check is on targetDimension but the error is
         # built from testDimension - confirm which one is intended.
         raise BIException.non_string_column(testDimension)
     chisquare_result = ChiSquareResult()
     if self._pandas_flag:
         pivot_table = pd.crosstab([self._data_frame[targetDimension]],
                                   self._data_frame[testDimension])
         # ``.values`` is available in every pandas version (as_matrix was
         # removed in 1.0) and builtin ``int`` is what the ``np.int`` alias
         # (removed in NumPy 1.24) stood for, so no fallback is needed.
         data_matrix = np.array(pivot_table.values).astype(int)
     else:
         pivot_table = self._data_frame.stat.crosstab(
             "{}".format(targetDimension), testDimension)
         # Drop the label column, then transpose the collected rows so the
         # counts are listed column by column.
         rdd = list(
             chain(*list(
                 zip(*pivot_table.drop(pivot_table.columns[0]).collect()))))
         data_matrix = Matrices.dense(pivot_table.count(),
                                      len(pivot_table.columns) - 1, rdd)
         data_matrix = data_matrix.toArray().tolist()
     result = chi2_contingency(data_matrix)
     chisquare_result.set_params(result)
     freq_table = self._get_contingency_table_of_freq(pivot_table,
                                                      need_sorting=True)
     freq_table.set_tables()
     chisquare_result.set_table_result(freq_table)
     # Cramers V Calculation
     # NOTE(review): the textbook formula divides by min(r - 1, c - 1);
     # this uses the smaller of the two value counts as-is - confirm.
     stat_value = result[0]
     n = freq_table.get_total()
     t = min(len(freq_table.column_one_values),
             len(freq_table.column_two_values))
     v_value = math.sqrt(float(stat_value) / (n * float(t)))
     chisquare_result.set_v_value(v_value)
     self._dataframe_helper.add_chisquare_significant_dimension(
         testDimension, v_value)
     return chisquare_result
Exemplo n.º 34
0
 def eye(n):
     """Return the n x n identity as a dense matrix from Matrices.dense."""
     identity_values = np.eye(n, n).flatten().tolist()
     return Matrices.dense(n, n, identity_values)
Exemplo n.º 35
0
from __future__ import print_function

# Section 7.2.1: local vectors and matrices.
# NOTE(review): these lines read like an interactive transcript; the bare
# expressions (dv2[2], dv1.size, ...) compute values that are not stored.
from pyspark.mllib.linalg import Vectors, Vector
# The same dense vector built from separate arguments and from a list.
dv1 = Vectors.dense(5.0,6.0,7.0,8.0)
dv2 = Vectors.dense([5.0,6.0,7.0,8.0])
# Sparse vector of size 4 with all four indices populated.
sv = Vectors.sparse(4, [0,1,2,3], [5.0,6.0,7.0,8.0])
dv2[2]
dv1.size
dv2.toArray()

from pyspark.mllib.linalg import Matrices

# Dense values are given column by column: ((5.0, 0.0, 1.0), (0.0, 3.0, 4.0)).
dm = Matrices.dense(2,3,[5.0,0.0,0.0,3.0,1.0,4.0])
# The same 2x3 matrix as column pointers, row indices and non-zero values.
sm = Matrices.sparse(2,3,[0,1,2,4], [0,1,0,1], [5.0,3.0,1.0,4.0])
sm.toDense()
dm.toSparse()
dm[1,1]

# Section 7.2.2: distributed matrices.
# NOTE(review): `rm` is not defined in the visible part of this script.
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow
# Pair every row with its position to build an IndexedRowMatrix.
rmind = IndexedRowMatrix(rm.rows().zipWithIndex().map(lambda x: IndexedRow(x[1], x[0])))

# Section 7.4: load the housing data set.
# NOTE(review): `sc` is not defined in the visible part of this script.
housingLines = sc.textFile("first-edition/ch07/housing.data", 6)
# Each comma-separated line becomes a dense vector of floats.
housingVals = housingLines.map(lambda x: Vectors.dense([float(v.strip()) for v in x.split(",")]))

# Section 7.4.1: wrap the vectors in a distributed RowMatrix.
from pyspark.mllib.linalg.distributed import RowMatrix
housingMat = RowMatrix(housingVals)
from pyspark.mllib.stat._statistics import Statistics
# Chi-squared hypothesis testing example: a goodness-of-fit test on a
# vector, an independence test on a contingency matrix, and an RDD of
# labeled points.
# NOTE(review): SparkContext is not imported in the visible part of this
# file, and LabeledPoint is only imported further down - confirm the imports.
if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingExample")

    # $example on$
    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)  # a vector composed of the frequencies of events

    # compute the goodness of fit. If a second vector to test against
    # is not supplied as a parameter, the test runs against a uniform distribution.
    goodnessOfFitTestResult = Statistics.chiSqTest(vec)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % goodnessOfFitTestResult)

    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix

    # conduct Pearson's independence test on the input contingency matrix
    independenceTestResult = Statistics.chiSqTest(mat)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % independenceTestResult)

    obs = sc.parallelize(
        [LabeledPoint(1.0, [1.0, 0.0, 3.0]),
         LabeledPoint(1.0, [1.0, 2.0, 0.0]),
         LabeledPoint(1.0, [-1.0, 0.0, -0.5])]
    )  # LabeledPoint(label, feature)

    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
    # NOTE(review): the fragment stops here; the statement that the comment
    # above introduces is not present, so `obs` is never used.
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Labelled point with a positive label and a dense feature vector.
lp_pos = LabeledPoint(1.0, [5.0, 0.0, 1.0, 7.0])

# Labelled point with a negative label and a sparse feature vector
# (size 4, non-zero entries at indices 0, 2 and 3).
lp_neg = LabeledPoint(0.0, SparseVector(4, [0, 2, 3], [5.0, 1.0, 7.0]))

#
# Local Matrix
#
from pyspark.mllib.linalg import Matrix, Matrices

# Dense 2x3 matrix; values are given column by column, so this is
# ((1.0, 3.0, 5.0), (2.0, 4.0, 6.0)).
dMatrix = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])

# Sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)), given as column
# pointers, row indices and non-zero values.
sMatrix = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])

#
# Code Plan
#
#
# 1- Combine all tweets files into a single data frame
# 2- Parse the Tweets - remove stopwords - extract emoticons - extract url - normalize your words (e.g., mapping them to lowercase and removing punctuation and numbers)
# 3- Feature extraction
# 		3a- Tokenisation
# 		3b- TF-IDF
# 		3c- Hash TF-IDF
# 4- Run K-Means clustering
Exemplo n.º 38
0
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics


# Chi-squared tests on a frequency vector and on a contingency matrix.
# NOTE(review): SparkContext, Vectors and Matrices are not imported in the
# visible part of this file - confirm they are imported above.
sc = SparkContext("local", "Rubbish")

"""
# RDD of Vectors
data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
                       Vectors.dense([4, 5, 0,  3]),
                       Vectors.dense([6, 7, 0,  8])])
"""

# Sample vector composed of the frequencies of events
vect = Vectors.dense([4,5,0,3])

# Goodness-of-fit test on the frequency vector alone (no expected vector
# is supplied).
goodnessOfFitTestResult = Statistics.chiSqTest(vect)

# 3x4 contingency matrix; the values are given column by column.
sampleData = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
matrix = Matrices.dense(3,4, sampleData)
# Conduct Pearson's independence test on the input contingency matrix
independenceTestResult = Statistics.chiSqTest(matrix)


# Print both result summaries.  Single-argument print() behaves the same
# on Python 2 and Python 3.
print("SINGLE VECTOR FIT: ")
print(goodnessOfFitTestResult)
print("INDEPENDENCE TEST RESULT: ")
print(independenceTestResult)