Example #1
def to_matrix(np_array):
    ''' Convert a numpy array to an MLlib Matrix '''
    if len(np_array.shape) == 2:
        # Matrices.dense reads the values in column-major order,
        # so ravel with Fortran ordering to preserve the layout.
        return Matrices.dense(np_array.shape[0],
                              np_array.shape[1],
                              np_array.ravel(order='F'))
    else:
        raise Exception("An MLlib Matrix can only be created "
                        "from a two-dimensional numpy array")
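A minimal usage sketch for the helper above (not part of the original snippet; it assumes numpy is imported as np and Matrices comes from pyspark.mllib.linalg). With the Fortran-ordered ravel the conversion round-trips exactly:

import numpy as np
from pyspark.mllib.linalg import Matrices

arr = np.array([[1.0, 2.0, 3.0],
                [4.0, 5.0, 6.0]])

# Convert and check that the element layout is preserved.
m = to_matrix(arr)
print(np.array_equal(m.toArray(), arr))  # True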
Example #2
 def test_ml_mllib_matrix_conversion(self):
     # to ml
     # dense
     mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
     mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
     mlDM2 = mllibDM.asML()
     self.assertEqual(mlDM2, mlDM1)
     # transposed
     mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
     mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
     mlDMt2 = mllibDMt.asML()
     self.assertEqual(mlDMt2, mlDMt1)
     # sparse
     mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mlSM2 = mllibSM.asML()
     self.assertEqual(mlSM2, mlSM1)
     # transposed
     mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mlSMt2 = mllibSMt.asML()
     self.assertEqual(mlSMt2, mlSMt1)
     # from ml
     # dense
     mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
     mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
     mllibDM2 = Matrices.fromML(mlDM)
     self.assertEqual(mllibDM1, mllibDM2)
     # transposed
     mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
     mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
     mllibDMt2 = Matrices.fromML(mlDMt)
     self.assertEqual(mllibDMt1, mllibDMt2)
     # sparse
     mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mllibSM2 = Matrices.fromML(mlSM)
     self.assertEqual(mllibSM1, mllibSM2)
     # transposed
     mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mllibSMt2 = Matrices.fromML(mlSMt)
     self.assertEqual(mllibSMt1, mllibSMt2)
Example #3
 def g(block):
     i, j = block[0]
     mat = block[1].toArray()
     n, m = mat.shape
     col0 = colsPerBlock.value * j
     blockCenter = cb.value if np.isscalar(
         cb.value) else cb.value[col0:(col0 + m)]
     blockScale = sb.value if np.isscalar(
         sb.value) else sb.value[col0:(col0 + m)]
     newmat = (mat - blockCenter) / blockScale
     newmat = OldMatrices.dense(n, m, newmat.ravel(order='F'))
     return ((i, j), newmat)
Example #4
def CreateInputs(input_case):
    data_file = '/u/vparames/TESTS/3/test-commute-dist-' + str(
        input_case) + '.mat'
    inp = loadmat(data_file)
    adj_mat = inp['G']
    edge_list = inp['elist']
    n = adj_mat.shape[0]
    sm = Matrices.dense(n, n, adj_mat.transpose().flatten())
    adjacency_mat = BlockMatrix(sc.parallelize([((0, 0), sm)]),
                                SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE)

    return adjacency_mat, edge_list
Example #5
    def test_matrix_independence(self):
        data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
        chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))

        # Results validated against R command
        # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
        self.assertAlmostEqual(chi.statistic, 21.9958, 4)
        self.assertEqual(chi.degreesOfFreedom, 6)
        self.assertAlmostEqual(chi.pValue, 0.001213, 4)

        # Negative counts
        neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, neg_counts)

        # Row sum = 0.0
        row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, row_zero)

        # Column sum = 0.0
        col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, col_zero)
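An illustrative check (not part of the test above): printing the dense matrix shows that the column-major data corresponds exactly to the rows passed to rbind in the R command cited in the comment.

from pyspark.mllib.linalg import Matrices

data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
print(Matrices.dense(3, 4, data).toArray())
# [[40. 56. 31. 30.]
#  [24. 32. 10. 15.]
#  [29. 42.  0. 12.]]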
Example #7
def main():
	if len(sys.argv) < 2:
		print('USAGE: matrix_mult.py <dim of matrix>')
		return 
	
	
	n = int(sys.argv[1])
	dm2 = Matrices.dense(n, n, np.random.randint(1, n * n, n * n).tolist())
	blocks1 = sc.parallelize([((0,0), dm2)])
	m2 = BlockMatrix(blocks1, n,n)
	m3 = BlockMatrix(blocks1, n,n)
	ret = m3.multiply(m2).toIndexedRowMatrix().toRowMatrix().rows.collect()
	print('****************n:', n)
Example #8
def createAdjMatToy(graphNodes, year, sparseG, blockSize, sc):
    index = (year > 12) + 1 # 12 maps to 1, 16 maps to 2
    path = 'BASE_PATH/toy_example/'
    A = loadmat(path + 'toy_A' + str(index) + '.mat')['G']
    n = A.shape[0]
    p = n // 2  # integer division so p can be used as a slice index
    subMatrices = [A[:p,:p], A[:p,p:], A[p:,:p], A[p:,p:]]
    ids, blocks = [(0,0), (0,1), (1,0), (1,1)], []
    for i, id in enumerate(ids):
        adj = subMatrices[i]
        G = Matrices.dense(p,p, adj.transpose().flatten())
        blocks.append((id, G))
    blocksRdd = sc.parallelize(blocks, len(ids))
    return blocksRdd    
Example #9
def _colVectorToBlockMatrix(vec, rowsPerBlock, numSlices=None):
    sc = SparkContext.getOrCreate()
    remainder = len(vec) % rowsPerBlock
    if rowsPerBlock >= len(vec):
        splits = [vec]
    elif remainder == 0:
        splits = np.split(vec, len(vec) // rowsPerBlock)
    else:
        head = vec[:-remainder]
        splits = np.split(head, len(head) // rowsPerBlock)
        splits.append(vec[-remainder:])
    blocks = sc.parallelize([((i, 0), OldMatrices.dense(len(split), 1, split))
                             for i, split in zip(range(len(splits)), splits)],
                            numSlices=numSlices)
    return BlockMatrix(blocks, rowsPerBlock, 1, len(vec), 1)
Example #10
 def radialBasisBlock(pairData):
     I, J = int(pairData[0][0]), int(pairData[1][0])
     dataI, dataJ = pairData[0][1], pairData[1][1]
     n = len(dataI)
     allCombinations = itertools.product(dataI, dataJ)
     allCombsEdges = [radialBasisKernel(p[0], p[1]) for p in allCombinations]
     print 'allCombsEdges ', len(allCombsEdges), (n*n)
     if len(allCombsEdges) == (n*n):
         adj = np.reshape(allCombsEdges, (n,n))
     else:
         adj = np.zeros((n,n))
     if I==J:
         adj[range(n), range(n)] = 0
     G = Matrices.dense(n,n, adj.transpose().flatten())
     return ((I,J), G)
Example #11
    def test_measures(self, targetDimension, testMeasure):
        chisquare_result = ChiSquareResult()
        df = self._data_frame.withColumn(
            testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
        measureSummaryDict = dict(df.describe([testMeasure]).toPandas().values)
        if float(measureSummaryDict["count"]) > 10:
            maxval = float(measureSummaryDict["max"])
            minval = float(measureSummaryDict["min"])
            step = (maxval - minval) / 5.0
            splits = [
                math.floor(minval), minval + step, minval + (step * 2),
                minval + (step * 3), minval + (step * 4),
                math.ceil(maxval)
            ]
            bucketizer = Bucketizer(splits=splits,
                                    inputCol=testMeasure,
                                    outputCol="bucketedColumn")
            # bucketedData = bucketizer.transform(df)
            bucketedData = bucketizer.transform(df.na.drop(subset=testMeasure))
            pivot_table = bucketedData.stat.crosstab(
                "{}".format(targetDimension), 'bucketedColumn')
        else:
            pivot_table = df.stat.crosstab("{}".format(targetDimension),
                                           testMeasure)

        rdd = list(
            chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
        data_matrix = Matrices.dense(pivot_table.count(),
                                     len(pivot_table.columns) - 1, rdd)
        result = Statistics.chiSqTest(data_matrix)
        chisquare_result.set_params(result)
        freq_table = self._get_contingency_table_of_freq(pivot_table)
        freq_table.update_col2_names(splits)
        freq_table.set_tables()
        chisquare_result.set_table_result(freq_table)
        # Cramers V Calculation
        stat_value = result.statistic
        n = freq_table.get_total()
        t = min(len(freq_table.column_one_values),
                len(freq_table.column_two_values))

        v_value = math.sqrt(float(stat_value) / (n * float(t)))
        chisquare_result.set_v_value(v_value)
        chisquare_result.set_split_values([float(x) for x in splits])
        # chisquare_result.set_buckeddata(bucketedData)
        return chisquare_result
Example #12
def MapperLoadBlocksFromMatFile(filename):
    logging.warn('MapperLoadBlocksFromMatFile started %s ', filename)
    data = loadmat(filename)
    logging.warn('Loaded data')
    name = re.search(r'(\d+_\d+)\.mat$', filename, re.IGNORECASE).group(1)
    G = data[name]
    id = name.split('_')
    n = G.shape[0]
    logging.warn('Before sparse conversion')
    if (not (isinstance(G, sparse.csc_matrix))):
        sub_matrix = Matrices.dense(n, n, G.transpose().flatten())
    else:
        #sub_matrix = Matrices.dense(n,n,np.array(G.todense()).transpose().flatten())
        #SPARSE
        sub_matrix = Matrices.sparse(n, n, G.indptr, G.indices, G.data)
    logging.warn('MapperLoadBlocksFromMatFile Ended')
    return ((id[0], id[1]), sub_matrix)
Example #13
def create_design_mat(tuple):
    twtname = tuple[0]
    word_tfs_str = tuple[1]
    i = 0
    twt_word_ind_dict = {}

    for spl in word_tfs_str.split(inter_tweet_delim):
        word = spl.split(in_tweet_delim)[0]
        tfidf = float(spl.split(in_tweet_delim)[1].strip())
        ind = word_index_dict_bcast.value[word]
        twt_word_ind_dict[ind] = tfidf
    design_row = []
    while i < total_tweet_words_bcast.value:
        design_row.append(twt_word_ind_dict.get(i, float(0)))
        i = i + 1
    sparse_des_row = Matrices.dense(1, total_tweet_words_bcast.value,design_row).toSparse()
    # return the float design matrix rw for tweetname
    return (twtname, sparse_des_row)
Example #14
 def test_dimension(self, targetDimension, testDimension):
     if not targetDimension in self._dataframe_helper.get_string_columns():
         raise BIException.non_string_column(testDimension)
     chisquare_result = ChiSquareResult()
     if self._pandas_flag:
         pivot_table = pd.crosstab([self._data_frame[targetDimension]],
                                   self._data_frame[testDimension])
         try:
             data_matrix = np.array(
                 pivot_table.as_matrix(columns=None)).astype(np.int)
         except:
             data_matrix = np.array(pivot_table.values).astype(np.int)
     else:
         pivot_table = self._data_frame.stat.crosstab(
             "{}".format(targetDimension), testDimension)
         # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect()
         rdd = list(
             chain(*list(
                 zip(*pivot_table.drop(pivot_table.columns[0]).collect()))))
         data_matrix = Matrices.dense(pivot_table.count(),
                                      len(pivot_table.columns) - 1, rdd)
         data_matrix = data_matrix.toArray().tolist()
     result = chi2_contingency(data_matrix)
     chisquare_result.set_params(result)
     freq_table = self._get_contingency_table_of_freq(pivot_table,
                                                      need_sorting=True)
     freq_table.set_tables()
     chisquare_result.set_table_result(freq_table)
     # Cramers V Calculation
     stat_value = result[0]
     n = freq_table.get_total()
     t = min(len(freq_table.column_one_values),
             len(freq_table.column_two_values))
     v_value = math.sqrt(float(stat_value) / (n * float(t)))
     chisquare_result.set_v_value(v_value)
     self._dataframe_helper.add_chisquare_significant_dimension(
         testDimension, v_value)
     return chisquare_result
Example #15
def constructElectionBlock(pairDonations):
    I = int(pairDonations[0][0])
    J = int(pairDonations[1][0])
    donationsI = pairDonations[0][1]
    donationsJ = pairDonations[1][1]

    n = donationsI.shape[0]
    allCombinations = itertools.product(donationsI, donationsJ)
    allCombsEdges = [edgeDefinitionElection(p[0], p[1]) for p in allCombinations]
    if len(allCombsEdges) == (n*n):
        adj = np.reshape(allCombsEdges, (n,n))
    else:
        adj = np.zeros((n,n))
    if I==J:
        adj[range(n), range(n)] = 0

    if GENERATE_SPARSE:
        G = sparse.csc_matrix(adj)
        subMatrixSparse = Matrices.sparse(n, n, G.indptr, G.indices, G.data)
        return ((I,J), subMatrixSparse)
    else:
        G = Matrices.dense(n,n, adj.transpose().flatten())
        return ((I,J), G)
Example #16
from pyspark.mllib.linalg import Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint

vector = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]

spark_vector = Vectors.dense(vector)

label = 45.0
labeled_point = LabeledPoint(label, vector)
spark_matrix = Matrices.dense(3, 2, vector)
if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingExample")

    # $example on$
    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)  # a vector composed of the frequencies of events

    # compute the goodness of fit. If a second vector to test against
    # is not supplied as a parameter, the test runs against a uniform distribution.
    goodnessOfFitTestResult = Statistics.chiSqTest(vec)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % goodnessOfFitTestResult)

    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix

    # conduct Pearson's independence test on the input contingency matrix
    independenceTestResult = Statistics.chiSqTest(mat)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % independenceTestResult)

    obs = sc.parallelize(
        [LabeledPoint(1.0, [1.0, 0.0, 3.0]),
         LabeledPoint(1.0, [1.0, 2.0, 0.0]),
         LabeledPoint(1.0, [-1.0, 0.0, -0.5])]
    )  # LabeledPoint(label, feature)

    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
Example #18
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Labelled point with a positive label and a dense feature vector.
lp_pos = LabeledPoint(1.0, [5.0, 0.0, 1.0, 7.0])

# Labelled point with a negative label and a sparse feature vector.
lp_neg = LabeledPoint(0.0, SparseVector(4, [0, 2, 3], [5.0, 1.0, 7.0]))

#
# Local Matrix
#
from pyspark.mllib.linalg import Matrix, Matrices

# Dense matrix ((1.0, 2.0, 3.0), (4.0, 5.0, 6.0))
dMatrix = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])

# Sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
sMatrix = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])

#
# Code Plan
#
#
# 1- Combine all tweets files into a single data frame
# 2- Parse the Tweets - remove stopwords - extract emoticons - extract url - normalize your words (e.g., mapping them to lowercase and removing punctuation and numbers)
# 3- Feature extraction
# 		3a- Tokenisation
# 		3b- TF-IDF
# 		3c- Hash TF-IDF
# 4- Run K-Means clustering
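A minimal sketch of steps 3-4 of the plan above, assuming sc is an active SparkContext and tweets is an RDD of already-cleaned tweet strings (both names are assumptions, not defined in the original snippet):

from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.clustering import KMeans

# 3a: tokenisation
tokens = tweets.map(lambda t: t.lower().split())

# 3b/3c: hashed term frequencies, then TF-IDF weighting
tf = HashingTF(numFeatures=1 << 18).transform(tokens)
tf.cache()
tfidf = IDF().fit(tf).transform(tf)

# 4: K-Means clustering on the TF-IDF vectors
model = KMeans.train(tfidf, k=8, maxIterations=20)
print(model.clusterCenters[0][:5])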
Example #19
def findFeatures(inputFileName, outputFileName):

    inpFile = sc.textFile(inputFileName)

    numRows = inpFile.count()

    print('\nRead ', numRows, ' rows from ', inputFileName, '\n')

    print('Print out a few rows read from file')

    print('\n', inpFile.take(5), '\n')

    # Rectangularize the RDD before vectorizing

    # Filter elements to remove quotes to prevent (quote) embedded commas

    countFields = inpFile.map(lambda s: removeEmbeddedCommas(s)).map(
        lambda s: len(s.split(','))).collect()

    print('number of fields in each row (first few): ', countFields[0:4])

    RectangularizationNeeded = False
    maxCount = 0
    maxCountAt = 0

    for i in range(len(countFields)):
        if (countFields[i] > maxCount):
            maxCount = countFields[i]
            maxCountAt = i
        if (i > 0) and (RectangularizationNeeded == False):
            if (countFields[i] != countFields[i - 1]):
                RectangularizationNeeded = True

    if (RectangularizationNeeded == True):
        print('Identified jagged data set; Rectangularization needed')
    else:
        print('Identified rectangular data set')

    print('Inferring longest row(s) has ', maxCount, ' fields at row ',
          maxCountAt)

    inpFileRe = inpFile.map(lambda s: removeEmbeddedCommas(s)).map(
        lambda s: s + ',No Data')
    # remove short rows
    shortFile = inpFileRe.filter(
        lambda row: len(row.split(',')) < maxCount + 1)
    print("Short rows will be filtered out")
    print('\n', shortFile.take(10), '\n')
    # truncate to maxCount+1 columns
    inpFileTr = inpFileRe.filter(
        lambda row: len(row.split(',')) == maxCount + 1)
    print('\n', inpFileTr.take(5), '\n')

    header = inpFileTr.first()
    hL = header.split(',')

    inpFileNh = inpFileTr.filter(lambda row: row != header)

    print('Removed the First row as Header')
    numRows = inpFileNh.count()
    print('number of rows = ', numRows)

    from pyspark.mllib.linalg import Matrix, Matrices
    from pyspark.mllib.linalg import Vector, Vectors

    # parsedData will be org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector]
    parsedData = inpFileNh.map(
        lambda s: Vectors.dense([with0Str(t) for t in s.split(',')]))
    print('\nprint out a few vectors after converting from strings\n')
    print(parsedData.take(5))

    from pyspark.mllib.stat import MultivariateStatisticalSummary, Statistics

    summary = Statistics.colStats(parsedData)

    print('\nprint out summary statistics, for each column\n')

    print('summary.mean')
    print(summary.mean())
    print('summary.variance')
    print(summary.variance())
    print('summary.count')
    print(summary.count())
    print('summary.max')
    print(summary.max())
    print('summary.min')
    print(summary.min())
    print('summary.normL1')
    print(summary.normL1())
    print('summary.normL2')
    print(summary.normL2())
    print('summary.numNonzeros')
    print(summary.numNonzeros())
    print()

    numCols = len(summary.mean())

    typeStrings = [' '] * numCols

    # infer columns where normL1, normL2, mean, variance, max and min are 0 as non-numeric

    print('Inferring column data types:')

    import math

    for j in range(numCols):
        if ((summary.normL1()[j] == 0.0) and (summary.normL2()[j] == 0.0)
                and (summary.mean()[j] == 0.0)
                and (summary.variance()[j] == 0.0)
                and (summary.max()[j] == 0.0) and (summary.min()[j] == 0.0)):
            typeStrings[j] = 'String'
        else:
            if ((math.trunc(summary.normL1()[j]) == summary.normL1()[j])
                    and (math.trunc(summary.max()[j]) == summary.max()[j])
                    and (math.trunc(summary.min()[j]) == summary.min()[j])):
                typeStrings[j] = 'Int'
            else:
                typeStrings[j] = 'Float'

        print(typeStrings[j], end=',')

    print('\n\n')

    #******************************************************************************
    # take out the 'String' columns before calling Statistics.corr()

    numNumericCols = 0
    for j in range(numCols):
        if (typeStrings[j] != 'String'):
            numNumericCols = numNumericCols + 1

    noStrings = inpFileNh.map(
        lambda s: Vectors.dense(removeStrings(s, numNumericCols)))
    print(noStrings.take(5))

    correlMatrix = Statistics.corr(noStrings, method='pearson')

    print('Computing Correlation Matrix on all columns')
    print(
        'Printing out column names that have correlation coefficient > 0.5 or < -0.5'
    )

    for i in range(numNumericCols):
        for j in range(i):
            if (((correlMatrix[i][j] >= 0.5) or (correlMatrix[i][j] <= -0.5))
                    and (i != j)):
                print(hL[i], hL[j], correlMatrix[i][j])

#******************************************************************************
#******************************************************************************

# create a contingency matrix

    LoLoF = [[0.0 for x in range(numNumericCols)] for y in range(numRows)]

    LoLoF = noStrings.collect()

    pdLinArr = [0.0 for x in range(numNumericCols * numRows)]

    for i in range(numRows):
        for j in range(numNumericCols):
            pdLinArr[i * numNumericCols + j] = abs(LoLoF[i][j])

    mat = Matrices.dense(numRows, numNumericCols, pdLinArr)

    # conduct Pearson's independence test on the input contingency matrix
    print(
        "Computing Pearson's independence test on the input contingency matrix using chi-square test"
    )

    independenceTestResult = Statistics.chiSqTest(mat)

    # summary of the test including the p-value, degrees of freedom
    print('%s\n' % independenceTestResult)

    #*******************************************************************************

    stdDev = [0.0] * numCols

    for j in range(numCols):
        stdDev[j] = math.sqrt(summary.variance()[j])

#*******************************************************************************
#   test for normal distribution using Kolmogorov-Smirnov test
#
    colVec = [0.0] * numRows

    #vecRDD = sc.parallelize(colVec)
    #testResult = Statistics.kolmogorovSmirnovTest(vecRDD, 'norm', 0, 1)
    #print(testResult)

    numericMean = [0.0] * numNumericCols
    numericSD = [0.0] * numNumericCols

    k = 0
    for j in range(numCols):
        if ((summary.mean()[j] != 0.0) and (summary.variance()[j] != 0.0)):
            numericMean[k] = summary.mean()[j]
            numericSD[k] = stdDev[j]
            k = k + 1

    print(
        'Checking if column data is normally distributed using Kolmogorov-Smirnov test'
    )

    for j in range(numNumericCols):
        for i in range(numRows):
            # see https://issues.apache.org/jira/browse/SPARK-20802
            # test fails if data is normally distributed
            # kolmogorovSmirnovTest in pyspark.mllib.stat.Statistics throws net.razorvine.pickle.PickleException
            # when input data is normally distributed (no error when data is not normally distributed)
            colVec[i] = float(i)  # LoLoF[i][j]
        vecRDD = sc.parallelize(colVec)
        print(colVec[0], colVec[numRows - 1], numericMean[j], numericSD[j])
        testResult = Statistics.kolmogorovSmirnovTest(vecRDD, 'norm',
                                                      numericMean[j],
                                                      numericSD[j])
        print(testResult)

#*******************************************************************************
#*******************************************************************************
#
#   estimate kernel densities
#
    from pyspark.mllib.stat import KernelDensity

    # colVec = [0.0]*numRows
    # vecRDD = sc.parallelize(colVec)

    print('Computing kernel densities on all columns using a Bandwidth of 3.0')

    kd = KernelDensity()
    kd.setSample(vecRDD)
    kd.setBandwidth(3.0)

    sAS = int(math.sqrt(numRows))  # sample array size
    samplePoints = [0.0] * sAS
    #samplePoints = [0.0]*numRows

    for i in range(sAS):
        samplePoints[i] = float(i * sAS)
    #for i in range(numRows):
    #   samplePoints[i] = float(i)

    densities = kd.estimate(samplePoints)

    print('Estimating kernel densities')

    print('Print kernel densities at sample points')
    #print('Print kernel densities > 0.01 at sample points')
    for j in range(numNumericCols):
        # print( hL[j])
        for i in range(numRows):
            # see https://issues.apache.org/jira/browse/SPARK-20803
            # KernelDensity.estimate in pyspark.mllib.stat.KernelDensity throws
            # net.razorvine.pickle.PickleException when input data is normally
            # distributed (no error when data is not normally distributed)
            colVec[i] = float(i)  # LoLoF[i][j]
        vecRDD = sc.parallelize(colVec)
        kd = KernelDensity()
        kd.setSample(vecRDD)
        kd.setBandwidth(3.0)
        # Find density estimates for the given values
        densities = kd.estimate(samplePoints)
        for i in range(sAS):
            print(densities[i], end=',')
        print()
        #for i in range(numRows):
        #   if (densities[i] >= 0.01):
        #       print(i, densities[i], end=',')
        print()

#*******************************************************************************

#*******************************************************************************
#
#  compute Skewness and Kurtosis for each numeric column
#
    skew = [0.0] * numNumericCols
    kurt = [0.0] * numNumericCols
    term = 0.0

    k = 0
    for j in range(numCols):
        if (typeStrings[j] != 'String'):
            skew[k] = 0.0
            kurt[k] = 0.0
            # extra work: find Ints
            typeStrings[j] = 'Int'
            meanj = summary.mean()[j]
            for i in range(numRows):
                if ((typeStrings[j] == 'Int')
                        and (math.trunc(LoLoF[i][k]) != LoLoF[i][k])):
                    typeStrings[j] = 'Float'
                term = (LoLoF[i][k] - meanj) / stdDev[j]
                skew[k] = skew[k] + (term * term * term)
                kurt[k] = kurt[k] + (term * term * term * term)
            skew[k] = skew[k] / numRows
            kurt[k] = (kurt[k] / numRows) - 3.0
            k = k + 1

    print('Skewness of columns')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            print(skew[k], end=',')
            k = k + 1
    print()

    print('Kurtosis of columns')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            print(kurt[k], end=',')
            k = k + 1
    print()

    print('Inferring column data types (Text string, Int, Float)')

    # numbers that are Int and non-negative and  "large" are likely to be numeric labels -- keep checking this heuristic
    # columns that are outside Kurtosis limits <-1.2, 3.0> may be numeric labels

    print('Attempting to infer if an Int column is a numeric label')
    print("If all Ints in a column are >= 0 and 'large', it may be numLabel")
    print(
        'If all Ints in a column are >= 0 and excess kurtosis is outside [-1.2, 3.0], it may be numLabel'
    )

    for j in range(numCols):
        if ((typeStrings[j] == 'Int') and (summary.min()[j] >= 0)
                and ((summary.max()[j] > 10000) or (kurt[j] < -1.2) or
                     (kurt[j] > 3.0))):
            print('column ' + str(j) + ' (' + hL[j] + ')' +
                  ' may be a numeric label')
            typeStrings[j] = 'NumLabel'


#******************************************************************************
#******************************************************************************
#
#   Normalize the dataset by shifting by mean and scaling by stdDev
#
    normData = [[0.0 for x in range(numNumericCols)] for y in range(numRows)]
    rowMaxs = [0.0] * numRows
    rowMins = [0.0] * numRows
    rowNormL1s = [0.0] * numRows
    rowNormL2s = [0.0] * numRows
    rowNumZeros = [0] * numRows
    means = [0.0] * numCols

    for j in range(numCols):
        means[j] = summary.mean()[j]

    for i in range(numRows):
        rowMaxs[i] = -999999.0
        rowMins[i] = 999999.0
        rowNumZeros[i] = 0
        rowNormL1s[i] = 0.0
        rowNormL2s[i] = 0.0

        k = 0
        for j in range(numCols):
            if ((typeStrings[j] == 'Int') or (typeStrings[j] == 'Float')):
                normData[i][k] = (LoLoF[i][k] - means[j]) / stdDev[j]
                if (normData[i][k] > rowMaxs[i]):
                    rowMaxs[i] = normData[i][k]
                if (normData[i][k] < rowMins[i]):
                    rowMins[i] = normData[i][k]
                if (normData[i][k] == 0.0):
                    rowNumZeros[i] = rowNumZeros[i] + 1
                if (abs(normData[i][k]) < 100.0):
                    rowNormL1s[i] = rowNormL1s[i] + abs(normData[i][k])
                    rowNormL2s[
                        i] = rowNormL2s[i] + normData[i][k] * normData[i][k]
            # print(i,j,k, LoLoF[i][k], means[j], stdDev[j], normData[i][k], rowNormL1s[i], rowNormL2s[i])
                k = k + 1

    input = open(inputFileName, 'r')
    fileHandle = open('/home/bsrsharma/work/python/rowNormL1L2.csv', 'w')

    # Keep upto 6 columns of identifying info
    if (numCols > 1):
        for j in range(min(5, numCols)):
            fileHandle.write(hL[j])
            fileHandle.write(',')
    fileHandle.write('L1-Norm')
    fileHandle.write(",")
    fileHandle.write('L2-Norm\n')

    s = input.readline()  # don't repeat header

    for i in range(numRows):
        # copy input to output
        s = input.readline()
        LoS = s.split(',')
        for j in range(min(5, numCols)):
            fileHandle.write(LoS[j])
            fileHandle.write(',')
        fileHandle.write('%s' % rowNormL1s[i])
        fileHandle.write(',')
        fileHandle.write('%s' % math.sqrt(rowNormL2s[i]))
        fileHandle.write('\n')

    fileHandle.close()
    input.close()

    print('Wrote ', 'rowNormL1L2.csv')

    input = open(inputFileName, 'r')
    fileHandle = open(outputFileName, 'w')

    # output normalized data
    numCols = numCols - 1
    # write header row
    if (numCols > 1):
        for j in range(numCols - 1):
            fileHandle.write(hL[j])
            fileHandle.write(',')
    fileHandle.write(hL[numCols - 1])
    fileHandle.write('\n')

    s = input.readline()  # don't repeat header

    for i in range(numRows):
        # copy input to output
        s = input.readline()
        LoS = s.split(',')
        k = 0
        for j in range(numCols - 1):
            if (typeStrings[j] == 'String'):
                fileHandle.write(LoS[j])
            else:
                fileHandle.write('%s' % normData[i][k])
                k = k + 1
            fileHandle.write(',')
        if (typeStrings[numCols - 1] == 'String'):
            fileHandle.write(LoS[numCols - 1])
        else:
            fileHandle.write('%s' % normData[i][k])
        fileHandle.write('\n')

    fileHandle.close()
    input.close()

    print('Wrote ', outputFileName, '\n')

    #******************************************************************************

    # compute median for each column

    medians = [0.0] * numNumericCols
    aCol = [0.0] * numRows

    for j in range(numNumericCols):
        for i in range(numRows):
            aCol[i] = LoLoF[i][j]
        aCol.sort()

        medians[j] = aCol[numRows // 2]

    print('medians:')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            print(medians[k], end=',')
            k = k + 1
    print('\n\n')

    # compute histograms for each column

    numBins = int(math.sqrt(numRows))
    histogram = [0] * (numBins + 1)
    binWidth = 0
    mins = [0.0] * numCols
    maxs = [0.0] * numCols

    print('Computing histograms for numeric columns')
    print('choosing ', numBins, ' bins')

    k = 0

    for j in range(numCols):
        mins[j] = summary.min()[j]
        maxs[j] = summary.max()[j]
        if (typeStrings[j] == 'String'):
            print('column ', j, '( ', hL[j], ' ): Text')
        else:
            binWidth = (maxs[j] - mins[j]) / numBins
            for i in range(numBins):
                histogram[i] = 0
            for i in range(numRows):
                histogram[int((LoLoF[i][k] - mins[j]) / binWidth)] += 1
            print('column ', j, '( ', hL[j], ' ):')
            if (typeStrings[j] == 'NumLabel'):
                print('NumLabel')
            for i in range(numBins):
                print(histogram[i], end=',')
            print()
            k = k + 1
    print('\n\n')

    # compute modes

    modes = [0.0] * numNumericCols
    largestBin = 0
    binIndex = 0

    print('modes:')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            largestBin = 0
            binIndex = 0
            for i in range(numBins):
                # pick the bin with the most items
                if (histogram[i] > largestBin):
                    largestBin = histogram[i]
                    binIndex = i
            modes[k] = mins[j] + (maxs[j] - mins[j]) * binIndex / numBins
            print(modes[k], end=',')
            k = k + 1
    print('\n\n')

    return 0
Example #20
 def eye(n):
     m = np.eye(n, n)
     m = Matrices.dense(n, n, m.flatten().tolist())
     return m
Example #21
    from pyspark.sql import Row
    from pyspark.sql import SQLContext

    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

sc = SparkContext('local')
sqlContext = SQLContext(sc)

#############################################BASIC_DATA_TYPES###########################################################
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))
dm2 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
print('*' * 50, 'BASIC_DATA_TYPES', '*' * 50)
print(pos)
print(neg)
print(dm2)
print(sm)
##############################################MODELS_TRAIN##############################################################


def accuracy(data):
    y = [el[1] for el in data.collect()]
    y_pred = [el[0] for el in data.collect()]
    print('Accuracy:', accuracy_score(y, y_pred=y_pred))

Example #22
print(summary.variance())
print(summary.numNonzeros())
print(summary.max())
print(summary.min())
print(summary.count())
print(summary.normL1())
print(summary.normL2())

#correlation
x = sc.parallelize(np.random.randn(4, 1))
y = sc.parallelize(np.random.randn(4, 1))
print("Correlation :", str(Statistics.corr(x, y)))

#Chi-square
#For Vector
x = Vectors.dense(np.random.random_sample((5)))
y = Vectors.dense(np.random.random_sample((5)))
chisqr = Statistics.chiSqTest(x, y)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)

# For Matrices
x = Matrices.dense(4, 2, np.random.random_sample((8)))
y = Matrices.dense(4, 2, np.random.random_sample((8)))
chisqr = Statistics.chiSqTest(x, y)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)
Example #23
def test_from_matrix():
    mat = Matrices.dense(1, 2, [13, 37])
    x = from_matrix(mat)
    assert x.shape == (1, 2)
Example #24
File: data_type.py  Project: ljldgup/ml
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Create a labeled point with a positive label and a dense feature vector.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])

# Create a row with a label and sparse features
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))

from pyspark.mllib.linalg import Matrix, Matrices

# Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
dm2 = Matrices.dense(3, 2, [1, 3, 5, 2, 4, 6])

# Sparse matrix: dimensions, the index arrays (whose lengths correspond to the dimensions), and the values
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
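A small verification of the two comments above (an addition, not part of the original file): printing the arrays shows the column-major dense layout and the CSC interpretation of the sparse arguments.

# Dense: 3 rows x 2 cols, values [1, 3, 5, 2, 4, 6] filled column by column
print(dm2.toArray())
# [[1. 2.]
#  [3. 4.]
#  [5. 6.]]

# Sparse: colPtrs=[0, 1, 3], rowIndices=[0, 2, 1], values=[9, 6, 8]
# column 0 holds 9 at row 0; column 1 holds 8 at row 1 and 6 at row 2
print(sm.toArray())
# [[9. 0.]
#  [0. 8.]
#  [0. 6.]]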
Example #25
#matrix = Matrices.dense(nrows, ncols, rdd)
print("ncol: %d, nrow %d" % (ncols, nrows))
coord_mat = CoordinateMatrix(rdd.map(tuple))
print("num rows in matrix %d" % coord_mat.numRows())

print("finished using pyspark")
#________________________________________________-

print("now use SparkSession")

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df_2 = spark.read.option("delimiter", " ").csv('./data/lpi_ceria3d_b.mtx',
                                               header=False,
                                               inferSchema=True)
df_2.printSchema()

#coord_mat_2 = CoordinateMatrix(df_2.rdd.map(tuple))
row_mat = RowMatrix(df_2.rdd.map(tuple))
print("num rows in row matrix %d, num_cols %d" %
      (row_mat.numRows(), row_mat.numCols()))

print("print covariance")
print(row_mat.computeCovariance())

dm = Matrices.dense(3, 1, [4, 5, 6])

print("multiply row Matrix")
result = row_mat.multiply(dm)
Example #26
 def denseBlock(x):
     adj = x[1].toarray()
     G_dm = Matrices.dense(block_size, block_size, adj.transpose().flatten())
     return (x[0], G_dm)
Example #27
def revised_simplex(h, f, b, m, n, basis, nonbasis):
    k = True
    B = f[:, basis]
    B = linalg.inv(B)
    Pxbb = B.dot(b).flatten()


    counter = 0
    while k:


        counter = counter + 1
        print counter

        cD = h[nonbasis]
        cB = h[basis]
        # print f
        # print basis

        B = f[:, basis]
        #print 'Basis of A with transpose'
        #print B.transpose()
        B = linalg.inv(B)
        #Pxbb = B.dot(b).flatten()
        #print 'inverse of B'
        #print B
        D = f[:, nonbasis]
        bs = Matrices.dense(m, 1, b.flatten().tolist())
        blocks0 = sc.parallelize([((0, 0), bs)])
        mat0 = BlockMatrix(blocks0, m, 1)

        dm1 = Matrices.dense(m, m, f[:, basis].flatten().tolist())  # A matrix basis indices chosen
        blocks1 = sc.parallelize([((0, 0), dm1)])
        mat1 = BlockMatrix(blocks1, m, m)
        mat1 = mat1.transpose()
        mat1.toLocalMatrix()
        # print mat1.toLocalMatrix()

        mat2 = IndexedRowMatrix(sc.parallelize(enumerate(f[:, nonbasis]))).toBlockMatrix(rowsPerBlock=m, colsPerBlock=n)
        # print (mat2.toLocalMatrix())

        G = mat1.toLocalMatrix()  # G is basis stored
        K = mat2.toLocalMatrix()

        # print (G)  # It will display Basis Matrix
        # print (K)

        dm2 = Matrices.dense(m, m, B.flatten().tolist())  # Inverse stored in dm2
        blocks2 = sc.parallelize([((0, 0), dm2)])  # Inverse B converted to blocks
        mat3 = BlockMatrix(blocks2, m, m)

        mat3 = mat3.transpose()
        L = mat3.toLocalMatrix()
        dm3 = Matrices.dense(1, m, h[basis].tolist())  # Cost vector C, basis stored in dm3
        blocks4 = sc.parallelize([((0, 0), dm3)])  # 'c' basis is stored in blocks4
        mat4 = BlockMatrix(blocks4, 1, m)  # 'c' stored as BlockMatrix
        S = mat4.toLocalMatrix()
        # print (S)

        dm4 = Matrices.dense(1, n, h[nonbasis].tolist())  # Cost vector C, non-basis stored in dm4
        blocks5 = sc.parallelize([((0, 0), dm4)])  # 'c' non-basis is stored in blocks5
        mat6 = BlockMatrix(blocks5, 1, n)  # 'c' stored as BlockMatrix
        R = mat6.toLocalMatrix()

        # print (R)

        La = mat4.multiply(mat3).toLocalMatrix()  # c is basis matrix, multiply by matrix B inverse. In main program it is "l = cB.dot(B)"
        # print (La)

        blocks6 = sc.parallelize([((0, 0),
                                   La)])  # this step is done to store La in mat variable so that it would be easy to use it for further multiplication
        mat7 = BlockMatrix(blocks6, 1,
                           m)  # from main program "l = cB.dot(B)" is stored in "mat 7" for future multiplication
        Sa = mat7.toLocalMatrix()
        # print (Sa)

        ga = mat7.multiply(
            mat2).toLocalMatrix()  # multiply "l = cB.dot(B)" by 'D' where 'D' is Matrix A's non basis. Here 'mat3'

        # print (ga)

        blocks7 = sc.parallelize([((0, 0), ga)])  # this step is done to store 'ga' in mat8
        mat8 = BlockMatrix(blocks7, 1, n)

        Cd = mat6.subtract(mat8).toLocalMatrix()

        #print 'mat6='
        #print mat6.toLocalMatrix()

        #print 'mat7'
        #print mat7.toLocalMatrix()

        #print 'mat2'
        #print mat2.transpose().toLocalMatrix()

        #print 'mat4'
        #print mat4.toLocalMatrix()

        #print 'mat3'
        #print mat3.toLocalMatrix()

        #print 'mat8='
        #print mat8.toLocalMatrix()

        ma = Cd.toArray()
        # maa = np.around(ma, decimals= 10)
        print 'ma ='
        print ma

        # print "printing Cd"
        minrD = np.argmin(ma)
        #print 'minimum index of maa is'

        print (minrD)

        do = minrD  # We get value 0

        Dxx = D[:, do]

        Dx = Matrices.dense(m, 1,
                            Dxx.tolist())  # the index of minimum of minrD is used to call matrix D's elements which we will parallelize
        blocks8 = sc.parallelize([((0, 0), Dx)])  # store Dx it in blocks8
        mat9 = BlockMatrix(blocks8, m, 1)  # Convert to blockmatrix and store in mat9
        Aa = mat9.toLocalMatrix()

        Pa = mat3.multiply(
            mat9).toLocalMatrix()  # Inverse of B multiply by Dx( where Dx = D[:, n] where D = A[:, nonbasis]
        Pxb = mat3.multiply(mat0).toLocalMatrix()

        #print (Pa)
        #print (Pxb)
        Paa = B.dot(Dxx)
        # Pxbb = B.dot(b)

        # Paaa = np.around(Paa, decimals= 16)
        # Pxbbb = np.around(Pxbb, decimals=16)


        print 'This is Paa'
        print Paa


        # abc = np.divide(Pxbb, Paa)
        # print (abc)
        # with np.errstate(divide='ignore'):

        abc = inf * np.ones(len(Pxbb))
        abcd = inf * np.ones(len(Pxbb))
        # print 'len(Paa) is'
        # print len(Paa) - 1
        for idx in range(0, len(Paa)):
            # print idx

           if Paa[idx] > 1e-12:
             abc[idx] = Pxbb[idx] / Paa[idx]

             print 'this is Pxbb before update '
             print Pxbb

        Qa = np.argmin(abc)
        #Qa = np.argmin(abc[np.nonzero(abc)])
        Pxbb = Pxbb - np.multiply(np.amin(abc), Paa).transpose()
        print np.multiply(np.amin(abc), Paa)


        Pxbb[Qa] = np.amin(abc)

        #for idx in range(0, len(Paa)):
            #if Paa[idx] > 0:
                #abcd[idx] = Pxbb[idx] / Paa[idx]

        print 'this is Paa after update'
        print Paa


        print 'this is Pxbb after updating'
        print Pxbb

        print 'abc with updated Pxbb'
        print abc



        #Qc = np.argmin(abcd[np.nonzero(abcd)])

        #print 'do = The leaving variable index'
        #print do

        #print 'np.argmin(abc) is the entering variable index'
        #print Qa

        #print 'printing nonbasis do'
        #print nonbasis[do]

        object = h[basis]

        print 'printing Qa='
        print Qa

        final = basis
        k = np.any(ma < -0.00000000001)
        if k == False:
            break

        temp = basis[Qa]
        basis[Qa] = nonbasis[do]
        nonbasis[do] = temp

        #print 'Cd ='
        #print (Cd)

        print 'nonbasis ='
        print nonbasis

        print 'basis ='
        print basis

        # print shape(basis)

        #ma = Cd.toArray()

        #print 'ma ='
        #print ma

        # print k
        # print 'Pxbb ='
        #print type(Pxbb)

    zzz = np.inner(h[basis], Pxbb)

    solution = [zzz, basis, Pxbb]
    return solution
Example #28

if __name__ == "__main__":

    global rows
    global mat
    rows = 3

    sc = SparkContext("local", "Determinant")

    # accumulator variable to accumulate final determinant value
    det = sc.accumulator(0)

    # dense matrix returns a matrix in column-major format, hence
    # the values are entered in column-major order so that we
    # finally have a row-major matrix to operate on
    dm2 = Matrices.dense(rows, rows, [2, 7, 3, 3, 7, 8, 5, 8, 5])

    print "\n\nEntered matrix:\n", dm2.toArray()

    #here we are trying to divide work between workers. we divide first row
    # between them (calculate partial determinant for each item in first row)
    cols = sc.parallelize([i for i in range(0, rows)])

    mat = dm2.toArray()

    cols.foreach(dist_deter)

    # print "determinant", determinant(3,dm2) #to check correctness
    print "\n The determinant is:", det.value
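A short illustration of the column-major point made in the comment above (illustrative only, not from the original script): the same nine values give a column-major matrix whose transpose equals numpy's default row-major reshape.

import numpy as np
from pyspark.mllib.linalg import Matrices

vals = [2, 7, 3, 3, 7, 8, 5, 8, 5]

col_major = Matrices.dense(3, 3, vals).toArray()   # filled column by column
row_major = np.array(vals).reshape(3, 3)           # filled row by row

print(np.array_equal(col_major.T, row_major))      # True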
Example #29
fractions = {1: 0.1, 2: 0.6, 3: 0.3}
approxSample = data.sampleByKey(False, fractions)

## hypothesis testing
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
# compute goodness of fit. either compare two vectors to each other or compare one vector to a uniform distribution
goodnessOfFitTestResults = Statistics.chiSqTest(vec)
print(goodnessOfFitTestResults)

# pearson's independence test on a matrix
mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])
independenceTestResults = Statistics.chiSqTest(mat)
print(independenceTestResults)

# a contingency table can be constructed from an RDD of LabeledPoint/vector pairs. The resulting test returns
# a Chi-squared test results for every feature against the label
obs = sc.parallelize([
    LabeledPoint(1.0, [1.0, 0.0, 3.0]),
    LabeledPoint(1.0, [1.0, 2.0, 0.0]),
    LabeledPoint(1.0, [-1.0, 0.0, -0.5])
])
featureTestResults = Statistics.chiSqTest(obs)

for i, result in enumerate(featureTestResults):
    print('column {0}: \n {1}'.format(i, result))
Example #30
def npToDenseMat(ndArr):
    m, n = ndArr.shape
    return Matrices.dense(m, n, ndArr.transpose().flatten())
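A quick round-trip check for the helper above (an illustrative addition, assuming numpy is imported as np): transpose().flatten() supplies the values in the column-major order Matrices.dense expects, so toArray() recovers the original array.

import numpy as np

a = np.arange(6, dtype=float).reshape(2, 3)
dm = npToDenseMat(a)
print(np.array_equal(dm.toArray(), a))  # True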
Example #31
    def test_measures(self, targetDimension, testMeasure):
        chisquare_result = ChiSquareResult()
        if self._pandas_flag:
            if self._data_frame[testMeasure].dtypes == 'int64':
                measureSummaryDict = dict(
                    self._data_frame[testMeasure].describe())
                if float(measureSummaryDict["count"]) > 10:
                    maxval = int(measureSummaryDict["max"])
                    minval = int(measureSummaryDict["min"])
                    step = (maxval - minval) / 5.0
                    splits = [
                        round(math.floor(minval)),
                        round(minval + step),
                        round(minval + (step * 2)),
                        round(minval + (step * 3)),
                        round(minval + (step * 4)),
                        round(math.ceil(maxval))
                    ]
                    splits = list(set(splits))
                    splits.sort()
                    self._data_frame['bucketedColumn'] = pd.cut(
                        self._data_frame[testMeasure],
                        bins=splits,
                        labels=list(range(len(splits) - 1)),
                        retbins=True,
                        right=False)[0]
                    self._data_frame = self._data_frame.dropna()
                    pivot_table = pd.crosstab(
                        [self._data_frame[targetDimension]],
                        self._data_frame['bucketedColumn'])
                else:
                    pivot_table = pd.crosstab(
                        [self._data_frame[targetDimension]],
                        self._data_frame[testMeasure])

            else:
                df = self._data_frame
                if df[testMeasure].dtypes == 'float64':
                    measureSummaryDict = dict(df[testMeasure].describe())
                    if float(measureSummaryDict["count"]) > 10:
                        maxval = float(measureSummaryDict["max"])
                        minval = float(measureSummaryDict["min"])
                        step = (maxval - minval) / 5.0
                        splits = [
                            math.floor(minval), minval + step,
                            minval + (step * 2), minval + (step * 3),
                            minval + (step * 4),
                            math.ceil(maxval)
                        ]
                        df['bucketedColumn'] = pd.cut(
                            df[testMeasure],
                            bins=splits,
                            labels=list(range(len(splits) - 1)),
                            retbins=True,
                            right=False)[0]
                        df = df.dropna()
                        pivot_table = pd.crosstab([df[targetDimension]],
                                                  df['bucketedColumn'])
                    else:
                        pivot_table = pd.crosstab([df[targetDimension]],
                                                  df[testMeasure])
        else:
            dtype = self._data_frame.schema[testMeasure].dataType
            if dtype is IntegerType():
                # df = self._data_frame.withColumn(testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
                measureSummaryDict = dict(
                    self._data_frame.describe([testMeasure]).toPandas().values)
                if float(measureSummaryDict["count"]) > 10:
                    maxval = int(measureSummaryDict["max"])
                    minval = int(measureSummaryDict["min"])
                    step = (maxval - minval) / 5.0
                    splits = [
                        round(math.floor(minval)),
                        round(minval + step),
                        round(minval + (step * 2)),
                        round(minval + (step * 3)),
                        round(minval + (step * 4)),
                        round(math.ceil(maxval))
                    ]
                    splits = list(set(splits))
                    splits.sort()
                    bucketizer = Bucketizer(splits=splits,
                                            inputCol=testMeasure,
                                            outputCol="bucketedColumn")
                    # bucketedData = bucketizer.transform(df)
                    bucketedData = bucketizer.transform(
                        self._data_frame.na.drop(subset=testMeasure))
                    pivot_table = bucketedData.stat.crosstab(
                        "{}".format(targetDimension), 'bucketedColumn')
                    keshav = pivot_table.toPandas()
                else:
                    pivot_table = self._data_frame.stat.crosstab(
                        "{}".format(targetDimension), testMeasure)
            else:
                df = self._data_frame.withColumn(
                    testMeasure,
                    self._data_frame[testMeasure].cast(DoubleType()))
                measureSummaryDict = dict(
                    df.describe([testMeasure]).toPandas().values)
                if float(measureSummaryDict["count"]) > 10:
                    maxval = float(measureSummaryDict["max"])
                    minval = float(measureSummaryDict["min"])
                    step = (maxval - minval) / 5.0
                    splits = [
                        math.floor(minval), minval + step, minval + (step * 2),
                        minval + (step * 3), minval + (step * 4),
                        math.ceil(maxval)
                    ]
                    bucketizer = Bucketizer(splits=splits,
                                            inputCol=testMeasure,
                                            outputCol="bucketedColumn")
                    # bucketedData = bucketizer.transform(df)
                    bucketedData = bucketizer.transform(
                        df.na.drop(subset=testMeasure))
                    pivot_table = bucketedData.stat.crosstab(
                        "{}".format(targetDimension), 'bucketedColumn')
                else:
                    pivot_table = df.stat.crosstab(
                        "{}".format(targetDimension), testMeasure)
        if self._pandas_flag:
            try:
                data_matrix = np.array(
                    pivot_table.as_matrix(columns=None)).astype(np.int)
            except:
                data_matrix = np.array(pivot_table.values).astype(np.int)
        else:
            rdd = list(
                chain(*list(
                    zip(*pivot_table.drop(pivot_table.columns[0]).collect()))))
            data_matrix = Matrices.dense(pivot_table.count(),
                                         len(pivot_table.columns) - 1, rdd)
            data_matrix = data_matrix.toArray().tolist()
        result = chi2_contingency(data_matrix)
        chisquare_result.set_params(result)
        freq_table = self._get_contingency_table_of_freq(pivot_table)
        freq_table.update_col2_names(splits)
        freq_table.set_tables()
        chisquare_result.set_table_result(freq_table)
        # Cramers V Calculation
        stat_value = result[0]
        n = freq_table.get_total()
        t = min(len(freq_table.column_one_values),
                len(freq_table.column_two_values))

        v_value = math.sqrt(float(stat_value) / (n * float(t)))
        chisquare_result.set_v_value(v_value)
        chisquare_result.set_split_values([float(x) for x in splits])
        # chisquare_result.set_buckeddata(bucketedData)
        return chisquare_result
Example #32
if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingExample")

    # $example on$
    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3,
                        0.25)  # a vector composed of the frequencies of events

    # compute the goodness of fit. If a second vector to test against
    # is not supplied as a parameter, the test runs against a uniform distribution.
    goodnessOfFitTestResult = Statistics.chiSqTest(vec)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % goodnessOfFitTestResult)

    mat = Matrices.dense(
        3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix

    # conduct Pearson's independence test on the input contingency matrix
    independenceTestResult = Statistics.chiSqTest(mat)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % independenceTestResult)

    obs = sc.parallelize([
        LabeledPoint(1.0, [1.0, 0.0, 3.0]),
        LabeledPoint(1.0, [1.0, 2.0, 0.0]),
        LabeledPoint(1.0, [-1.0, 0.0, -0.5])
    ])  # LabeledPoint(label, feature)

    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
Example #33
from pyspark.mllib.linalg import Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

sc = SparkContext("local", "Rubbish")
"""
# RDD of Vectors
data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
                       Vectors.dense([4, 5, 0,  3]),
                       Vectors.dense([6, 7, 0,  8])])
"""

# Sample vector composing of frequency of events
vect = Vectors.dense([4, 5, 0, 3])

# Summary of the test including the p-value, degrees of freedom,
goodnessOfFitTestResult = Statistics.chiSqTest(vect)

sampleData = [
    40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0
]
matrix = Matrices.dense(3, 4, sampleData)
# Conduct Pearson's independence test on the input contingency matrix
independenceTestResult = Statistics.chiSqTest(matrix)

# Test statistic, the method used, and the null hypothesis.
print "SINGLE VECTOR FIT: "
print goodnessOfFitTestResult
## Summary of the test including the p-value, degrees of freedom.
print "INDEPENDENCE TEST RESULT: "
print independenceTestResult
Example #34
from __future__ import print_function

#Section 7.2.1
from pyspark.mllib.linalg import Vectors, Vector
dv1 = Vectors.dense(5.0,6.0,7.0,8.0)
dv2 = Vectors.dense([5.0,6.0,7.0,8.0])
sv = Vectors.sparse(4, [0,1,2,3], [5.0,6.0,7.0,8.0])
dv2[2]
dv1.size
dv2.toArray()

from pyspark.mllib.linalg import Matrices

dm = Matrices.dense(2,3,[5.0,0.0,0.0,3.0,1.0,4.0])
sm = Matrices.sparse(2,3,[0,1,2,4], [0,1,0,1], [5.0,3.0,1.0,4.0])
sm.toDense()
dm.toSparse()
dm[1,1]

#Section 7.2.2
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow
rmind = IndexedRowMatrix(rm.rows().zipWithIndex().map(lambda x: IndexedRow(x[1], x[0])))

#Section 7.4
housingLines = sc.textFile("first-edition/ch07/housing.data", 6)
housingVals = housingLines.map(lambda x: Vectors.dense([float(v.strip()) for v in x.split(",")]))

#Section 7.4.1
from pyspark.mllib.linalg.distributed import RowMatrix
housingMat = RowMatrix(housingVals)
from pyspark.mllib.stat._statistics import Statistics
Example #35
# coding=utf-8

from pyspark.mllib.linalg import Vectors, SparseVector, Matrix, Matrices

# Local matrices
dm = Matrices.dense(2, 2, [2, 3, 4, 5])
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])

print('dm')
print(dm.toArray())
print('sm')
print(sm.toDense())
Example #36
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics


sc = SparkContext("local", "Rubbish")

"""
# RDD of Vectors
data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
                       Vectors.dense([4, 5, 0,  3]),
                       Vectors.dense([6, 7, 0,  8])])
"""

# Sample vector composing of frequency of events
vect = Vectors.dense([4,5,0,3])

# Summary of the test including the p-value, degrees of freedom,
goodnessOfFitTestResult = Statistics.chiSqTest(vect)

sampleData = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
matrix = Matrices.dense(3,4, sampleData)
# Conduct Pearson's independence test on the input contingency matrix
independenceTestResult = Statistics.chiSqTest(matrix)


# Test statistic, the method used, and the null hypothesis.
print "SINGLE VECTOR FIT: "
print goodnessOfFitTestResult 
## Summary of the test including the p-value, degrees of freedom.
print "INDEPENDENCE TEST RESULT: "
print independenceTestResult
Example #37
model_test_dfi = pipeline.fit(test_dfi)
result_test_dfi = model.transform(test_dfi)

#cv = CountVectorizer(inputCol="stopRemove", outputCol="features")
#model = cv.fit(dataset)
#result = model.transform(dataset)
#result.show(truncate=False)

array  = [0]*20
print(array)
mvv = result.select("movie_name").rdd.flatMap(lambda x: x).collect()
print(mvv)
from pyspark.mllib.linalg import Matrix, Matrices

# Create dense matrices (values are read in column-major order)
dm2 = Matrices.dense(3, 4, [1, 2, 3, 4, 5, 6,7,8,9,10,11,12])
dm3 = Matrices.dense(1, 12, [1, 2, 3, 4, 5, 6,7,8,9,10,11,12])
print(dm2)
print(dm3)
#dm4 = dm2+dm3
#print(dm4)

from pyspark.sql.functions import col
mapping = {}
#train_data, test_data = result.randomSplit([1.0,0.0], seed=100)
train_data = result
train_data.printSchema()
#df_new = mapping_df.rename(columns={'_c0': 'A'})
mapping_df1 = mapping_df.select(col("_c0").alias("indexes"),col("0").alias("genres"))
mapping_df1.show()
#print(range(mapping_df1.collect()))