Example #1
from pyspark.mllib.feature import PCA
from pyspark.mllib.regression import LabeledPoint


def PCAdata(df, num):
    """Project the features of an RDD of LabeledPoint onto `num` principal components."""
    # Key labels and projected features by their position so they can be re-joined.
    Label = df.map(lambda p: p.label).zipWithIndex().map(lambda x: (x[1], x[0]))
    Features = df.map(lambda p: p.features)
    pcaModel = PCA(num).fit(Features)
    projected = pcaModel.transform(Features)
    second = projected.zipWithIndex().map(lambda x: (x[1], x[0]))
    result = Label.join(second).map(
        lambda x: LabeledPoint(x[1][0], x[1][1]))
    return result
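A minimal usage sketch (the LIBSVM file name, the app name, and the choice of 5 components are assumptions for illustration, not part of the original snippet):

# Hypothetical driver for PCAdata: load LabeledPoints and keep 5 components.
from pyspark import SparkContext
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="pca-data-example")        # assumed app name
parsed = MLUtils.loadLibSVMFile(sc, "data.libsvm")   # assumed input path
reduced = PCAdata(parsed, 5)                         # RDD of LabeledPoint with 5-dim features
print(reduced.first())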
Example #2
from pyspark.mllib.feature import PCA
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD


def pca_fit(parsed_Data):
    # Project the features onto 5 principal components, keeping the labels aligned.
    x = parsed_Data.map(lambda p: p.features)
    pc = PCA(5).fit(x)
    transformed = pc.transform(x)
    y = parsed_Data.map(lambda p: p.label)
    paired = transformed.zip(y).map(lambda line: LabeledPoint(line[1], line[0]))

    # 80/20 train/test split, then a linear regression trained with SGD.
    rdd2 = paired.randomSplit([0.8, 0.2])
    model2 = LinearRegressionWithSGD.train(rdd2[0], iterations=100,
                                           step=0.00000001, regType=None)

    # Evaluate the model on the held-out 20% split.
    valuesAndPreds = rdd2[1].map(lambda p: (p.label, model2.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2)\
              .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
Example #3
import multiprocessing

from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import Vectors


def run_pca(sc):
    # CLUSTERS, CLUSTER_PATH and PCA_PATH are module-level constants (see the sketch below).
    cpu_count = multiprocessing.cpu_count()
    cluster_loss = dict()

    for n in range(0, CLUSTERS):
        # Read one semicolon-separated CSV per cluster as dense vectors.
        filename = "cluster_" + str(n) + ".csv"
        cl_file = CLUSTER_PATH + filename
        dataset = sc.textFile(cl_file, cpu_count)
        dataset = dataset.map(
            lambda line: Vectors.dense([float(x) for x in line.split(';')]))

        # Project onto the first two principal components and write back as CSV.
        model = PCA(2).fit(dataset)
        transformed = model.transform(dataset)
        transformed_csv = transformed.map(
            lambda x: ';'.join(list(map(str, x))))
        transformed_csv.coalesce(1).saveAsTextFile(PCA_PATH +
                                                   "onehot_%s" % filename)
Example #4
mbr = ModelBuildReporter(sc)

# create a DataModelTools to handle data model and data conversions
dmt = DataModelTools()

# compute the data model from the dataframe
# the data model is a dict mapping each column name to {"min": x, "max": y}
# for numeric fields or to [val1, val2, ... valN] for string fields
datamodel = dmt.computeDataModel(df.select(*predictors))
# use DataModelTools to convert from DataFrame to an RDD of DenseVector for specified predictors
lp = dmt.extractDenseVector(df, predictors,
                            setToFlag=1.0).map(lambda x: x[1]).cache()

# build the PCA Model
from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import DenseVector

estimator = PCA(k_param)
pcamodel = estimator.fit(lp)

# extract the model coefficients by creating dummy data with each row containing
# a predictor set to 1 and all others set to 0 (the Python wrapper does not seem
# to provide direct access to the loadings in Spark 1.5)
coefficients = []
n_predictors = len(predictors)
for i in range(0, k_param):
    coefficients.append([0.0] * n_predictors)

for c in range(0, n_predictors):
    vec = [0.0] * n_predictors
    vec[c] = 1.0
    arr = pcamodel.transform(DenseVector(vec)).toArray()
    for i in range(0, k_param):
        coefficients[i][c] = arr[i]
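In later releases the unit-vector probing is avoidable: the Python API gained RowMatrix.computePrincipalComponents in pyspark.mllib.linalg.distributed (around Spark 2.2, if I recall correctly), which returns the loading matrix directly. A sketch under that assumption, reusing lp, k_param and n_predictors from above:

# Alternative sketch (assumes a Spark version whose Python API exposes
# RowMatrix.computePrincipalComponents, roughly 2.2+): read the loadings directly,
# which should correspond to the matrix the probing loop above reconstructs.
from pyspark.mllib.linalg.distributed import RowMatrix

pc_arr = RowMatrix(lp).computePrincipalComponents(k_param).toArray()  # (n_predictors, k_param)
coefficients = [list(pc_arr[:, i]) for i in range(k_param)]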
Example #5
""""
Program: PCA
Description: 调用spark内置的PCA算法
Author: zhenglei - [email protected]
Date: 2016-01-14 13:45:02
# Last modified: 2016-01-28 19:23:14
Python release: 2.7
"""
# 调用spark内置的pca算法对机器学习实战中的第十三章数据集进行降维处理
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import Vectors

if __name__ == '__main__':
    sc = SparkContext()
    tmpdatas = sc.textFile('pcaTestSet.txt')
    datas = tmpdatas.map(lambda line: Vectors.dense(
        array([float(line.split('\t')[0]),
               float(line.split('\t')[1])])))
    print datas.collect()[0]

    # Reduce the input to one dimension and check how faithful the reduced model is
    model = PCA(1).fit(datas)
    transforms = model.transform(datas)
    print transforms.collect()[0], array(transforms.collect()).shape

    # Check the projected (1-D) value for the input [10.235186, 11.321997]
    print model.transform(array([10.235186, 11.321997]))
    sc.stop()
Example #6
result_collection["review_counts_helpful%"] = review_counts_hp

# Get average overall rating per review length (char count)
review_length_cc, = averages_per_key(reviewer_vectors, lambda x:
                                     (x[1][4], [x[1][2]]))
review_length_wc, = averages_per_key(reviewer_vectors, lambda x:
                                     (x[1][6], [x[1][2]]))

result_collection["review_length_char_count"] = review_length_cc
result_collection["review_length_word_count"] = review_length_wc

# Conduct PCA
reviewer_vectors_real = reviewer_vectors.map(
    lambda x: Vectors.dense([val for val in x[1]]))

pca_model = PCA(8).fit(reviewer_vectors_real)
transformed = pca_model.transform(reviewer_vectors_real)

current_best = None
current_best_cost = float("inf")

# Run K-Means
for k in range(2, 70, 7):
    kmeans_model = KMeans.train(transformed, k, maxIterations=100, runs=10)

    cost = kmeans_model.computeCost(transformed)

    if cost < current_best_cost:
        current_best_cost = cost
        current_best = kmeans_model
Example #7
    # Fragment from inside a larger driver function: sc, ssc, feature_dim,
    # pca_mode and low_dim are defined earlier in the original script.
    # LOADING AND COMPUTING TF's TRAINING MODEL
    print('Loading TRAINING_TF_MODEL...')
    tf_training = sc.pickleFile(os.getcwd() + '/model/TF/TF_MODEL_' +
                                str(feature_dim))
    print('done!')

    print('Computing TF-IDF MODEL...')
    idf_training = IDF(minDocFreq=5).fit(tf_training)
    tfidf_training = idf_training.transform(tf_training)
    print('done!')

    # APPLYING PCA ON TRAINING DATA
    if pca_mode.value == 1:
        print('Applying PCA on training data...')
        PCA_model = PCA(low_dim).fit(tfidf_training)
        tfidf_training = PCA_model.transform(tfidf_training)
        k = low_dim

    # pcArray = model.transform(tfidf_training.first()).toArray()

    #setting checkpoint
    # ssc.checkpoint("/Users/davidenardone/Desktop/checkpoint")

    # CREATING DStream FROM TRAINING'S RDD
    trainingQueue = [tfidf_training]
    trainingStream = ssc.queueStream(trainingQueue)

    # CREATING A K-MEANS MODEL WITH RANDOM CENTERS, SPECIFYING THE NUMBER OF
    # CLUSTERS TO FIND
    model = StreamingKMeans(k=2, decayFactor=1.0,
                            timeUnit='batches').setRandomCenters(k, 1.0, 0)
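The fragment stops before the model and the stream are wired together; a hypothetical continuation (not part of the original, and assuming ssc is the active StreamingContext) could be:

    # Hypothetical continuation: fit the streaming K-Means model on the queued
    # TF-IDF batch, then run the streaming context briefly.
    model.trainOn(trainingStream)
    ssc.start()
    ssc.awaitTermination(timeout=10)  # assumed short run; adjust as needed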
Example #8
    #
    #
    # '''
    # The A.cartesian(B) will be an RDD of the form:
    # [(A ID1, A String1), (A ID2, A String2), ...]  and  [(B ID1, B String1), (B ID2, B String2), ...]
    # to:
    # [ ((A ID1, A String1), (B ID1, B String1)), ((A ID1, A String1), (B ID2, B String2)), ((A ID2, A String2), (B ID1, B String1)), ... ]
    # '''
    # cross_RDD = ID_tokens.cartesian(ID_tokens).cache()
    # # commonTokens:  [[id1, id2], [tokens]]
    # commonTokens = cross_RDD.map(get_common)
    # similarities_RDD = commonTokens.map(fastCosineSimilarity).cache()
    #
    # end = time.time()
    # print 'total prepare: '+ str(end - start)
    # print similarities_RDD.count()
    # c_time = time.time()
    # print 'count time: ' + str(c_time - end)
    # similarities_RDD.collect()
    # c2_time = time.time()
    # print 'count time: ' + str(c2_time - c_time)
    # print 'Successfully Calculated the similarities between all the posts'


from pyspark import SparkContext
from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import Vectors


if __name__ == '__main__':
    # create_tfidf is defined earlier in the original script.
    sc = SparkContext('local')
    tfidf_matrix = create_tfidf(sc)
    tfidf_dVector_matrix = tfidf_matrix.map(lambda row: Vectors.dense(row))
    reduc = PCA(3).fit(tfidf_dVector_matrix)
    after_pca = reduc.transform(tfidf_dVector_matrix)
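The RDD-based PCAModel used here does not expose the explained variance. If a DataFrame is acceptable, the spark.ml variant (Spark 2.0+) reports it directly; a rough sketch under that assumption, with the column names chosen only for illustration:

# Hypothetical DataFrame-based alternative (assumes Spark >= 2.0).
from pyspark.ml.feature import PCA as MLPCA
from pyspark.ml.linalg import Vectors as MLVectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    tfidf_matrix.map(lambda row: (MLVectors.dense(row),)), ["features"])
ml_model = MLPCA(k=3, inputCol="features", outputCol="pca_features").fit(df)
print(ml_model.explainedVariance)  # per-component explained variance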