from pyspark.mllib.feature import PCA
from pyspark.mllib.regression import LabeledPoint


def PCAdata(df, num):
    # Index the labels and the features separately so they can be re-joined
    # after the PCA projection (zipWithIndex yields (value, index) pairs).
    Label = df.map(lambda p: p.label).zipWithIndex().map(
        lambda li: (li[1], li[0]))
    Features = df.map(lambda p: p.features)
    # Fit a PCA model with `num` components and project the features onto them.
    pcaModel = PCA(num).fit(Features)
    projected = pcaModel.transform(Features)
    second = projected.zipWithIndex().map(lambda fi: (fi[1], fi[0]))
    # Re-attach each label to its projected feature vector by index.
    result = Label.join(second).map(
        lambda kv: LabeledPoint(kv[1][0], kv[1][1]))
    return result
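A minimal usage sketch for PCAdata, assuming its df argument is an RDD of LabeledPoint (as the .label/.features accesses imply); the file name and comma-separated format are illustrative:

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext('local')
# Hypothetical input: a label followed by comma-separated feature values per line.
parsed = sc.textFile('sample.txt').map(
    lambda line: LabeledPoint(float(line.split(',')[0]),
                              [float(x) for x in line.split(',')[1:]]))
reduced = PCAdata(parsed, 2)  # keep the top 2 principal components
print(reduced.first())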
from pyspark.mllib.feature import PCA
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD


def pca_fit(parsed_Data):
    # Project the features onto the top 5 principal components.
    x = parsed_Data.map(lambda p: p.features)
    pc = PCA(5).fit(x)
    transformed = pc.transform(x)
    # Pair each projected vector back up with its label.
    y = parsed_Data.map(lambda p: p.label)
    a = transformed.zip(y)
    paired = a.map(lambda line: LabeledPoint(line[1], line[0]))
    # 80/20 train/test split.
    rdd2 = paired.randomSplit([0.8, 0.2])
    model2 = LinearRegressionWithSGD.train(rdd2[0], iterations=100,
                                           step=0.00000001, regType=None)
    # Evaluate the model on the held-out test split.
    valuesAndPreds = rdd2[1].map(lambda p: (p.label, model2.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2)\
        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
import multiprocessing

from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import Vectors


def run_pca(sc):
    cpu_count = multiprocessing.cpu_count()
    cluster_loss = dict()
    for n in range(0, CLUSTERS):
        filename = "cluster_" + str(n) + ".csv"
        cl_file = CLUSTER_PATH + filename
        # Parse each semicolon-separated line into a dense vector.
        dataset = sc.textFile(cl_file, cpu_count)
        dataset = dataset.map(
            lambda line: Vectors.dense([float(x) for x in line.split(';')]))
        # Project each cluster onto its top 2 principal components.
        model = PCA(2).fit(dataset)
        transformed = model.transform(dataset)
        # Write the projected vectors back out as a single semicolon-separated file.
        transformed_csv = transformed.map(
            lambda x: ';'.join(list(map(str, x))))
        transformed_csv.coalesce(1).saveAsTextFile(
            PCA_PATH + "onehot_%s" % filename)
mbr = ModelBuildReporter(sc)

# create a DataModelTools to handle the data model and data conversions
dmt = DataModelTools()

# compute the data model from the dataframe; the data model is basically a dict
# which maps each column name to {"min": x, "max": y} for numeric fields and to
# [val1, val2, ..., valN] for string fields
datamodel = dmt.computeDataModel(df.select(*predictors))

# use DataModelTools to convert the DataFrame to an RDD of DenseVector for the specified predictors
lp = dmt.extractDenseVector(df, predictors, setToFlag=1.0).map(lambda x: x[1]).cache()

# build the PCA model
from pyspark.mllib.feature import PCA
estimator = PCA(k_param)
pcamodel = estimator.fit(lp)

# extract the model coefficients by transforming dummy data in which each row has
# one predictor set to 1 and all others set to 0 (the Python wrapper does not seem
# to provide direct access to the loadings in Spark 1.5)
coefficients = []
n_predictors = len(predictors)
for i in range(0, k_param):
    coefficients.append([0.0] * n_predictors)
for c in range(0, n_predictors):
    vec = [0.0] * n_predictors
    vec[c] = 1.0
    arr = pcamodel.transform(DenseVector(vec)).toArray()
    for i in range(0, k_param):
        coefficients[i][c] = arr[i]
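For comparison, the DataFrame-based API exposes the loadings directly (Spark 2.0+ provides PCAModel.pc and PCAModel.explainedVariance), so the dummy-vector workaround above is unnecessary there. A minimal sketch; the column name and toy data are illustrative:

from pyspark.ml.feature import PCA as MLPCA
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Toy three-predictor dataset; any DataFrame with a vector column works.
data = spark.createDataFrame(
    [(Vectors.dense([1.0, 0.0, 3.0]),),
     (Vectors.dense([2.0, 1.0, 0.0]),),
     (Vectors.dense([4.0, 5.0, 6.0]),)],
    ["features"])
model = MLPCA(k=2, inputCol="features", outputCol="pca_features").fit(data)
print(model.pc)                 # loadings matrix: one column per principal component
print(model.explainedVariance)  # variance explained by each component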
"""" Program: PCA Description: 调用spark内置的PCA算法 Author: zhenglei - [email protected] Date: 2016-01-14 13:45:02 # Last modified: 2016-01-28 19:23:14 Python release: 2.7 """ # 调用spark内置的pca算法对机器学习实战中的第十三章数据集进行降维处理 from numpy import array from pyspark import SparkContext from pyspark.mllib.feature import PCA from pyspark.mllib.linalg import Vectors if __name__ == '__main__': sc = SparkContext() tmpdatas = sc.textFile('pcaTestSet.txt') datas = tmpdatas.map(lambda line: Vectors.dense( array([float(line.split('\t')[0]), float(line.split('\t')[1])]))) print datas.collect()[0] # 将输入降维成1维数据,并测试降维模型的准确性 model = PCA(1).fit(datas) transforms = model.transform(datas) print transforms.collect()[0], array(transforms.collect()).shape # 测试输入[10.235186,11.321997]之后的降维值 print model.transform(array([10.235186, 11.321997])) sc.stop()
result_collection["review_counts_helpful%"] = review_counts_hp # Get average overall rating per review length (char count) review_length_cc, = averages_per_key(reviewer_vectors, lambda x: (x[1][4], [x[1][2]])) review_length_wc, = averages_per_key(reviewer_vectors, lambda x: (x[1][6], [x[1][2]])) result_collection["review_length_char_count"] = review_length_cc result_collection["review_length_word_count"] = review_length_wc # Conduct PCA reviewer_vectors_real = reviewer_vectors.map( lambda x: Vectors.dense([val for val in x[1]])) pca_model = PCA(8).fit(reviewer_vectors_real) transformed = pca_model.transform(reviewer_vectors_real) current_best = None current_best_cost = float("inf") # Run K-Means for k in range(2, 70, 7): kmeans_model = KMeans.train(transformed, k, maxIterations=100, runs=10) cost = kmeans_model.computeCost(transformed) if cost < current_best_cost: current_best_cost = cost current_best = kmeans_model
# LOADING AND COMPUTING TF's TRAINING MODEL
print('Loading TRAINING_TF_MODEL...')
tf_training = sc.pickleFile(os.getcwd() + '/model/TF/TF_MODEL_' + str(feature_dim))
print('done!')

print('Computing TF-IDF MODEL...')
idf_training = IDF(minDocFreq=5).fit(tf_training)
tfidf_training = idf_training.transform(tf_training)
print('done!')

# APPLYING PCA ON TRAINING DATA
if pca_mode.value == 1:
    print('Applying PCA on training data...')
    PCA_model = PCA(low_dim).fit(tfidf_training)
    tfidf_training = PCA_model.transform(tfidf_training)
    k = low_dim
    # pcArray = model.transform(tfidf_training.first()).toArray()

# setting checkpoint
# ssc.checkpoint("/Users/davidenardone/Desktop/checkpoint")

# CREATING DStream FROM TRAINING'S RDD
trainingQueue = [tfidf_training]
trainingStream = ssc.queueStream(trainingQueue)

# CREATING A K-MEANS MODEL WITH RANDOM CLUSTERS, SPECIFYING THE NUMBER OF CLUSTERS TO FIND
model = StreamingKMeans(k=2, decayFactor=1.0,
                        timeUnit='batches').setRandomCenters(k, 1.0, 0)
# '''
# The A.cartesian(B) will be an RDD of the form:
# [(A ID1, A String1), (A ID2, A String2), ...] and [(B ID1, B String1), (B ID2, B String2), ...]
# to:
# [((A ID1, A String1), (B ID1, B String1)), ((A ID1, A String1), (B ID2, B String2)),
#  ((A ID2, A String2), (B ID1, B String1)), ...]
# '''
# cross_RDD = ID_tokens.cartesian(ID_tokens).cache()
# # commonTokens: [[id1, id2], [tokens]]
# commonTokens = cross_RDD.map(get_common)
# similarities_RDD = commonTokens.map(fastCosineSimilarity).cache()
#
# end = time.time()
# print 'total prepare: ' + str(end - start)
# print similarities_RDD.count()
# c_time = time.time()
# print 'count time: ' + str(c_time - end)
# similarities_RDD.collect()
# c2_time = time.time()
# print 'count time: ' + str(c2_time - c_time)
# print 'Successfully calculated the similarities between all the posts'

if __name__ == '__main__':
    sc = SparkContext('local')
    tfidf_matrix = create_tfidf(sc)
    # Convert each TF-IDF row to a DenseVector and project onto 3 principal components.
    tfidf_dVector_matrix = tfidf_matrix.map(lambda row: Vectors.dense(row))
    reduc = PCA(3).fit(tfidf_dVector_matrix)
    after_pca = reduc.transform(tfidf_dVector_matrix)
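A tiny self-contained sketch of the cartesian step the commented block describes, using hypothetical (id, tokens) pairs in place of ID_tokens; a plain token intersection stands in for the get_common helper, which is not shown here:

from pyspark import SparkContext

sc = SparkContext('local')
# Hypothetical (id, tokens) pairs standing in for ID_tokens above.
ID_tokens = sc.parallelize([(1, ['spark', 'pca']),
                            (2, ['spark', 'kmeans'])])
# cartesian pairs every element with every element: ((id_a, tokens_a), (id_b, tokens_b)).
cross_RDD = ID_tokens.cartesian(ID_tokens)
# For each pair, keep the two ids and the tokens they share.
common = cross_RDD.map(lambda ab: ([ab[0][0], ab[1][0]],
                                   [t for t in ab[0][1] if t in ab[1][1]]))
print(common.collect())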