def PCAdata(df, num):
    Label = df.map(lambda p: p.label).zipWithIndex().map(
        lambda (label, index): (index, label))
    Features = df.map(lambda p: p.features)
    pcaModel = PCA(num).fit(Features)
    projected = pcaModel.transform(Features)
    second = projected.zipWithIndex().map(
        lambda (features, index): (index, features))
    result = Label.join(second).map(
        lambda (idx, (label, features)): LabeledPoint(label, features))
    return result
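# Usage sketch (not from the original source): PCAdata expects an RDD of
# LabeledPoint and a target dimensionality. Assumes a live SparkContext `sc`
# and the Spark 1.x MLlib imports shown below.
from pyspark.mllib.feature import PCA
from pyspark.mllib.regression import LabeledPoint

points = sc.parallelize([
    LabeledPoint(0.0, [1.0, 2.0, 3.0]),
    LabeledPoint(1.0, [4.0, 5.0, 6.0]),
    LabeledPoint(0.0, [7.0, 8.0, 10.0]),
])
reduced = PCAdata(points, 2)  # RDD of LabeledPoint with 2-dimensional features
print(reduced.take(3))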
def pca_fit(parsed_Data):
    x = parsed_Data.map(lambda p: p.features)
    pc = PCA(5).fit(x)
    transformed = pc.transform(x)
    y = parsed_Data.map(lambda p: p.label)
    a = transformed.zip(y)
    paired = a.map(lambda line: LabeledPoint(line[1], line[0]))
    rdd2 = paired.randomSplit([0.8, 0.2])
    model2 = LinearRegressionWithSGD.train(rdd2[0], iterations=100,
                                           step=0.00000001, regType=None)
    # Evaluate the model on the held-out test split
    valuesAndPreds = rdd2[1].map(lambda p: (p.label, model2.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2) \
        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
def run_pca(sc):
    cpu_count = multiprocessing.cpu_count()
    cluster_loss = dict()
    for n in range(0, CLUSTERS):
        filename = "cluster_" + str(n) + ".csv"
        cl_file = CLUSTER_PATH + filename
        dataset = sc.textFile(cl_file, cpu_count)
        dataset = dataset.map(
            lambda line: Vectors.dense([float(x) for x in line.split(';')]))
        model = PCA(2).fit(dataset)
        transformed = model.transform(dataset)
        transformed_csv = transformed.map(
            lambda x: ';'.join(list(map(str, x))))
        transformed_csv.coalesce(1).saveAsTextFile(
            PCA_PATH + "onehot_%s" % filename)
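# Sketch of the module-level context run_pca appears to assume; the constant
# values below are placeholders, not taken from the original source.
import multiprocessing

from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import Vectors

CLUSTERS = 8                     # number of cluster_<n>.csv files to process
CLUSTER_PATH = "data/clusters/"  # directory holding the per-cluster CSV input
PCA_PATH = "data/pca/"           # output directory for the projected data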
print rdd_b.count()
print rdd_b.take(1)

#
# Profiles standardisation
#
new_scalar = StandardScaler(withMean=True, withStd=True).fit(rdd_b)
print type(new_scalar)
scaler3 = new_scalar.transform(rdd_b)

#
# Profiles compression with PCA
#
model = PCAmllib(10).fit(scaler3)
print type(model)
transformed = model.transform(scaler3)
print type(transformed)
print transformed.count()
print transformed.first()

#
# Train a Profiles classification model with KMeans
#
NBCLUSTERS = 8
INITMODE = 'k-means||'  # 'k-means||' or 'random'
clusters = mllibKMeans.train(transformed, NBCLUSTERS, maxIterations=100,
                             initializationMode=INITMODE)
# Note: the "runs=5" option has been deprecated since 1.6.0
data = sc.textFile(
    "hdfs://master:9000/root/pyspark_test/iris_data.txt")  # for Hadoop YARN
parsedData = data.map(lambda line: array([x for x in line.split(',')]))
first_data = parsedData.take(1)[0]
data_row = len(first_data)  # several input attributes and one output attribute
params_only = parsedData.map(
    lambda x: Vectors.dense(np.float_(x[0:(data_row - 1)])))
# params_only.take(5)
# the type of params_only is pyspark.rdd.PipelinedRDD
# params_only = parsedData.map(lambda x: array(np.float_(x[0:(data_row-1)])))
model_test = PCAmllib(2).fit(params_only)
transformed = model_test.transform(params_only)
# transformed.collect()
pca_2d = transformed.collect()


# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


k = 3
clusters = KMeans.train(params_only, k, maxIterations=10,
# -*- coding:utf-8 -*-
"""
Program: PCA
Description: Use Spark's built-in PCA algorithm
Author: zhenglei - [email protected]
Date: 2016-01-14 13:45:02
# Last modified: 2016-01-28 19:23:14
Python release: 2.7
"""
# Use Spark's built-in PCA algorithm to reduce the dimensionality of the
# dataset from Chapter 13 of "Machine Learning in Action"
from numpy import array

from pyspark import SparkContext
from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import Vectors

if __name__ == '__main__':
    sc = SparkContext()
    tmpdatas = sc.textFile('pcaTestSet.txt')
    datas = tmpdatas.map(lambda line: Vectors.dense(
        array([float(line.split('\t')[0]), float(line.split('\t')[1])])))
    print datas.collect()[0]

    # Reduce the input to one dimension and check the output of the fitted model
    model = PCA(1).fit(datas)
    transforms = model.transform(datas)
    print transforms.collect()[0], array(transforms.collect()).shape

    # Check the projected value for the input [10.235186, 11.321997]
    print model.transform(array([10.235186, 11.321997]))
    sc.stop()
#
#
# '''
# A.cartesian(B) will be an RDD of the form:
# [(A ID1, A String1), (A ID2, A String2), ...] and [(B ID1, B String1), (B ID2, B String2), ...]
# to:
# [((A ID1, A String1), (B ID1, B String1)), ((A ID1, A String1), (B ID2, B String2)), ((A ID2, A String2), (B ID1, B String1)), ...]
# '''
# cross_RDD = ID_tokens.cartesian(ID_tokens).cache()
# # commonTokens: [[id1, id2], [tokens]]
# commonTokens = cross_RDD.map(get_common)
# similarities_RDD = commonTokens.map(fastCosineSimilarity).cache()
#
# end = time.time()
# print 'total prepare: ' + str(end - start)
# print similarities_RDD.count()
# c_time = time.time()
# print 'count time: ' + str(c_time - end)
# similarities_RDD.collect()
# c2_time = time.time()
# print 'count time: ' + str(c2_time - c_time)
# print 'Successfully calculated the similarities between all the posts'

if __name__ == '__main__':
    sc = SparkContext('local')
    tfidf_matrix = create_tfidf(sc)
    tfidf_dVector_matrix = tfidf_matrix.map(lambda row: Vectors.dense(row))
    reduc = PCA(3).fit(tfidf_dVector_matrix)
    after_pca = reduc.transform(tfidf_dVector_matrix)
rawData = sc.textFile("e:/sundog-consult/Udemy/DataScience/subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

# Store the document names for later:
documentNames = fields.map(lambda x: x[1])

# Now hash the words in each document to their term frequencies:
hashingTF = HashingTF(100000)  # 100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

# At this point we have an RDD of sparse vectors representing each document,
# where each value maps to the term frequency of each unique hash value.

# Let's compute the TF*IDF of each term in each document:
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# Now we have an RDD of sparse vectors, where each value is the TFxIDF
# of each unique hash value for each document.

model = PCAmllib(2).fit(tfidf)
pc = model.transform(tfidf)
# mat = RowMatrix(tfidf)
# Calculate PCA
# pc = mat.computePrincipalComponents(int(mat.numCols))

print("Principal components :")
print(pc)
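# Hypothetical follow-up (not in the original source): pair each document name
# with its 2-D projection. Both RDDs are produced from the same `fields` RDD by
# map(), so zip() lines their elements up one-to-one.
for name, vec in documentNames.zip(pc).take(5):
    print(name, vec)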
# '''
# cross_RDD = ID_tokens.cartesian(ID_tokens).cache()
# # commonTokens: [[id1, id2], [tokens]]
# commonTokens = cross_RDD.map(get_common)
# similarities_RDD = commonTokens.map(fastCosineSimilarity).cache()
#
# end = time.time()
# print 'total prepare: ' + str(end - start)
# print similarities_RDD.count()
# c_time = time.time()
# print 'count time: ' + str(c_time - end)
# similarities_RDD.collect()
# c2_time = time.time()
# print 'count time: ' + str(c2_time - c_time)
# print 'Successfully calculated the similarities between all the posts'

if __name__ == '__main__':
    conf = SparkConf()
    conf.set("spark.executor.memory", "16g")
    conf.set("spark.driver.memory", "16g")
    conf.set("spark.driver.maxResultSize", "16g")
    sc = SparkContext(conf=conf)
    tfidf_matrix = create_tfidf(sc)
    tfidf_dVector_matrix = tfidf_matrix.map(lambda row: Vectors.dense(row))

    start2 = time.time()
    model = PCA(20).fit(tfidf_dVector_matrix)
    end2 = time.time()
    print (end2 - start2)
    after_pca = model.transform(tfidf_dVector_matrix).collect()
# LOADING AND COMPUTING TF's TRAINING MODEL
print('Loading TRAINING_TF_MODEL...')
tf_training = sc.pickleFile(os.getcwd() + '/model/TF/TF_MODEL_' + str(feature_dim))
print('done!')

print('Computing TF-IDF MODEL...')
idf_training = IDF(minDocFreq=5).fit(tf_training)
tfidf_training = idf_training.transform(tf_training)
print('done!')

# APPLYING PCA ON TRAINING DATA
if pca_mode.value == 1:
    print('Applying PCA on training data...')
    PCA_model = PCA(low_dim).fit(tfidf_training)
    tfidf_training = PCA_model.transform(tfidf_training)
    k = low_dim
    # pcArray = model.transform(tfidf_training.first()).toArray()

# setting checkpoint
# ssc.checkpoint("/Users/davidenardone/Desktop/checkpoint")

# CREATING DStream FROM TRAINING'S RDD
trainingQueue = [tfidf_training]
trainingStream = ssc.queueStream(trainingQueue)

# CREATING A K-MEANS MODEL WITH RANDOM CLUSTERS, SPECIFYING THE NUMBER OF CLUSTERS TO FIND
model = StreamingKMeans(k=2, decayFactor=1.0, timeUnit='batches').setRandomCenters(k, 1.0, 0)
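# Sketch of the surrounding setup this fragment appears to rely on; the names
# and values below are placeholders, not taken from the original source.
import os

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.feature import IDF, PCA
from pyspark.mllib.clustering import StreamingKMeans

sc = SparkContext(appName="streaming-kmeans-sketch")
ssc = StreamingContext(sc, batchDuration=1)
feature_dim = 1000           # dimensionality of the pickled TF vectors
low_dim = 50                 # target dimensionality when PCA is enabled
pca_mode = sc.broadcast(1)   # 1 -> apply PCA, anything else -> skip it
k = feature_dim              # fallback dimension if the PCA branch is skipped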
def reduceDimensions(features_rdd):
    model = PCAmllib(2).fit(features_rdd)
    transformed_rdd = model.transform(features_rdd)
    return transformed_rdd
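# Minimal usage sketch (not from the original source): reduceDimensions expects
# an RDD of MLlib vectors. Assumes a live SparkContext `sc` and that PCAmllib
# is pyspark.mllib.feature.PCA imported under that alias.
from pyspark.mllib.feature import PCA as PCAmllib
from pyspark.mllib.linalg import Vectors

features = sc.parallelize([
    Vectors.dense([1.0, 0.5, 3.0]),
    Vectors.dense([2.0, 1.5, 2.0]),
    Vectors.dense([4.0, 2.5, 1.0]),
])
reduced = reduceDimensions(features)  # RDD of 2-dimensional dense vectors
print(reduced.collect())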
    m_source_list = [key[0], key[1], key[2]] + \
        [v[1] for v in vals] + \
        [v[2] for v in vals] + \
        [v[3] for v in vals] + \
        [v[4] for v in vals] + \
        [v[5] for v in vals]
    return Vectors.dense(m_source_list)

# COMMAND ----------

m_file_name = '/home/dyerke/Documents/DSE/capstone_project/traffic/data/01_2010'
lines = sc.textFile(m_file_name, minPartitions=4)
newrows = lines.flatMap(parseInfo).groupByKey().map(buildRow)

# COMMAND ----------

t = newrows.first()
print type(t), t

# COMMAND ----------

from pyspark.mllib.feature import PCA as PCAmllib

model = PCAmllib(2).fit(newrows)
transformed = model.transform(newrows)

# COMMAND ----------

t = transformed.first()
print type(t), t
# Get average overall rating per review length (char count and word count)
review_length_cc, = averages_per_key(reviewer_vectors, lambda x: (x[1][4], [x[1][2]]))
review_length_wc, = averages_per_key(reviewer_vectors, lambda x: (x[1][6], [x[1][2]]))
result_collection["review_length_char_count"] = review_length_cc
result_collection["review_length_word_count"] = review_length_wc

# Conduct PCA
reviewer_vectors_real = reviewer_vectors.map(
    lambda x: Vectors.dense([val for val in x[1]]))
pca_model = PCA(8).fit(reviewer_vectors_real)
transformed = pca_model.transform(reviewer_vectors_real)

current_best = None
current_best_cost = float("inf")

# Run K-Means
for k in range(2, 70, 7):
    kmeans_model = KMeans.train(transformed, k, maxIterations=100, runs=10)
    cost = kmeans_model.computeCost(transformed)
    if cost < current_best_cost:
        current_best_cost = cost
        current_best = kmeans_model

# current_best.save(sc, "reviews/kmeans_model")
sample_mean = scaler.call('mean')

# Effectively scale the dataset:
rdd_norm = scaler.transform(rdd_data)

# In[Reduction]:

# Compute PCA new dimensions:
from pyspark.mllib.feature import PCA as PCAmllib

Neof = 20
reducer = PCAmllib(Neof).fit(rdd_norm)
# print type(reducer)

# Effectively reduce the dataset:
rdd_reduced = reducer.transform(rdd_norm)
# print type(rdd_reduced)

# In[Classification with k-means]:

### Run KMeans to build the classification model
from pyspark.mllib.clustering import KMeans as KMeansmllib
import time

start_time = time.time()
NBCLUSTERS = 8
INITMODE = 'k-means||'  # 'k-means||' or 'random'
clusters_kmean = KMeansmllib.train(rdd_reduced, NBCLUSTERS,
                                   maxIterations=200, runs=20,
spark = SparkSession\
    .builder\
    .appName("linearSVC Example")\
    .getOrCreate()

# $example on$
# Load training data
inputData = spark.read.format("libsvm") \
    .load("combined_data_svm.txt")

# generate the train/test split.
(train, test) = inputData.randomSplit([0.8, 0.2])

# Fit the DataFrame-based PCA and project both splits onto the new "pca" column.
pca = PCAml(k=2, inputCol="features", outputCol="pca")
model = pca.fit(train)
transform = model.transform(train)
predictions = model.transform(test)

# score the model on test data.
# Note: MulticlassClassificationEvaluator expects "prediction" and "label"
# columns, so a classifier (such as the commented-out lsvcModel) would need to
# be trained on the PCA output before this evaluation can run.
# predictions = lsvcModel.transform(test)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
# $example off$

spark.stop()
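# Hedged sketch (not from the original source) of how the commented-out
# lsvcModel could be wired to the PCA output above; it would have to run before
# spark.stop(), and it assumes the labels in combined_data_svm.txt are binary,
# since LinearSVC only supports binary classification.
from pyspark.ml.classification import LinearSVC

lsvc = LinearSVC(featuresCol="pca", labelCol="label", maxIter=10, regParam=0.1)
lsvcModel = lsvc.fit(transform)                  # `transform` is the PCA-projected training split
predictions = lsvcModel.transform(model.transform(test))
accuracy = evaluator.evaluate(predictions)       # now has the "prediction" column the evaluator needs
print("Test Error = %g" % (1.0 - accuracy))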