def PCAdata(df, num):
    """Project the features of a labeled RDD onto ``num`` principal components.

    :param df: RDD whose elements expose ``label`` and ``features``
        attributes (despite the name, it is used through the RDD API).
    :param num: number of principal components to keep.
    :return: RDD of ``LabeledPoint(label, projected_features)``. The result
        comes out of a ``join``, so element order is not preserved.
    """
    # Key the labels by their position so they can be re-attached to the
    # projected features after the PCA transform.
    # Index-based lambdas are used instead of tuple parameters
    # (``lambda (a, b): ...``), which are a SyntaxError on Python 3.
    Label = df.map(lambda p: p.label).zipWithIndex().map(
        lambda pair: (pair[1], pair[0]))
    Features = df.map(lambda p: p.features)
    pcaModel = PCA(num).fit(Features)
    projected = pcaModel.transform(Features)
    second = projected.zipWithIndex().map(
        lambda pair: (pair[1], pair[0]))
    # After the join every element is (index, (label, features)).
    result = Label.join(second).map(
        lambda kv: LabeledPoint(kv[1][0], kv[1][1]))
    return result
def run_pca(sc):
    """Run a 2-component PCA on every cluster CSV and save the projections.

    For each ``n`` in ``range(CLUSTERS)`` the file
    ``CLUSTER_PATH + "cluster_<n>.csv"`` is read as semicolon-separated
    floats, projected onto 2 principal components and written as
    semicolon-separated text to ``PCA_PATH + "onehot_cluster_<n>.csv"``.

    :param sc: active SparkContext used to read the input files.
    """
    cpu_count = multiprocessing.cpu_count()
    for n in range(CLUSTERS):
        filename = "cluster_" + str(n) + ".csv"
        cl_file = CLUSTER_PATH + filename
        dataset = sc.textFile(cl_file, cpu_count).map(
            lambda line: Vectors.dense([float(x) for x in line.split(';')]))
        # The parsed vectors are consumed by both fit() and transform();
        # persist them so the text file is not re-read and re-parsed.
        dataset.cache()
        try:
            model = PCA(2).fit(dataset)
            transformed = model.transform(dataset)
            transformed_csv = transformed.map(
                lambda x: ';'.join(map(str, x)))
            # coalesce(1) so each cluster is written as a single part file.
            transformed_csv.coalesce(1).saveAsTextFile(
                PCA_PATH + "onehot_%s" % filename)
        finally:
            dataset.unpersist()
def pca_fit(parsed_Data):
    """Fit a linear regression on 5 PCA components and print its MSE.

    The features of ``parsed_Data`` are projected onto 5 principal
    components, re-paired with their labels and split 80/20. The model is
    trained on the 80% part and the mean squared error is printed for the
    remaining 20% part.

    :param parsed_Data: RDD whose elements expose ``label`` and ``features``.
    """
    features = parsed_Data.map(lambda point: point.features)
    labels = parsed_Data.map(lambda point: point.label)

    reducer = PCA(5).fit(features)
    reduced = reducer.transform(features)

    # zip() yields (projected_features, label); LabeledPoint wants the label first.
    labeled = reduced.zip(labels).map(
        lambda pair: LabeledPoint(pair[1], pair[0]))
    train_rdd, test_rdd = labeled.randomSplit([0.8, 0.2])

    model2 = LinearRegressionWithSGD.train(
        train_rdd, iterations=100, step=0.00000001, regType=None)

    def _label_and_prediction(point):
        return (point.label, model2.predict(point.features))

    # Evaluate the model on the held-out split (not on the training split).
    valuesAndPreds = test_rdd.map(_label_and_prediction)
    squared_errors = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2)
    total_error = squared_errors.reduce(lambda acc, err: acc + err)
    MSE = total_error / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
def preprocess(sc, files_rdd, labeled_spectra, cut, label=True, **kwargs):
    """Parse, resample and optionally PCA-reduce spectra.

    :param sc: SparkContext, used to broadcast the resampled header.
    :param files_rdd: RDD of pairs whose two items are passed to
        ``parse_spectra_file`` (presumably file name and content - confirm
        against the caller).
    :param labeled_spectra: RDD of already labeled spectra (pandas
        DataFrames whose last column is dropped before the range
        computation), or None. They are resampled to the same header as the
        unlabeled spectra.
    :param cut: dict with the keys ``'low'`` and ``'high'`` forwarded to
        ``parse_spectra_file``.
    :param label: Set to False if you want to omit the label from the output.
    :param kwargs: ``partitions`` (default 100) - number of partitions of the
        result; ``pca`` - optional dict of PCA parameters, of which ``k``
        (default 10) is the number of components.
    :return: A tuple ``(resampled_header, spectra)`` where ``spectra`` is an
        RDD of single-row DataFrames. With *label* set, unlabeled spectra get
        the label -1 and the RDD is sorted by label in descending order. With
        *label* set to False the output contains no label and the ordering
        is arbitrary.
    """
    logger.info("Starting preprocessing")
    # TODO support archives
    cut_low = cut['low']
    cut_high = cut['high']
    partitions = kwargs.get("partitions", 100)
    pca_params = kwargs.get('pca')

    spectra = files_rdd.map(
        lambda x: parse_spectra_file(x[0], x[1], cut_low, cut_high)
    ).filter(lambda x: x is not None).cache()

    # Range over which all spectra are resampled. The labeled spectra take
    # part in the computation without their last column. labeled_spectra may
    # be None (it is checked for None further down), so guard it here too
    # instead of failing on ``None.map``.
    if labeled_spectra is None:
        range_source = spectra
    else:
        range_source = spectra.union(
            labeled_spectra.map(lambda x: x.drop(x.columns[-1], axis=1)))
    low, high = range_source.aggregate(
        (0.0, sys.float_info.max), high_low_op, high_low_comb)
    mean_step = spectra.map(lambda x: x.columns[1] - x.columns[0]).mean()
    logger.debug("low %f high %f %f", low, high, mean_step)

    resampled_header = sc.broadcast(np.arange(low, high, mean_step))
    spectra = spectra.map(
        lambda x: resample(x, resampled_header_broadcast=resampled_header))
    if label:
        # Unlabeled spectra are marked with the label -1.
        spectra = spectra.map(
            lambda x: x.assign(label=pd.Series([-1], index=x.index)))
    if labeled_spectra is not None:
        spectra = labeled_spectra.map(
            lambda x: resample(x,
                               resampled_header_broadcast=resampled_header,
                               label_col="label" if label else None,
                               convolve=True)
        ).union(spectra).repartition(partitions)

    if pca_params is not None:
        # Remember the row name (and label) of every spectrum by position so
        # it can be re-attached after the PCA transform.
        if label:
            namesByRow = spectra.zipWithIndex().map(
                lambda s: (s[1], (s[0].index, s[0]['label'].iloc[0])))
        else:
            namesByRow = spectra.zipWithIndex().map(
                lambda s: (s[1], s[0].index))
        logger.info("Doing PCA")
        k = pca_params.get("k", 10)
        pca = PCA(k)
        # NOTE(review): the last column is dropped before fitting even when
        # label is False, i.e. when it may be a data column - confirm.
        fitted_pca = pca.fit(spectra.map(
            lambda x: Vectors.dense(x[x.columns[:-1]].iloc[0].values)))
        # Elements after the join: (row_number, (pca_vector, names)).
        transformed_spectra = fitted_pca.transform(
            spectra.map(lambda x: transform_pca(x))
        ).zipWithIndex().map(lambda x: (x[1], x[0])).join(namesByRow)
        # list(range(k)) so that the concatenation also works on Python 3.
        component_columns = list(range(k))
        if label:
            spectra = transformed_spectra.map(lambda x: pd.DataFrame(
                data=[x[1][0].tolist() + [x[1][1][1]]],
                index=x[1][1][0],
                columns=component_columns + ['label']))
        else:
            # Without labels the joined value is the row index itself; the
            # previous ``x[2]`` indexed past the end of the 2-tuple.
            spectra = transformed_spectra.map(lambda x: pd.DataFrame(
                data=[x[1][0].tolist()],
                index=x[1][1],
                columns=component_columns))

    if not label:
        # There is no 'label' column to sort by; ordering is arbitrary.
        return resampled_header.value, spectra
    return resampled_header.value, spectra.sortBy(
        lambda x: x['label'].values[0], ascending=False,
        numPartitions=partitions)
#data = sc.textFile("iris_data.txt") #for master local or standalone model data = sc.textFile( "hdfs://master:9000/root/pyspark_test/iris_data.txt") #for hadoop yarn parsedData = data.map(lambda line: array([x for x in line.split(',')])) first_data = parsedData.take(1)[0] data_row = len(first_data) #include many input and one output attributes params_only = parsedData.map( lambda x: Vectors.dense(np.float_(x[0:(data_row - 1)]))) #params_only.take(5) #the type of params_only is pyspark.rdd.PipelinedRDD #params_only=parsedData.map(lambda x: array(np.float_(x[0:(data_row-1)]))) model_test = PCAmllib(2).fit(params_only) transformed = model_test.transform(params_only) #transformed.collect() pca_2d = transformed.collect() # Evaluate clustering by computing Within Set Sum of Squared Errors def error(point): center = clusters.centers[clusters.predict(point)] return sqrt(sum([x**2 for x in (point - center)])) k = 3 clusters = KMeans.train(params_only, k,
# -*- coding:utf-8 -*- """" Program: PCA Description: 调用spark内置的PCA算法 Author: zhenglei - [email protected] Date: 2016-01-14 13:45:02 # Last modified: 2016-01-28 19:23:14 Python release: 2.7 """ # 调用spark内置的pca算法对机器学习实战中的第十三章数据集进行降维处理 from numpy import array from pyspark import SparkContext from pyspark.mllib.feature import PCA from pyspark.mllib.linalg import Vectors if __name__ == '__main__': sc = SparkContext() tmpdatas = sc.textFile('pcaTestSet.txt') datas = tmpdatas.map(lambda line: Vectors.dense( array([float(line.split('\t')[0]), float(line.split('\t')[1])]))) print datas.collect()[0] # 将输入降维成1维数据,并测试降维模型的准确性 model = PCA(1).fit(datas) transforms = model.transform(datas) print transforms.collect()[0], array(transforms.collect()).shape # 测试输入[10.235186,11.321997]之后的降维值 print model.transform(array([10.235186, 11.321997])) sc.stop()
# Number of principal components (template placeholder filled in before the
# script runs).
k_param = int('%%k%%')
modelpath = ascontext.createTemporaryFolder()
mbr = ModelBuildReporter(sc)

# DataModelTools handles the data model and the data conversions.
dmt = DataModelTools()

# The data model is basically a dict which maps a column name to either
# {"min": x, "max": y} for numeric fields or [val1, val2, ... valN] for
# string fields.
datamodel = dmt.computeDataModel(df.select(*predictors))

# Convert the DataFrame to an RDD of DenseVector for the chosen predictors;
# only the second item of every extracted element is kept.
extracted = dmt.extractDenseVector(df, predictors, setToFlag=1.0)
lp = extracted.map(lambda x: x[1]).cache()

# Build the PCA model.
from pyspark.mllib.feature import PCA
estimator = PCA(k_param)
pcamodel = estimator.fit(lp)

# Extract the model coefficients by transforming dummy rows in which one
# predictor is set to 1 and all others to 0 (the python wrapper does not seem
# to provide direct access in Spark 1.5).
n_predictors = len(predictors)
coefficients = [[0.0] * n_predictors for _ in range(k_param)]
for c in range(n_predictors):
    vec = [0.0] * n_predictors
    vec[c] = 1.0
    arr = pcamodel.transform(DenseVector(vec)).toArray()
    # Component i of the transformed unit vector is the coefficient of
    # predictor c in principal component i.
    for i in range(k_param):
        coefficients[i][c] = arr[i]
#
#
# '''
# The A.cartesian(B) will be an RDD of the form:
# [(A ID1, A String1), (A ID2, A String2), ...] and [(B ID1, B String1), (B ID2, B String2), ...]
# to:
# [ ((A ID1, A String1), (B ID1, B String1)), ((A ID1, A String1), (B ID2, B String2)), ((A ID2, A String2), (B ID1, B String1)), ... ]
# '''
# cross_RDD = ID_tokens.cartesian(ID_tokens).cache()
# # commonTokens: [[id1, id2], [tokens]]
# commonTokens = cross_RDD.map(get_common)
# similarities_RDD = commonTokens.map(fastCosineSimilarity).cache()
#
# end = time.time()
# print 'total prepare: '+ str(end - start)
# print similarities_RDD.count()
# c_time = time.time()
# print 'count time: ' + str(c_time - end)
# similarities_RDD.collect()
# c2_time = time.time()
# print 'count time: ' + str(c2_time - c_time)
# print 'Successfully Calculated the similarities between all the posts'

if __name__ == '__main__':
    # Local Spark context for the TF-IDF / PCA run.
    sc = SparkContext('local')
    tfidf_matrix = create_tfidf(sc)

    # MLlib's PCA works on vectors, so wrap every TF-IDF row in a dense one.
    tfidf_dVector_matrix = tfidf_matrix.map(
        lambda tfidf_row: Vectors.dense(tfidf_row))

    # Fit a 3-component PCA and project the same rows with it.
    pca_estimator = PCA(3)
    reduc = pca_estimator.fit(tfidf_dVector_matrix)
    after_pca = reduc.transform(tfidf_dVector_matrix)
# Load the tab-separated documents and split every row into its fields.
rawData = sc.textFile("e:/sundog-consult/Udemy/DataScience/subset-small.tsv")
fields = rawData.map(lambda row: row.split("\t"))

# Field 3 holds the document text (split into words on single spaces);
# field 1 holds the document name, stored for later.
documents = fields.map(lambda cols: cols[3].split(" "))
documentNames = fields.map(lambda cols: cols[1])

# Hash the words of each document to their term frequencies.
# 100K hash buckets just to save some memory.
hashingTF = HashingTF(100000)
# The result is an RDD of sparse vectors, one per document, in which each
# value is the term frequency of one hash bucket. It is cached because both
# the IDF fit and the IDF transform read it.
tf = hashingTF.transform(documents)
tf.cache()

# Compute the TF*IDF of each term in each document.
idf_estimator = IDF(minDocFreq=2)
idf = idf_estimator.fit(tf)
tfidf = idf.transform(tf)

# tfidf is an RDD of sparse vectors holding the TFxIDF of each hash bucket
# for each document; project it onto two principal components.
pca_estimator = PCAmllib(2)
model = pca_estimator.fit(tfidf)
pc = model.transform(tfidf)

# Alternative kept for reference:
#mat = RowMatrix(tfidf)
# Calculate PCA
#pc = mat.computePrincipalComponents(int(mat.numCols))

print("Principal components :")
# NOTE(review): pc is the transformed data set itself, so this presumably
# prints the RDD object rather than its contents - confirm the intent.
print(pc)
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
# Standardise the data set: remove the mean and scale to unit standard
# deviation.
scaler = StandardScaler(withMean=True, withStd=True).fit(rdd_data)
# Mean vector as exposed by the fitted scaler model.
sample_mean = scaler.call('mean')
# Effectively scale the dataset:
rdd_norm = scaler.transform(rdd_data)

# In[Reduction]:

# Compute PCA new dimensions:
from pyspark.mllib.feature import PCA as PCAmllib
Neof = 20  # number of principal components to keep
reducer = PCAmllib(Neof).fit(rdd_norm)
# print type(reducer)
# Effectively reduce the dataset:
rdd_reduced = reducer.transform(rdd_norm)
# print type(rdd_reduced)

# In[Classification with k-mean]:

### Launch KMeans to build the classification model
from pyspark.mllib.clustering import KMeans as KMeansmllib
import time
start_time = time.time()
NBCLUSTERS = 8
# MLlib only accepts 'k-means||' or 'random' as initialization mode; the
# previous value 'kmean||' is rejected as invalid.
INITMODE = 'k-means||'  # k-means|| or random
result_collection["review_counts_helpful%"] = review_counts_hp

# Get average overall rating per review length (char count and word count).
# averages_per_key returns a one-element sequence, hence the `name, =`
# unpacking.
review_length_cc, = averages_per_key(reviewer_vectors,
                                     lambda x: (x[1][4], [x[1][2]]))
review_length_wc, = averages_per_key(reviewer_vectors,
                                     lambda x: (x[1][6], [x[1][2]]))
result_collection["review_length_char_count"] = review_length_cc
result_collection["review_length_word_count"] = review_length_wc

# Conduct PCA: project the reviewer vectors onto 8 principal components.
reviewer_vectors_real = reviewer_vectors.map(
    lambda x: Vectors.dense(list(x[1])))
pca_model = PCA(8).fit(reviewer_vectors_real)
# The projection is read by every KMeans.train and computeCost call below;
# cache it so the PCA transform is not recomputed each time.
transformed = pca_model.transform(reviewer_vectors_real).cache()

current_best = None
current_best_cost = float("inf")

# Run K-Means for k = 2, 9, ..., 65 and keep the model with the lowest cost.
# NOTE(review): `runs` was deprecated and later removed from KMeans.train in
# newer Spark releases - confirm the targeted Spark version.
for k in range(2, 70, 7):
    kmeans_model = KMeans.train(transformed, k, maxIterations=100, runs=10)
    cost = kmeans_model.computeCost(transformed)
    if cost < current_best_cost:
        current_best_cost = cost
        current_best = kmeans_model
# ''' # cross_RDD = ID_tokens.cartesian(ID_tokens).cache() # # commonTokens: [[id1, id2], [tokens]] # commonTokens = cross_RDD.map(get_common) # similarities_RDD = commonTokens.map(fastCosineSimilarity).cache() # # end = time.time() # print 'total prepare: '+ str(end - start) # print similarities_RDD.count() # c_time = time.time() # print 'count time: ' + str(c_time - end) # similarities_RDD.collect() # c2_time = time.time() # print 'count time: ' + str(c2_time - c_time) # print 'Successfully Calculated the similarities between all the posts' if __name__ == '__main__': conf = SparkConf() conf.set("spark.executor.memory", "16g") conf.set("spark.driver.memory","16g") conf.set("spark.driver.maxResultSize","16g") sc = SparkContext(conf=conf) tfidf_matrix = create_tfidf(sc) tfidf_dVector_matrix = tfidf_matrix.map(lambda row: Vectors.dense(row)) start2 = time.time() model = PCA(20).fit(tfidf_dVector_matrix) end2 = time.time() print (end2 - start2) after_pca = model.transform(tfidf_dVector_matrix).collect
# Load the pickled term-frequency training RDD for the current feature
# dimension from ./model/TF under the working directory.
print('Loading TRAINING_TF_MODEL...')
tf_model_path = os.getcwd() + '/model/TF/TF_MODEL_' + str(feature_dim)
tf_training = sc.pickleFile(tf_model_path)
print('done!')

# Fit the IDF weights on the training term frequencies and apply them.
print('Computing TF-IDF MODEL...')
idf_estimator = IDF(minDocFreq=5)
idf_training = idf_estimator.fit(tf_training)
tfidf_training = idf_training.transform(tf_training)
print('done!')

# Optionally reduce the training data with PCA; k then becomes the reduced
# dimension.
if pca_mode.value == 1:
    print('Applying PCA on training data...')
    pca_estimator = PCA(low_dim)
    PCA_model = pca_estimator.fit(tfidf_training)
    tfidf_training = PCA_model.transform(tfidf_training)
    k = low_dim
    # pcArray = model.transform(tfidf_training.first()).toArray()

# Checkpointing is currently disabled:
# ssc.checkpoint("/Users/davidenardone/Desktop/checkpoint")

# Feed the training RDD to the streaming context as a one-element queue.
trainingQueue = [tfidf_training]
trainingStream = ssc.queueStream(trainingQueue)

# Streaming k-means model with 2 clusters and random initial centers.
# setRandomCenters receives k (set to low_dim above when PCA is applied),
# the weight 1.0 and the seed 0.
streaming_kmeans = StreamingKMeans(k=2, decayFactor=1.0, timeUnit='batches')
model = streaming_kmeans.setRandomCenters(k, 1.0, 0)
def reduceDimensions(features_rdd, k=2):
    """Project an RDD of feature vectors onto its top principal components.

    :param features_rdd: RDD of vectors accepted by MLlib's PCA.
    :param k: number of principal components to keep; defaults to 2, the
        value that used to be hard-coded.
    :return: RDD of the projected vectors.
    """
    model = PCAmllib(k).fit(features_rdd)
    return model.transform(features_rdd)
# Tail of a function whose header lies above this excerpt (presumably
# buildRow, which is mapped over the grouped rows below - confirm).
    # One flat row: the three key fields followed by fields 1 to 5 of every
    # grouped value, one field at a time.
    m_source_list=[key[0], key[1], key[2]] + \
        [v[1] for v in vals] + \
        [v[2] for v in vals] + \
        [v[3] for v in vals] + \
        [v[4] for v in vals] + \
        [v[5] for v in vals]
    return Vectors.dense(m_source_list)

# COMMAND ----------

# Read the input file, parse every line into keyed records, group them by key
# and build one dense vector per key.
m_file_name = '/home/dyerke/Documents/DSE/capstone_project/traffic/data/01_2010'
lines = sc.textFile(m_file_name, minPartitions=4)
newrows = lines.flatMap(parseInfo).groupByKey().map(buildRow)

# COMMAND ----------

# Inspect the first row built (Python 2 print statement).
t = newrows.first()
print type(t), t

# COMMAND ----------

# Project the rows onto two principal components.
from pyspark.mllib.feature import PCA as PCAmllib
model = PCAmllib(2).fit(newrows)
transformed = model.transform(newrows)

# COMMAND ----------

# Inspect the first projected row.
t = transformed.first()
print type(t), t
rdd_loaded.count()
# Flatten item 2 of every loaded element into individual profiles and wrap
# each one in a dense vector.
rdd_b = rdd_loaded.flatMap(lambda x: x[2]).map(lambda x: Vectors.dense(x))
print(rdd_b.count())
print(rdd_b.take(1))

#
# Profiles standardisation
#
new_scalar = StandardScaler(withMean=True, withStd=True).fit(rdd_b)
print(type(new_scalar))
scaler3 = new_scalar.transform(rdd_b)

#
# Profiles compression with PCA
#
model = PCAmllib(10).fit(scaler3)
print(type(model))
transformed = model.transform(scaler3)
print(type(transformed))
print(transformed.count())
print(transformed.first())

#
# Train a Profiles classification model with KMean
#
NBCLUSTERS = 8
# MLlib only accepts 'k-means||' or 'random' as initialization mode; the
# previous value 'kmean||' is rejected as invalid by KMeans.train.
INITMODE = 'k-means||'  # k-means|| or random
clusters = mllibKMeans.train(transformed, NBCLUSTERS, maxIterations=100,
                             initializationMode=INITMODE)
# Load the tab-separated documents and split every row into its fields.
rawData = sc.textFile("e:/sundog-consult/Udemy/DataScience/subset-small.tsv")
fields = rawData.map(lambda row: row.split("\t"))

# Field 3 holds the document text (split into words on single spaces);
# field 1 holds the document name, stored for later.
documents = fields.map(lambda cols: cols[3].split(" "))
documentNames = fields.map(lambda cols: cols[1])

# Hash the words of each document to their term frequencies.
# 100K hash buckets just to save some memory.
hashingTF = HashingTF(100000)
# The result is an RDD of sparse vectors, one per document, in which each
# value is the term frequency of one hash bucket. It is cached because both
# the IDF fit and the IDF transform read it.
tf = hashingTF.transform(documents)
tf.cache()

# Compute the TF*IDF of each term in each document.
idf_estimator = IDF(minDocFreq=2)
idf = idf_estimator.fit(tf)
tfidf = idf.transform(tf)

# tfidf is an RDD of sparse vectors holding the TFxIDF of each hash bucket
# for each document; project it onto two principal components.
pca_estimator = PCAmllib(2)
model = pca_estimator.fit(tfidf)
pc = model.transform(tfidf)

# Alternative kept for reference:
# mat = RowMatrix(tfidf)
# Calculate PCA
# pc = mat.computePrincipalComponents(int(mat.numCols))

print("Principal components :")
# NOTE(review): pc is the transformed data set itself, so this presumably
# prints the RDD object rather than its contents - confirm the intent.
print(pc)
# Tail of a function whose header lies above this excerpt (presumably
# buildRow, which is mapped over the grouped rows below - confirm).
    # One flat row: the three key fields followed by fields 1 to 5 of every
    # grouped value, one field at a time.
    m_source_list=[key[0], key[1], key[2]] + \
        [v[1] for v in vals] + \
        [v[2] for v in vals] + \
        [v[3] for v in vals] + \
        [v[4] for v in vals] + \
        [v[5] for v in vals]
    return Vectors.dense(m_source_list)

# COMMAND ----------

# Read the input file, parse every line into keyed records, group them by key
# and build one dense vector per key.
m_file_name= '/home/dyerke/Documents/DSE/capstone_project/traffic/data/01_2010'
lines = sc.textFile(m_file_name, minPartitions=4)
newrows = lines.flatMap(parseInfo).groupByKey().map(buildRow)

# COMMAND ----------

# Inspect the first row built (Python 2 print statement).
t= newrows.first()
print type(t), t

# COMMAND ----------

# Project the rows onto two principal components.
from pyspark.mllib.feature import PCA as PCAmllib
model = PCAmllib(2).fit(newrows)
transformed = model.transform(newrows)

# COMMAND ----------

# Inspect the first projected row.
t= transformed.first()
print type(t), t
mbr = ModelBuildReporter(sc)

# DataModelTools handles the data model and the data conversions.
dmt = DataModelTools()

# The data model is basically a dict which maps a column name to either
# {"min": x, "max": y} for numeric fields or [val1, val2, ... valN] for
# string fields.
datamodel = dmt.computeDataModel(df.select(*predictors))

# Convert the DataFrame to an RDD of DenseVector for the chosen predictors;
# only the second item of every extracted element is kept.
extracted = dmt.extractDenseVector(df, predictors, setToFlag=1.0)
lp = extracted.map(lambda x: x[1]).cache()

# Build the PCA model.
from pyspark.mllib.feature import PCA
estimator = PCA(k_param)
pcamodel = estimator.fit(lp)

# Extract the model coefficients by transforming dummy rows in which one
# predictor is set to 1 and all others to 0 (the python wrapper does not seem
# to provide direct access in Spark 1.5).
n_predictors = len(predictors)
coefficients = [[0.0] * n_predictors for _ in range(k_param)]
for c in range(n_predictors):
    vec = [0.0] * n_predictors
    vec[c] = 1.0
    arr = pcamodel.transform(DenseVector(vec)).toArray()
    # Component i of the transformed unit vector is the coefficient of
    # predictor c in principal component i.
    for i in range(k_param):
        coefficients[i][c] = arr[i]
"""" Program: PCA Description: 调用spark内置的PCA算法 Author: zhenglei - [email protected] Date: 2016-01-14 13:45:02 # Last modified: 2016-01-28 19:23:14 Python release: 2.7 """ # 调用spark内置的pca算法对机器学习实战中的第十三章数据集进行降维处理 from numpy import array from pyspark import SparkContext from pyspark.mllib.feature import PCA from pyspark.mllib.linalg import Vectors if __name__ == '__main__': sc = SparkContext() tmpdatas = sc.textFile('pcaTestSet.txt') datas = tmpdatas.map(lambda line: Vectors.dense( array([float(line.split('\t')[0]), float(line.split('\t')[1])]))) print datas.collect()[0] # 将输入降维成1维数据,并测试降维模型的准确性 model = PCA(1).fit(datas) transforms = model.transform(datas) print transforms.collect()[0], array(transforms.collect()).shape # 测试输入[10.235186,11.321997]之后的降维值 print model.transform(array([10.235186, 11.321997])) sc.stop()
if __name__ == "__main__":
    # Imported locally so this block does not depend on the file header.
    from pyspark.ml.classification import LinearSVC

    spark = SparkSession\
        .builder\
        .appName("linearSVC Example")\
        .getOrCreate()

    # $example on$
    # Load training data
    inputData = spark.read.format("libsvm") \
        .load("combined_data_svm.txt")

    # generate the train/test split.
    (train, test) = inputData.randomSplit([0.8, 0.2])

    # The data is a DataFrame, so the DataFrame-based PCA estimator has to be
    # used. The RDD-based PCAmllib cannot be fitted on a DataFrame, and its
    # model has no inverse_transform method.
    pca = PCAml(k=2, inputCol="features", outputCol="pca")
    model = pca.fit(train)
    transform = model.transform(train)

    # The evaluator needs a "prediction" column, which PCA alone does not
    # produce: train a classifier on the reduced features.
    # NOTE(review): LinearSVC is assumed from the application name and the
    # previously commented-out `lsvcModel.transform(test)`; it only supports
    # binary labels - confirm it matches the data set.
    lsvcModel = LinearSVC(featuresCol="pca").fit(transform)

    # score the model on test data (projected with the PCA fitted on train).
    predictions = lsvcModel.transform(model.transform(test))

    # obtain evaluator.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

    # compute the classification error on test data.
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
    # $example off$