from pyspark.mllib.feature import StandardScaler, StandardScalerModel

scaler = StandardScaler(withMean=True, withStd=True).fit(rdd_data)
sample_mean = scaler.call('mean')
# Actually scale the dataset:
rdd_norm = scaler.transform(rdd_data)

# In[Reduction]:
# Compute the new PCA dimensions:
from pyspark.mllib.feature import PCA as PCAmllib

Neof = 20
reducer = PCAmllib(Neof).fit(rdd_norm)
# print(type(reducer))
# Actually reduce the dataset:
rdd_reduced = reducer.transform(rdd_norm)
# print(type(rdd_reduced))

# In[Classification with k-means]:
### Run KMeans to build the classification model
from pyspark.mllib.clustering import KMeans as KMeansmllib
import time

start_time = time.time()
NBCLUSTERS = 8
INITMODE = 'k-means||'  # 'k-means||' or 'random'
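# A minimal sketch (assumption, not in the original snippet): train the KMeans
# model on the PCA-reduced data and report the wall-clock training time,
# reusing rdd_reduced, NBCLUSTERS, INITMODE and start_time defined above.
kmeans_model = KMeansmllib.train(rdd_reduced, NBCLUSTERS,
                                 maxIterations=100,
                                 initializationMode=INITMODE)
print("KMeans training took %.1f s" % (time.time() - start_time))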
from pyspark.mllib.feature import HashingTF, IDF, PCA as PCAmllib

rawData = sc.textFile("e:/sundog-consult/Udemy/DataScience/subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

# Store the document names for later:
documentNames = fields.map(lambda x: x[1])

# Now hash the words in each document to their term frequencies:
hashingTF = HashingTF(100000)  # 100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

# At this point we have an RDD of sparse vectors representing each document,
# where each value maps to the term frequency of each unique hash value.

# Let's compute the TF*IDF of each term in each document:
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# Now we have an RDD of sparse vectors, where each value is the TF*IDF
# of each unique hash value for each document.
model = PCAmllib(2).fit(tfidf)
pc = model.transform(tfidf)
# Alternative: compute the principal components directly on a RowMatrix:
#mat = RowMatrix(tfidf)
#pc = mat.computePrincipalComponents(int(mat.numCols))
print("Principal components:")
print(pc)
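# A hedged sketch (not part of the original): pair each document name with its
# 2-D principal-component coordinates for a quick look. zip() assumes both RDDs
# keep the same partitioning and element counts, which holds here because both
# derive from the same fields RDD through map-only transformations.
for name, coords in documentNames.zip(pc).take(5):
    print(name, coords)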
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import StandardScaler, PCA as PCAmllib
from pyspark.mllib.clustering import KMeans as mllibKMeans

rdd_loaded.count()
rdd_b = rdd_loaded.flatMap(lambda x: x[2]).map(lambda x: Vectors.dense(x))
print(rdd_b.count())
print(rdd_b.take(1))

#
# Profiles standardisation
#
new_scaler = StandardScaler(withMean=True, withStd=True).fit(rdd_b)
print(type(new_scaler))
rdd_std = new_scaler.transform(rdd_b)

#
# Profiles compression with PCA
#
model = PCAmllib(10).fit(rdd_std)
print(type(model))
transformed = model.transform(rdd_std)
print(type(transformed))
print(transformed.count())
print(transformed.first())

#
# Train a Profiles classification model with KMeans
#
NBCLUSTERS = 8
INITMODE = 'k-means||'  # 'k-means||' or 'random'
clusters = mllibKMeans.train(transformed, NBCLUSTERS,
                             maxIterations=100,
                             initializationMode=INITMODE)
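# A minimal sketch (assumption, not in the original): gauge clustering quality
# via the Within Set Sum of Squared Errors exposed by KMeansModel.computeCost().
wssse = clusters.computeCost(transformed)
print("WSSSE for %d clusters: %f" % (NBCLUSTERS, wssse))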
import numpy as np
from numpy import array
from math import sqrt
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import PCA as PCAmllib
from pyspark.mllib.clustering import KMeans

#data = sc.textFile("iris_data.txt")  # for master local or standalone mode
data = sc.textFile("hdfs://master:9000/root/pyspark_test/iris_data.txt")  # for Hadoop YARN
parsedData = data.map(lambda line: array([x for x in line.split(',')]))
first_data = parsedData.take(1)[0]
data_row = len(first_data)  # counts the many input attributes plus one output attribute

params_only = parsedData.map(
    lambda x: Vectors.dense(np.float_(x[0:(data_row - 1)])))
#params_only.take(5)  # the type of params_only is pyspark.rdd.PipelinedRDD
#params_only = parsedData.map(lambda x: array(np.float_(x[0:(data_row - 1)])))

model_test = PCAmllib(2).fit(params_only)
transformed = model_test.transform(params_only)
#transformed.collect()
pca_2d = transformed.collect()

# Evaluate clustering by computing the Within Set Sum of Squared Errors.
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

k = 3
# The original snippet was truncated here; maxIterations is an assumed argument.
clusters = KMeans.train(params_only, k, maxIterations=100)
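# A minimal sketch (assumption, not in the original): sum the per-point error
# defined above to obtain the Within Set Sum of Squared Errors of the model.
WSSSE = params_only.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))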
def reduceDimensions(features_rdd):
    model = PCAmllib(2).fit(features_rdd)
    transformed_rdd = model.transform(features_rdd)
    return transformed_rdd
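# A hedged usage sketch (the sample data is illustrative, not from the original):
# reduceDimensions expects an RDD of mllib vectors and returns their
# two-dimensional PCA projection. PCAmllib must be imported before the call.
from pyspark.mllib.feature import PCA as PCAmllib
from pyspark.mllib.linalg import Vectors

sample_rdd = sc.parallelize([Vectors.dense([1.0, 2.0, 3.0]),
                             Vectors.dense([4.0, 5.0, 6.0]),
                             Vectors.dense([7.0, 8.0, 10.0])])
print(reduceDimensions(sample_rdd).collect())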
    # Tail of buildRow(): assemble the key fields and the per-interval values
    # into a single dense feature vector.
    m_source_list = [key[0], key[1], key[2]] + \
                    [v[1] for v in vals] + \
                    [v[2] for v in vals] + \
                    [v[3] for v in vals] + \
                    [v[4] for v in vals] + \
                    [v[5] for v in vals]
    return Vectors.dense(m_source_list)

# COMMAND ----------

m_file_name = '/home/dyerke/Documents/DSE/capstone_project/traffic/data/01_2010'
lines = sc.textFile(m_file_name, minPartitions=4)
newrows = lines.flatMap(parseInfo).groupByKey().map(buildRow)

# COMMAND ----------

t = newrows.first()
print(type(t), t)

# COMMAND ----------

from pyspark.mllib.feature import PCA as PCAmllib

model = PCAmllib(2).fit(newrows)
transformed = model.transform(newrows)

# COMMAND ----------

t = transformed.first()
print(type(t), t)
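# COMMAND ----------

# A hedged sketch (not part of the original): pull the 2-D projection back to
# the driver as plain (x, y) tuples, e.g. for plotting the traffic profiles.
points = transformed.map(lambda v: (float(v[0]), float(v[1]))).collect()
print(len(points), points[:3])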
if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("linearSVC Example")\ .getOrCreate() # $example on$ # Load training data inputData = spark.read.format("libsvm") \ .load("combined_data_svm.txt") # generate the train/test split. (train, test) = inputData.randomSplit([0.8, 0.2]) pca = PCAml(k=2, inputCol="features", outputCol="pca") model = PCAmllib(2).fit(train) transform = model.transform(train) predictions = model.inverse_transform(test) # score the model on test data. #predictions = lsvcModel.transform(test) # obtain evaluator. evaluator = MulticlassClassificationEvaluator(metricName="accuracy") # compute the classification error on test data. accuracy = evaluator.evaluate(predictions) print("Test Error = %g" % (1.0 - accuracy)) # $example off$