def markDelay(v):
    return LabeledPoint(v[0], np.array(v[1:]))
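# Hedged usage sketch (the names and data below are assumptions, not from the
# original source): markDelay expects a numeric row whose first field is the
# delay label and whose remaining fields are the features.
import numpy as np
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext.getOrCreate()
rows = sc.parallelize([[15.0, 1.0, 2.0], [0.0, 3.0, 4.0]])
print(rows.map(markDelay).collect())
# [LabeledPoint(15.0, [1.0,2.0]), LabeledPoint(0.0, [3.0,4.0])]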
    mse = scoreAndLabels.map(lambda ab: (ab[0] - ab[1]) ** 2).mean()
    rmse = math.sqrt(mse)
    # metrics = RegressionMetrics(scoreAndLabels)
    # RMSE = metrics.rootMeanSquaredError
    return mae, rmse

sc = SparkContext()
selcol = [1, 3, 4, 6, 18, 23, 25]
train = prep_Data("HW4/200[3-7].csv", selcol)
test = prep_Data("HW4/2008.csv", selcol)

# Transform the data into the format the model expects.
trainLabeled = train.map(
    lambda line: LabeledPoint(extract_label(line), extract_features(line)))
testLabeled = test.map(
    lambda line: LabeledPoint(extract_label(line), extract_features(line)))

# Preserve part of the training data as a validation set.
train_dataset, val_dataset = trainLabeled.randomSplit([0.7, 0.3])

# Train
linear_model_val = LinearRegressionWithSGD.train(train_dataset, 100000, 0.00000000001)
linear_model = LinearRegressionWithSGD.train(trainLabeled, 100000, 0.00000000001)
# evaluateModel(linear_model_val, val_dataset)
# evaluateModel(linear_model, testLabeled)
def main(sc):
    train_id = utils.load("data_id/train.p")
    test_id = utils.load("data_id/test.p")
    meta(train_id)
    train_id = [[idx] for idx in train_id]
    test_id = [[idx] for idx in test_id]
    sqlContext = SQLContext(sc)
    train_f = sqlContext.createDataFrame(train_id, ['biz_id'])
    test_f = sqlContext.createDataFrame(test_id, ['biz_id'])

    # Register user-defined functions
    # city = udf(lambda b_id: get_city(b_id), StringType())
    state = udf(lambda b_id: MLVectors.dense(get_state(b_id)), VectorUDT())
    stars = udf(lambda b_id: get_stars(b_id), FloatType())
    popularity = udf(lambda b_id: get_popularity(b_id), IntegerType())
    name_size = udf(lambda b_id: get_name_size(b_id), IntegerType())
    name_polar = udf(lambda b_id: get_name_polar(b_id), FloatType())
    pos_neg_score = udf(lambda b_id: MLVectors.dense(get_PosNeg_score(b_id)), VectorUDT())
    # clarity = udf(lambda b_id: get_clarity(b_id), ArrayType(FloatType()))
    elite_cnt = udf(lambda b_id: get_elite_cnt(b_id), IntegerType())
    label = udf(lambda b_id: get_y(b_id), IntegerType())

    # Generate feature columns for the training set
    # data_f = data_f.withColumn("city", city(data_f['biz_id']))
    train_f = train_f.withColumn("state", state(train_f['biz_id']))
    train_f = train_f.withColumn("stars", stars(train_f['biz_id']))
    train_f = train_f.withColumn("popularity", popularity(train_f['biz_id']))
    train_f = train_f.withColumn("name_size", name_size(train_f['biz_id']))
    train_f = train_f.withColumn("name_polar", name_polar(train_f['biz_id']))
    train_f = train_f.withColumn("pos_neg_score", pos_neg_score(train_f['biz_id']))
    # data_f = data_f.withColumn("clarity", clarity(data_f['biz_id']))
    train_f = train_f.withColumn("elite_cnt", elite_cnt(train_f['biz_id']))
    train_f = train_f.withColumn("y", label(train_f['biz_id']))
    train_f.show(5)

    # Generate feature columns for the test set
    test_f = test_f.withColumn("state", state(test_f['biz_id']))
    test_f = test_f.withColumn("stars", stars(test_f['biz_id']))
    test_f = test_f.withColumn("popularity", popularity(test_f['biz_id']))
    test_f = test_f.withColumn("name_size", name_size(test_f['biz_id']))
    test_f = test_f.withColumn("name_polar", name_polar(test_f['biz_id']))
    test_f = test_f.withColumn("pos_neg_score", pos_neg_score(test_f['biz_id']))
    test_f = test_f.withColumn("elite_cnt", elite_cnt(test_f['biz_id']))
    test_f = test_f.withColumn("y", label(test_f['biz_id']))
    test_f.show(5)

    # One-hot encoding
    # encoder = OneHotEncoder(inputCol="state", outputCol="stateVec")
    # train_f = encoder.transform(train_f)
    train_f.show(5)
    # test_f = encoder.transform(test_f)
    test_f.show(5)

    # Assemble columns into a single feature vector
    assembler = VectorAssembler(inputCols=[
        "state", "stars", "popularity", "name_size", "name_polar",
        "pos_neg_score", "elite_cnt"
    ], outputCol="features")
    train_f = assembler.transform(train_f)
    train_f.show(5)
    test_f = assembler.transform(test_f)
    test_f.show(5)

    train_f = train_f.filter(train_f.y != -1)
    test_f = test_f.filter(test_f.y != -1)

    train_d = (train_f.select(col("y"), col("features"))
               .rdd
               .map(lambda row: LabeledPoint(float(row.y),
                                             MLLibVectors.fromML(row.features))))
    m = SVMWithSGD.train(train_d)
    predictionAndLabels = test_f.rdd.map(lambda row: (
        float(m.predict(MLLibVectors.fromML(row.features))), float(row.y)))

    # Grid search for best params and model
    # scores = {}
    # max_score = 0
    # for m in model_list:
    #     print('run', m)
    #     evaluator = BinaryClassificationEvaluator()
    #     cv = CrossValidator(estimator=model_list[m],
    #                         estimatorParamMaps=params_list[m],
    #                         evaluator=evaluator,
    #                         numFolds=3)
    #     cv.fit(train)
    #     scores[m] = cv.get_best_score()
    #     if scores[m] > max_score:
    #         op_params = params_list[m][cv.get_best_index()]
    #         op_model = cv.get_best_model()
    #         op_m_name = m
    # predictionAndLabels = test.map(lambda lp: (float(op_model.predict(lp.features)), lp.y))

    # Instantiate metrics objects
    bi_metrics = BinaryClassificationMetrics(predictionAndLabels)
    mul_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under the precision-recall curve
    print("Area under PR = %s" % bi_metrics.areaUnderPR)
    # Area under the ROC curve
    print("Area under ROC = %s" % bi_metrics.areaUnderROC)
    # Confusion matrix
    print("Confusion Matrix")
    print(mul_metrics.confusionMatrix().toArray())

    # Overall statistics
    precision = mul_metrics.precision()
    recall = mul_metrics.recall()
    f1Score = mul_metrics.fMeasure()
    accuracy = mul_metrics.accuracy
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    print("Accuracy = %s" % accuracy)

    # Per-class statistics
    labels = [0, 1]
    for label in labels:
        print("Class %s precision = %s" % (label, mul_metrics.precision(label)))
        print("Class %s recall = %s" % (label, mul_metrics.recall(label)))
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Create a labeled point with a positive label and a dense feature vector.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])
# Create a labeled point with a negative label and a sparse feature vector.
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))

from pyspark.mllib.linalg import Matrix, Matrices

# Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)); values are
# given in column-major order.
dm2 = Matrices.dense(3, 2, [1, 3, 5, 2, 4, 6])
# Sparse matrix in CSC format: dimensions, column pointers, row indices, values.
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
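# A quick sanity check (a hedged addition, not in the original): densifying the
# sparse matrix shows the CSC layout -- column pointers [0, 1, 3] place value 9
# at (0, 0), 8 at (1, 1), and 6 at (2, 1).
print(sm.toArray())
# [[ 9.  0.]
#  [ 0.  8.]
#  [ 0.  6.]]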
labelIndexer = StringIndexer(inputCol="f3", outputCol="att_f3")
model = labelIndexer.fit(df4)
df5 = model.transform(df4)

from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

va = VectorAssembler(inputCols=["att_a", "att_f1", "att_f2", "att_f3"],
                     outputCol="features")
df6 = va.transform(df5)
df7 = df6.withColumnRenamed('lables', 'label')
trainDf = df7.select('label', 'features')
trainDf.printSchema()
trainDf.show()

from pyspark.mllib.regression import LabeledPoint

# DataFrames no longer expose map() directly; go through the underlying RDD.
trainRdd = trainDf.rdd.map(lambda row: LabeledPoint(row.label, row.features))
print(trainRdd.take(20))

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.01)
model1 = lr.fit(trainDf)
print(model1.coefficients)
print(model1.intercept)

from pyspark.sql import Row

test0 = sc.parallelize([Row(features=Vectors.dense(2, 0, 0, 1))]).toDF()
result = model1.transform(test0).head()
print(result.prediction)
def parse(lp):
    label = float(lp[lp.find('(') + 1: lp.find(')')])
    vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    return LabeledPoint(label, vec)
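# Hedged usage sketch (the line shape is inferred from the slicing, not stated
# in the original): the label sits inside parentheses, the features inside
# square brackets.
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

point = parse("(1.0) [0.5,2.0,3.0]")
print(point.label, point.features)  # 1.0 [0.5,2.0,3.0]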
    else:
        return acum / count

sc = SparkContext(conf=SparkConf())
learn = sc.textFile('parsedTrainSmall.csv', 8)
learn = learn.map(lambda x: x.split('|')).map(
    lambda x: (x[0], x[2],
               dame_minhashes_shingles2(dame_shingles_words(x[1], 3, 15))))

from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import DenseVector

data_for_decision_tree = learn.map(
    lambda x: LabeledPoint(label=x[1], features=DenseVector(x[2])))
(dataTrain, dataTest) = data_for_decision_tree.randomSplit([0.7, 0.3])
model = DecisionTree.trainRegressor(dataTrain,
                                    categoricalFeaturesInfo={},
                                    impurity='variance',
                                    maxDepth=5,
                                    maxBins=32)
predictions = model.predict(dataTest.map(lambda x: x.features))
labelsAndPredictions = dataTest.map(lambda x: x.label).zip(predictions)
testMSE = labelsAndPredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(dataTest.count())
print("MSE = %f" % testMSE)

learn = learn.map(lambda x: (x[0], x[1], dame_hash_bandas(x[2])))
learn = learn.flatMap(lambda x: flatmapeo(x[0], x[1], x[2]))
# (u'a9wx8dk93sn5', u'1.0', 813759583895638922)
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.linalg import SparseVector, DenseVector

sparse_data = [
    LabeledPoint(0.0, DenseVector([0, 1.0, 2])),
    LabeledPoint(1.0, DenseVector([0, 1, 1.0])),
    LabeledPoint(0.0, DenseVector([1, 0, 1.0])),
    LabeledPoint(1.0, DenseVector([1, 1.3, 2.0])),
    LabeledPoint(1.0, DenseVector([1, 2.1, 1.6])),
]
# The training data must be an RDD, so parallelize the local list first.
data = sc.parallelize(sparse_data)
# categoricalFeaturesInfo={0: 2} marks feature 0 as categorical with 2 values.
model = GradientBoostedTrees.trainRegressor(data,
                                            categoricalFeaturesInfo={0: 2},
                                            numIterations=10)
print(model.numTrees())
print(model.totalNumNodes())
model.predict(DenseVector([1, 1, 1.0]))
model.predict(DenseVector([0, 0, 1.0]))
# rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
# model.predict(rdd).collect()
# Tokenize the review paragraphs into words.
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)

# Hash the words into a fixed-size term-frequency vector.
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)

# Fit the IDF model and rescale the term frequencies.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label", "features", "id")

# Create an RDD of (id, LabeledPoint) pairs.
lpSelectData = selectData.rdd.map(
    lambda x: (x.id, LabeledPoint(x.label, x.features)))

# Split the data into training and test sets.
(trainingData, testData) = lpSelectData.randomSplit([0.9, 0.1])

# Train the logistic regression with LBFGS model.
lrm = LogisticRegressionWithLBFGS.train(trainingData.map(lambda x: x[1]),
                                        iterations=10)

# Fetch the ids, labels, and predictions for the test data.
labelsAndPreds = testData.map(
    lambda p: (p[0], p[1].label, lrm.predict(p[1].features)))

# Calculate the accuracy and print it.
accuracy = labelsAndPreds.filter(lambda ivp: ivp[1] == ivp[2]).count() / float(
    testData.count())
print("Accuracy = " + str(accuracy))
                  header="false")
val = spark.read.load("hdfs://10.190.2.112/data/val_set.txt",
                      format="csv", sep="\t", inferSchema="true",
                      header="false")
test = spark.read.load("hdfs://10.190.2.112/data/test_set.txt",
                       format="csv", sep="\t", inferSchema="true",
                       header="false")

# Create features and labels
HDF = HashingTF(50)
train = train.rdd.map(
    lambda x: LabeledPoint(x[6] == 'E', HDF.transform([x[2], x[3]])))
test = test.rdd.map(
    lambda x: LabeledPoint(x[6] == 'E', HDF.transform([x[2], x[3]])))
val = val.rdd.map(
    lambda x: LabeledPoint(x[6] == 'E', HDF.transform([x[2], x[3]])))

with open('H2_15300180012_output.txt', 'w') as f:
    f.write('H2_15300180012_output\n')

def do_training(para=1.0):
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('Naive Bayes parameter: {} \n'.format(para))
    # Train a naive Bayes model.
    model = NaiveBayes.train(train, para)
def parseInput(line):
    return LabeledPoint(float(line[1]), line[0])
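# Hedged usage sketch (the (features, label) ordering is inferred from the
# indexing, not stated in the original):
from pyspark.mllib.regression import LabeledPoint

print(parseInput(([1.0, 2.0], '3.5')))  # (3.5,[1.0,2.0])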
def get_RDDs(data, corpus, weights, questions, labels):
    w2v = word2vec.Word2Vec(corpus, size=100, window=20, min_count=1, workers=40)
    '''
    one   : tfidf scores only
    two   : word2vec vectors only
    three : jaccard index only
    four  : word2vec * tfidf
    five  : word2vec * tfidf, jaccard index

    sum or mean: how the word vectors for the entire sentence were combined
    into one vector or number.
    cosine or sqeuclidean: similarity measurement on the two sum/mean vectors.
    '''
    one = questions.map(lambda x: (weight_vector_tfidf(weights, x[0]),
                                   weight_vector_tfidf(weights, x[1])))
    one_sum = one.map(lambda x: (get_sum(x[0]), get_sum(x[1])))
    one_mean = one.map(lambda x: (get_mean(x[0]), get_mean(x[1])))
    two = questions.map(lambda x: (weight_vector_w2v(w2v, x[0]),
                                   weight_vector_w2v(w2v, x[1])))
    two_sum = two.map(lambda x: (sum_w2v(x[0]), sum_w2v(x[1])))
    two_mean = two.map(lambda x: (mean_w2v(x[0]), mean_w2v(x[1])))
    three = questions.map(lambda x: jaccard_index(x[0], x[1]))
    four = questions.map(lambda x: (weight_vector_both(weights, w2v, x[0]),
                                    weight_vector_both(weights, w2v, x[1])))
    four_sum = four.map(lambda x: (sum_w2v(x[0]), sum_w2v(x[1])))
    four_mean = four.map(lambda x: (mean_w2v(x[0]), mean_w2v(x[1])))
    five_sum = four_sum.zip(three)
    five_mean = four_mean.zip(three)

    labels = labels.coalesce(1)
    one_sum_difference = labels.zip(
        one_sum.map(lambda x: abs(x[0] - x[1])).coalesce(1)).repartition(
            100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    one_mean_difference = labels.zip(
        one_mean.map(lambda x: abs(x[0] - x[1])).coalesce(1)).repartition(
            100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    two_sum_cosine = labels.zip(
        two_sum.map(lambda x: get_cosine(x)).coalesce(1)).repartition(100).map(
            lambda x: LabeledPoint(x[0], [x[1]]))
    two_sum_sqeuclidean = labels.zip(
        two_sum.map(lambda x: sqeuclidean(x[0], x[1])).coalesce(
            1)).repartition(100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    two_mean_cosine = labels.zip(
        two_mean.map(lambda x: get_cosine(x)).coalesce(1)).repartition(
            100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    two_mean_sqeuclidean = labels.zip(
        two_mean.map(lambda x: sqeuclidean(x[0], x[1])).coalesce(
            1)).repartition(100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    three = labels.zip(
        three.coalesce(1)).map(lambda x: LabeledPoint(x[0], [x[1]]))
    four_sum_cosine = labels.zip(
        four_sum.map(lambda x: get_cosine(x)).coalesce(1)).repartition(
            100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    four_sum_sqeuclidean = labels.zip(
        four_sum.map(lambda x: sqeuclidean(x[0], x[1])).coalesce(
            1)).repartition(100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    four_mean_cosine = labels.zip(
        four_mean.map(lambda x: get_cosine(x)).coalesce(1)).repartition(
            100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    four_mean_sqeuclidean = labels.zip(
        four_mean.map(lambda x: sqeuclidean(x[0], x[1])).coalesce(
            1)).repartition(100).map(lambda x: LabeledPoint(x[0], [x[1]]))
    five_sum_cosine = labels.zip(
        five_sum.map(lambda x: (get_cosine(x[0]), x[1])).coalesce(1)).map(
            lambda x: LabeledPoint(x[0], [x[1][0], x[1][1]]))
    five_sum_sqeuclidean = labels.zip(
        five_sum.map(lambda x: (sqeuclidean(x[0][0], x[0][1]), x[1])).coalesce(
            1)).map(lambda x: LabeledPoint(x[0], [x[1][0], x[1][1]]))
    five_mean_cosine = labels.zip(
        five_mean.map(lambda x: (get_cosine(x[0]), x[1])).coalesce(1)).map(
            lambda x: LabeledPoint(x[0], [x[1][0], x[1][1]]))
    five_mean_sqeuclidean = labels.zip(
        five_mean.map(lambda x: (sqeuclidean(x[0][0], x[0][1]), x[1])).coalesce(
            1)).map(lambda x: LabeledPoint(x[0], [x[1][0], x[1][1]]))

    RDDs = [
        one_sum_difference, one_mean_difference, two_sum_cosine,
        two_sum_sqeuclidean, two_mean_cosine, two_mean_sqeuclidean, three,
        four_sum_cosine, four_sum_sqeuclidean, four_mean_cosine,
        four_mean_sqeuclidean, five_sum_cosine, five_sum_sqeuclidean,
        five_mean_cosine, five_mean_sqeuclidean
    ]
    return RDDs
def parsePoint(line):
    line = line.replace("[", '')
    line = line.replace("]", '')
    line = line.replace(" ", '')
    values = [int(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])
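# Hedged usage sketch (line shape inferred from the replace calls): a bracketed,
# comma-separated list of integers with the label first.
from pyspark.mllib.regression import LabeledPoint

print(parsePoint("[1, 0, 3, 5]"))  # (1.0,[0.0,3.0,5.0])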
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("Naive Bayes").setMaster("yarn")
sc = SparkContext(conf=conf)

data = [
    LabeledPoint(0, Vectors.dense([1, 0, 0])),
    LabeledPoint(0, Vectors.dense([2, 0, 0])),
    LabeledPoint(1, Vectors.dense([0, 1, 0])),
    LabeledPoint(1, Vectors.dense([0, 2, 0])),
    LabeledPoint(2, Vectors.dense([0, 0, 1])),
    LabeledPoint(2, Vectors.dense([0, 0, 2]))
]
# $example on$
data = sc.parallelize(data)

# Split data approximately into training (60%) and test (40%).
training, test = data.randomSplit([0.6, 0.4], seed=0)
training.cache()
model = NaiveBayes.train(training, 1.0)
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(
    lambda xv: xv[0] == xv[1]).count() / test.count()
print("Test Data:")
print(test.collect())
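# Hedged addition: the snippet computes the accuracy but never shows it.
print("Accuracy = %s" % accuracy)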
sc = SparkContext("local", "titanic_test")
sqlContext = SQLContext(sc)
df = pd.read_csv('Titanic_train.csv')
df['Sex'] = df['Sex'].replace('female', 1)
df['Sex'] = df['Sex'].replace('male', 0)
# Missing ages are NaN floats, not the string 'NaN'; use fillna instead.
df['Age'] = df['Age'].fillna(-1)
traindf = pd.DataFrame(df, columns=['Survived', 'Pclass', 'Age', 'Sex', 'Fare'])
sdf = sqlContext.createDataFrame(traindf)

import pyspark.mllib.classification as sparkclass

# Map through the underlying RDD; x[1:] is already a sequence of features.
temp = sdf.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
# lrm = sparkclass.SVMWithSGD.train(temp, iterations=10)
# lrm = sparkclass.LogisticRegressionWithSGD.train(temp, iterations=10)
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

lrm = DecisionTree.trainClassifier(temp,
                                   numClasses=2,
                                   categoricalFeaturesInfo={},
                                   impurity='gini',
                                   maxDepth=5,
                                   maxBins=32)
df = pd.read_csv('Titanic_test.csv')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Project     : tql-Python.
# @File        : libsvm2df
# @Time        : 2020-01-22 12:16
# @Author      : yuanjie
# @Email       : [email protected]
# @Software    : PyCharm
# @Description : https://blog.csdn.net/weixin_42286026/article/details/84496896

from pyspark import SparkContext
from pyspark.mllib.util import MLUtils

# A real SparkContext is required here (the original left a placeholder string).
sc = SparkContext.getOrCreate()
(MLUtils.loadLibSVMFile(sc, 'libsvm__').map(
    lambda r: (r.label, r.features.toArray())).saveAsTextFile('ffm'))

from pyspark.mllib.regression import LabeledPoint

# Assumes a DataFrame `sparkdf` whose last column is the label.
labelpointRDD = sparkdf.rdd.map(lambda row: LabeledPoint(row[-1], row[:-1]))
def create_labeled_point(line_split):
    clean_line_split = line_split[0:41]

    # Convert protocol to a numeric categorical variable.
    try:
        clean_line_split[1] = protocols.index(clean_line_split[1])
    except:
        clean_line_split[1] = len(protocols)

    # Convert service to a numeric categorical variable.
    try:
        clean_line_split[2] = services.index(clean_line_split[2])
    except:
        clean_line_split[2] = len(services)

    # Convert flag to a numeric categorical variable.
    try:
        clean_line_split[3] = flags.index(clean_line_split[3])
    except:
        clean_line_split[3] = len(flags)

    # Convert the attack label to a numeric class.
    # attack = 1.0
    attack = 4.0
    if line_split[41] == 'normal.':
        attack = 0.0
    # elif line_split[41] == 'back.':
    #     attack = 1.0
    # elif line_split[41] == 'land.':
    #     attack = 2.0
    # elif line_split[41] == 'neptune.':
    #     attack = 3.0
    # elif line_split[41] == 'pod.':
    #     attack = 4.0
    # elif line_split[41] == 'smurf.':
    #     attack = 5.0
    # elif line_split[41] == 'teardrop.':
    #     attack = 6.0
    elif line_split[41] == 'ipsweep.':
        attack = 1.0
    elif line_split[41] == 'nmap.':
        attack = 2.0
    # elif line_split[41] == 'portsweep.':
    #     attack = 3.0
    else:
        attack = 4.0
    # elif line_split[41] == 'imap.':
    #     attack = 10.0
    # elif line_split[41] == 'ftp_write.':
    #     attack = 11.0
    # elif line_split[41] == 'guess_passwd.':
    #     attack = 12.0
    # elif line_split[41] == 'spy.':
    #     attack = 13.0
    # elif line_split[41] == 'warezclient.':
    #     attack = 14.0
    # elif line_split[41] == 'warezmaster.':
    #     attack = 15.0
    # elif line_split[41] == 'multihop.':
    #     attack = 16.0
    # elif line_split[41] == 'phf.':
    #     attack = 17.0
    # elif line_split[41] == 'buffer_overflow.':
    #     attack = 18.0
    # elif line_split[41] == 'rootkit.':
    #     attack = 19.0
    # elif line_split[41] == 'perl.':
    #     attack = 20.0
    # elif line_split[41] == 'loadmodule.':
    #     attack = 21.0

    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD

data = [
    LabeledPoint(0.0, [0.0, 1.0]),
    LabeledPoint(1.0, [1.0, 0.0]),
]
lrm = LogisticRegressionWithSGD.train(sc.parallelize(data), iterations=10)
lrm.predict([1.0, 0.0])
lrm.predict([0.0, 1.0])
lrm.predict(sc.parallelize([[1.0, 0.0], [0.0, 1.0]])).collect()
# After clearThreshold(), predict() returns raw probabilities instead of 0/1 labels.
lrm.clearThreshold()
lrm.predict([0.0, 1.0])
def parsePoint(line):
    data = line[1:][:-1]
    values = [float(x) for x in data.split(', ')]
    return LabeledPoint(1 if values[34] > 0.5 else 0, values[:-1])
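# Hedged usage sketch (shape inferred from the slicing): the line is wrapped in
# single-character delimiters, fields are ", "-separated, field 34 holds the
# target, and the final column is dropped from the features.
from pyspark.mllib.regression import LabeledPoint

sample = "[" + ", ".join(["0.0"] * 34 + ["0.9", "7.0"]) + "]"
print(parsePoint(sample).label)  # 1.0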
def labeledPointConverter(row):
    try:
        return LabeledPoint(1.0, row[1:])
    except ValueError:
        return LabeledPoint(50.0, [1.0])
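# Hedged reading (an assumption, not stated in the original): rows whose tail
# cannot be coerced into a numeric vector fall back to a sentinel point.
from pyspark.mllib.regression import LabeledPoint

print(labeledPointConverter((0, 2.0, 3.0)))    # (1.0,[2.0,3.0])
print(labeledPointConverter((0, 'bad', 3.0)))  # (50.0,[1.0])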
def labelData(data):
    # Column 9 is the label; every other column through 15 becomes a feature.
    return data.map(lambda row: LabeledPoint(row[9], [
        row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7],
        row[8], row[10], row[11], row[12], row[13], row[14], row[15]
    ]))
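# Hedged usage sketch with a hypothetical 16-column RDD (assumes a SparkContext):
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext.getOrCreate()
rows = sc.parallelize([[float(i) for i in range(16)]])
print(labelData(rows).first())  # (9.0,[0.0,1.0,...,8.0,10.0,...,15.0])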
from pyspark.mllib.util import MLUtils

if __name__ == "__main__":
    if len(sys.argv) not in [1, 2]:
        print("Usage: correlations (<file>)", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="PythonCorrelations")
    if len(sys.argv) == 2:
        filepath = sys.argv[1]
    else:
        filepath = 'sample_linear_regression_data.txt'
    corrType = 'pearson'

    points = MLUtils.loadLibSVMFile(sc, filepath)\
        .map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))

    print()
    print('Summary of data file: ' + filepath)
    print('%d data points' % points.count())

    # Statistics (correlations)
    print()
    print('Correlation (%s) between label and each feature' % corrType)
    print('Feature\tCorrelation')
    numFeatures = points.take(1)[0].features.size
    labelRDD = points.map(lambda lp: lp.label)
    for i in range(numFeatures):
        featureRDD = points.map(lambda lp: lp.features[i])
        corr = Statistics.corr(labelRDD, featureRDD, corrType)
        print('%d\t%g' % (i, corr))
        i += 1
        step += len(m)
    num_vec = np.array([float(field) for field in record[5:6]])
    return np.concatenate((cat_vec, num_vec))

def extract_hp_label(record):
    return record[6]

def extract_acc_label(record):
    return record[5]

accData = records.map(
    lambda r: LabeledPoint(extract_acc_label(r), extract_features(r)))
hpData = records.map(
    lambda r: LabeledPoint(extract_hp_label(r), extract_features(r)))
acc_first_point = accData.first()
hp_first_point = hpData.first()

def extract_features_dt(record):
    # list() keeps this working on Python 3, where map() is lazy.
    return np.array(list(map(float, record[5:6])))

# Decision Tree Method
# Feature vector creation for acceleration
data_dt_acc = records.map(
def buildTfIdfRddAllTopics(business, sports, politics, entertainment):
    business_df = buildTextRDD(business, BUSINESS_LABEL)
    politics_df = buildTextRDD(politics, POLITICS_LABEL)
    sports_df = buildTextRDD(sports, SPORTS_LABEL)
    entertainment_df = buildTextRDD(entertainment, ENTERTAINMENT_LABEL)

    # Union together all dataframes
    main_df = business_df.union(politics_df)
    main_df = main_df.union(sports_df)
    main_df = main_df.union(entertainment_df)
    main_df = main_df.withColumnRenamed('_1', 'label')
    main_df = main_df.withColumnRenamed('_2', 'content')

    tokenizer = Tokenizer(inputCol="content", outputCol="words")
    wordsData = tokenizer.transform(main_df)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=8)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    return rescaledData.select(
        [c for c in rescaledData.columns if c in ['label', 'features']]
    ).rdd.map(lambda x: LabeledPoint(x.label, MLLibVectors.fromML(x.features)))
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])
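# Hedged usage sketch for space-separated lines, label first:
from pyspark.mllib.regression import LabeledPoint

print(parsePoint("1.0 0.5 2.0"))  # (1.0,[0.5,2.0])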
""" :param RDD: RDD, created from step_level_features :return: RDD with features, aggregated over the trip """ trip_lv = RDD.map(lambda x: (x[0], (min(x[1][4]), max(x[1][4]), min(x[1][5]), max(x[1][5]), len(x[1][0]), sum(x[1][4]), np.mean(x[1][4]), np.std(x[1][4]), np.mean(x[1][5]), np.std(x[1][5]), sum([elem < 0.5 for elem in x[1][4]])), x[2])) return trip_lv def create_logistic_model(RDD): """ :param RDD: RDD, create from trip_level_features "return: mllib logistic regression model """ label_pt = RDD.map(lambda x: LabeledPoint(x[2], x[1])) model = LogisticRegressionWithLBFGS.train(label_pt) return model def train_err(model): """ :param model: mllib logistic regresion model :return: training error for model """ labelsAndPreds = label_pt.map(lambda x: (x.label, model.predict(x.features))) trainErr = labelsAndPreds.filter(lambda (x, y): x != y).count() / float(label_pt.count()) return "Training Error = " + str(trainErr) """
def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[1], [values[0]])
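# Hedged usage sketch: two comma- or space-separated numbers per line, with the
# second field as the label and the first as the single feature.
from pyspark.mllib.regression import LabeledPoint

print(parsePoint("2.0,1.0"))  # (1.0,[2.0])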
# Skip the header row.
rows = lines.zipWithIndex().filter(lambda ri: ri[1] > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
review = parts.map(lambda p: Row(id=p[0],
                                 label=float(p[1]),
                                 review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label", "features")
lp = selectData.rdd.map(lambda x: LabeledPoint(x.label, x.features))
(trainingData, testData) = lp.randomSplit([0.6, 0.4])
model = NaiveBayes.train(trainingData, 1.0)
predictionAndLabel = testData.map(lambda p: (model.predict(p.features), p.label))
accuracy = 100 * predictionAndLabel.filter(
    lambda xv: xv[0] == xv[1]).count() / testData.count()
print(accuracy)
fp = predictionAndLabel.filter(lambda xv: xv[0] == 1).filter(
    lambda xv: xv[1] == 0).count()
tp = predictionAndLabel.filter(lambda xv: xv[0] == xv[1]).filter(
    lambda xv: xv[1] == 1).count()
totalpositive = predictionAndLabel.filter(lambda xv: xv[1] == 1).count()
recall = 100 * tp / totalpositive
precision = 100 * tp / (tp + fp)
import numpy as np
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark import SparkContext, SparkConf
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

conf = SparkConf().setMaster("local").setAppName("Test")
sc = SparkContext(conf=conf)

sparse_data = [
    LabeledPoint(0.0, Vectors.dense([1.0, 0.0])),
    LabeledPoint(1.0, Vectors.dense([0.0, 1.0])),
    LabeledPoint(0.0, Vectors.dense([10.0, 9.0])),
    LabeledPoint(1.0, Vectors.dense([9.0, 10.0]))
]
rdd = sc.parallelize(sparse_data)
model = LogisticRegressionWithSGD.train(rdd, iterations=10)
rdd = rdd.map(lambda x: x.features)
model.predict(rdd).saveAsTextFile("result/hdfs")
sc.stop()
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LabeledPoint

def parsePoint(line):
    values = line.split()
    return LabeledPoint(
        int(values[0]),
        DenseVector([int(x.split(':')[1]) for x in values[1:]]))
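# Hedged usage sketch for LibSVM-style lines (the indices before ':' are
# ignored; only the values after them are kept):
print(parsePoint("1 1:4 2:7"))  # (1.0,[4.0,7.0])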