def modelWithLogisticRegression(trainingData, validationData):
    """Train SGD logistic regression over a grid of regularization values.

    One model is trained on ``trainingData`` per regularization value, each
    with 200 iterations and a step size of 1.0, and scored by its accuracy
    on ``validationData``.

    Args:
        trainingData: RDD passed to ``LogisticRegressionWithSGD.train``.
        validationData: RDD whose elements expose ``label`` and ``features``.

    Returns:
        A tuple ``(bestLRModel, visualizationData)``. ``visualizationData`` is
        a list of ``(regularizer, accuracy)`` pairs in grid order.

    Raises:
        ZeroDivisionError: if ``validationData`` is empty.
    """
    # eta = [0.1, 0.3, 0.5, 1.0, 5.0]
    regularizationParamater = [.00000001, .0000005, 1., 1000., 100000.]
    bestLRModel = None
    bestAccuracy = 0
    numOfIterations = 200
    visualizationData = []

    # The validation set is the same for every candidate, so count it once
    # instead of launching a count job per regularization value.
    totalValidationAds = validationData.count()

    for regularizer in regularizationParamater:
        model = LogisticRegressionWithSGD.train(
            trainingData, numOfIterations, 1.0, regParam=regularizer)
        predict = validationData.map(
            lambda ad: (ad.label, model.predict(ad.features)))
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds
        visualizationData.append((regularizer, accuracy))

        # Accept the first candidate unconditionally so a model is returned
        # even when every candidate scores an accuracy of 0.
        if bestLRModel is None or accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestLRModel = model

    return bestLRModel, visualizationData
def main():
    """Train, evaluate and pickle a logistic-regression image classifier.

    Reads positive examples from ``pos.csv`` and negative examples from
    ``neg.csv`` (one comma-separated feature row per line), trains on a 70%
    random split, prints the error rate on the remaining 30% and writes the
    model to ``imageModel.pk1``.

    Raises:
        ZeroDivisionError: if the test split turns out empty.
    """
    MakePixelFileFromImages("./CarData/TrainImages/*pgm")
    sc = SparkContext(appName="Image Classifier 01")
    try:
        p = sc.textFile("pos.csv")
        n = sc.textFile("neg.csv")
        pFeatures = p.map(lambda image: image.split(","))
        nFeatures = n.map(lambda image: image.split(","))
        pExamples = pFeatures.map(lambda features: LabeledPoint(1, features))
        nExamples = nFeatures.map(lambda features: LabeledPoint(0, features))
        data = pExamples.union(nExamples)
        (trainingData, testData) = data.randomSplit([0.7, 0.3])
        trainingData.cache()
        model = LogisticRegressionWithSGD.train(trainingData)
        labels_and_predictions = testData.map(
            lambda image: (image.label, model.predict(image.features)))
        # Index into the pair instead of using a tuple parameter, which is a
        # syntax error on Python 3.
        wrong = labels_and_predictions.filter(
            lambda pair: pair[0] != pair[1]).count()
        error_rate = wrong / float(testData.count())
        print("************* RESULTS *******************")
        print("Error Rate: " + str(error_rate))
        # Close the file deterministically so the pickle is fully flushed.
        with open("imageModel.pk1", "wb") as model_file:
            pickle.dump(model, model_file)
    finally:
        # Release the Spark context even when training or evaluation fails.
        sc.stop()
def test_classification(self):
    """Check that LR, SVM and Naive Bayes separate four sparse points.

    Points whose only non-zero entry is at index 0 carry label 0.0; points
    whose only non-zero entry is at index 1 carry label 1.0.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes

    # (label, index of the non-zero entry, value of that entry)
    specs = [(0.0, 0, 1.0), (1.0, 1, 1.0), (0.0, 0, 2.0), (1.0, 1, 2.0)]
    points = [LabeledPoint(label, self.scipy_matrix(2, {index: value}))
              for label, index, value in specs]
    rdd = self.sc.parallelize(points)
    vectors = [point.features for point in points]

    # Train and verify one classifier at a time, in the original order.
    for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
        fitted = trainer.train(rdd)
        self.assertTrue(fitted.predict(vectors[0]) <= 0)
        self.assertTrue(fitted.predict(vectors[1]) > 0)
        self.assertTrue(fitted.predict(vectors[2]) <= 0)
        self.assertTrue(fitted.predict(vectors[3]) > 0)
def train_committee(train_features, test_features, size=5):
    """Build a committee of logistic-regression models trained on resamples.

    Each attempt trains a model on a with-replacement sample of the training
    points and keeps it only when its area under the ROC curve on the test
    points exceeds 0.7. At most ``size * 4`` attempts are made.

    Args:
        train_features: RDD mapped through ``process_batch`` and
            ``to_labeled_point`` to obtain training points.
        test_features: RDD mapped the same way to obtain evaluation points.
        size: number of models wanted in the committee.

    Returns:
        List of accepted models; may hold fewer than ``size`` entries when
        the attempt budget runs out.
    """
    committee = []
    attempts = 0
    max_attempts = size * 4
    roc_threshold = 0.7

    # NOTE(review): test features are processed with is_train=True, exactly
    # as in the original code -- confirm this is intended.
    test_pairs_features = test_features.map(lambda p: process_batch(p, is_train=True))
    test_labeled_pairs = test_pairs_features.map(to_labeled_point)

    # This transformation does not depend on the attempt, so define it once;
    # only the resampling below has to happen per attempt.
    train_labeled_pairs = train_features.map(
        lambda p: process_batch(p, is_train=True)).map(to_labeled_point)

    while len(committee) < size and attempts < max_attempts:
        attempts += 1
        labeled_points = train_labeled_pairs.sample(True, 1)
        model = LogisticRegressionWithSGD.train(labeled_points)
        model.clearThreshold()
        scores_and_labels = test_labeled_pairs.map(
            lambda p: (model.predict(p.features), p.label))
        # Read the metric once and reuse it for the comparison and the log.
        area_under_roc = BinaryClassificationMetrics(scores_and_labels).areaUnderROC
        if area_under_roc > roc_threshold:
            print(attempts, area_under_roc)
            committee.append(model)
    return committee
def test_classification(self):
    """Check that LR, SVM and Naive Bayes separate four dense points."""
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes

    points = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1]),
    ]
    rdd = self.sc.parallelize(points)
    vectors = [point.features.tolist() for point in points]

    # Train and verify one classifier at a time, in the original order.
    for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
        fitted = trainer.train(rdd)
        self.assertTrue(fitted.predict(vectors[0]) <= 0)
        self.assertTrue(fitted.predict(vectors[1]) > 0)
        self.assertTrue(fitted.predict(vectors[2]) <= 0)
        self.assertTrue(fitted.predict(vectors[3]) > 0)
def logistic_l2_accuracy(x_train, x_test, regParam):
    """Return the test accuracy of an L2-regularized SGD logistic regression.

    Args:
        x_train: RDD used for training; it is cached by this call.
        x_test: RDD whose elements expose ``label`` and ``features``.
        regParam: regularization parameter forwarded to the trainer.

    Returns:
        Fraction of ``x_test`` elements whose prediction equals the label.

    Raises:
        ZeroDivisionError: if ``x_test`` is empty.
    """
    # cache data to get reasonable speeds for methods like LogisticRegression and SVM
    xc = x_train.cache()
    # training logistic regression with L2 regularization
    model = LogisticRegressionWithSGD.train(xc, regParam=regParam, regType="l2")
    # making prediction on x_test
    yhat = x_test.map(lambda p: (p.label, model.predict(p.features)))
    # Index into the pair rather than unpacking it in the lambda signature;
    # tuple parameters are a syntax error on Python 3.
    correct = yhat.filter(lambda pair: pair[0] == pair[1]).count()
    # returning accuracy on x_test
    return correct / float(x_test.count())
def main():
    """Driver program for a spam filter using Spark and MLLib.

    Consolidates the e-mail files, hashes each e-mail into a 10,000-feature
    vector, trains logistic regression on a 70% random split, prints the
    error rate on the remaining 30% and pickles the model to
    ``spamFilter.pkl``.

    Raises:
        ZeroDivisionError: if the test split turns out empty.
    """
    # Consolidate the individual email files into a single spam file
    # and a single ham file
    makeDataFileFromEmails("data/spam_2/", "data/spam.txt")
    makeDataFileFromEmails("data/easy_ham_2/", "data/ham.txt")

    # Create the Spark Context for parallel processing
    sc = SparkContext(appName="Spam Filter")
    try:
        # Load the spam and ham data files into RDDs
        spam = sc.textFile("data/spam.txt")
        ham = sc.textFile("data/ham.txt")

        # Create a HashingTF instance to map email text to vectors of 10,000 features.
        tf = HashingTF(numFeatures=10000)

        # Each email is split into words, and each word is mapped to one feature.
        spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
        hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

        # Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
        positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
        negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

        # Combine positive and negative datasets into one
        data = positiveExamples.union(negativeExamples)

        # Split the data into 70% for training and 30% test data sets
        (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # Cache the training data to optimize the Logistic Regression
        trainingData.cache()

        # Train the model with Logistic Regression using the SGD algorithm.
        model = LogisticRegressionWithSGD.train(trainingData)

        # Create tuples of actual and predicted values
        labels_and_predictions = testData.map(
            lambda email: (email.label, model.predict(email.features)))

        # Calculate the error rate as number wrong / total number.
        # The pair is indexed instead of unpacked in the lambda signature,
        # because tuple parameters are a syntax error on Python 3.
        wrong = labels_and_predictions.filter(
            lambda pair: pair[0] != pair[1]).count()
        error_rate = wrong / float(testData.count())

        print("*********** SPAM FILTER RESULTS **********")
        print("\n")
        print("Error Rate: " + str(error_rate))
        print("\n")

        # Serialize the model for persistence; the context manager closes
        # the file so the pickle is fully flushed.
        with open("spamFilter.pkl", "wb") as model_file:
            pickle.dump(model, model_file)
    finally:
        # Release the Spark context even when a step above fails.
        sc.stop()
def test_classification(self):
    """Check that linear, Bayes and tree classifiers separate four points."""
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    points = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1]),
    ]
    rdd = self.sc.parallelize(points)
    vectors = [point.features.tolist() for point in points]

    def verify(fitted):
        # Even-indexed points belong to class 0, odd-indexed ones to class 1.
        self.assertTrue(fitted.predict(vectors[0]) <= 0)
        self.assertTrue(fitted.predict(vectors[1]) > 0)
        self.assertTrue(fitted.predict(vectors[2]) <= 0)
        self.assertTrue(fitted.predict(vectors[3]) > 0)

    verify(LogisticRegressionWithSGD.train(rdd))
    verify(SVMWithSGD.train(rdd))
    verify(NaiveBayes.train(rdd))

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    verify(DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo))
    verify(RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numTrees=100))
    verify(GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
def getLogisticRegressionModel(Train_Data):
    """Train an SGD logistic-regression model with fixed hyper-parameters.

    Uses 10 iterations, step size 10.0, mini-batch fraction 0.1, L2
    regularization with parameter 1e-6, and an intercept term.
    """
    sgd_options = {
        'iterations': 10,
        'miniBatchFraction': 0.1,
        'step': 10.,
        'regParam': 1e-6,
        'regType': 'l2',
        'intercept': True,
    }
    return LogisticRegressionWithSGD.train(data=Train_Data, **sgd_options)
def logisticRegression(trainingRDD, trainingRDDHashed, testRDDHashed,
                       iterations, minibatch, stepsize):
    """Train an L2 logistic regression and report accuracy and F-score.

    Args:
        trainingRDD: RDD passed to ``LogisticRegressionWithSGD.train``.
        trainingRDDHashed: RDD of pairs; element 0 is paired with the
            prediction for element 1 and classified by ``checkState``.
        testRDDHashed: RDD of pairs, evaluated the same way.
        iterations: number of SGD iterations.
        minibatch: mini-batch fraction.
        stepsize: SGD step size.

    Returns:
        Tuple ``(AccuracyV, fScoreV, AccuracyT, fScoreT)`` as computed by
        ``getAccuracy`` for the training and the test set.
    """
    # Train a Logistic Regression Model
    trainedModel = LogisticRegressionWithSGD.train(
        trainingRDD,
        iterations=iterations,
        miniBatchFraction=minibatch,
        regType="l2",
        intercept=True,
        regParam=0.1,
        step=stepsize)

    def _state_counts(hashedRDD):
        # Count how often each checkState() outcome occurs; outcomes that
        # never occur read as 0 through the defaultdict.
        counts = hashedRDD.map(
            lambda l_v: ((l_v[0], trainedModel.predict(l_v[1])), 1)
        ).map(
            lambda x_y: (checkState(x_y[0]), x_y[1])
        ).reduceByKey(add).collectAsMap()
        return defaultdict(lambda: 0, counts)

    # Test on Validation and Test Sets
    resultsValidation = _state_counts(trainingRDDHashed)
    resultsTest = _state_counts(testRDDHashed)

    # Get Counts
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()

    # Get F-Score and Accuracy Values
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)

    # Print Results. The formatting has to happen inside print(): applying
    # % to the value returned by print() raises TypeError on Python 3.
    print(' Results for Logistic Regression')
    print(' Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print(' Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))

    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
def train_trend_model(self, model, data, i):
    """Train the direction model selected by ``self.trend_prediction_method``.

    ``data`` is parallelized into an RDD first. Logistic regression and SVM
    start from the weights of the supplied ``model`` when one is given.
    When the configured method matches none of the known constants, the
    supplied ``model`` is returned unchanged. ``i`` is not used.
    """
    self.logger.info('Start to train the direction model')
    rdd_data = self.sc.parallelize(data)

    if self.trend_prediction_method == self.RANDOM_FOREST:
        return RandomForest.trainClassifier(
            rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40,
            featureSubsetStrategy="auto", impurity='gini', maxDepth=20,
            maxBins=32)

    if self.trend_prediction_method == self.NAIVE_BAYES:
        return NaiveBayes.train(rdd_data)

    if self.trend_prediction_method == self.LOGISTIC_REGRESSION:
        # Only read .weights in the linear branches.
        warm_start = model.weights if model is not None else None
        return LogisticRegressionWithSGD.train(
            rdd_data, iterations=10000, step=0.001, initialWeights=warm_start)

    if self.trend_prediction_method == self.SVM:
        warm_start = model.weights if model is not None else None
        return SVMWithSGD.train(
            rdd_data, iterations=10000, step=0.001, initialWeights=warm_start)

    return model
def main(input_file_path):
    """Train a gender classifier and write predictions for unlabeled rows.

    Rows of the CSV at ``input_file_path`` whose fourth column is non-empty
    (excluding the header row starting with ``INDEX``) are used for training;
    rows whose fourth column is empty are predicted and written to
    ``result.csv`` as ``INDEX,GENDER`` with the prediction shifted by +1.

    Args:
        input_file_path: path handed to ``sc.textFile``.
    """
    print('=====>>>>>')
    print('ddd')
    data = sc.textFile(input_file_path)
    traning_data_RDD = data.filter(
        lambda line: line.split(',')[3] != '' and line.split(',')[0] != 'INDEX')
    unseen_data_RDD = data.filter(lambda line: line.split(',')[3] == '')

    traning_data_pddf = create_pddf(traning_data_RDD)
    traning_data_df = sqlContext.createDataFrame(traning_data_pddf)
    print(traning_data_df.head())

    parsed_data = rdd_to_labeled_point(traning_data_df.rdd)
    parsed_data.persist()
    # Correct print: [LabeledPoint(1.0, [1.0,8.6662186586,6.98047693487])]

    logisticRegressionWithSGD = LogisticRegressionWithSGD.train(parsed_data, iterations=100)
    labels_and_preds = parsed_data.map(
        lambda lp: [lp.label, logisticRegressionWithSGD.predict(lp.features)])
    Accuracy = labels_and_preds.filter(
        lambda ele: int(ele[0]) == int(ele[1])).count() / float(parsed_data.count())
    print("Training Accuracy on training data = " + str(Accuracy))

    unseen_data_pddf = create_pddf(unseen_data_RDD)
    unseen_data_df = sqlContext.createDataFrame(unseen_data_pddf)
    unseen_parsed_data = rdd_to_index_featurs(unseen_data_df.rdd)
    unseen_parsed_data.persist()

    # The context manager guarantees the results are flushed and the handle
    # is closed; the names avoid shadowing the `data` RDD and builtin `file`.
    with open('/Users/1002720/Documents/workspace/SNU-project/data/BDA2Project/1-GenderPrediction/result.csv',
              'w', encoding='utf-8') as result_file:
        result_file.write('INDEX,GENDER\n')
        for row in unseen_parsed_data.collect():
            result_file.write(
                str(row[0]) + ',' + str(logisticRegressionWithSGD.predict(row[1]) + 1) + '\n')
    # print(labels_and_preds.collect())

    parsed_data.unpersist()
    unseen_parsed_data.unpersist()
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
def test_classification(self):
    """Check that LR, SVM, Naive Bayes and a decision tree separate sparse points.

    Points whose only non-zero entry is at index 0 carry label 0.0; points
    whose only non-zero entry is at index 1 carry label 1.0.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree

    # (label, index of the non-zero entry, value of that entry)
    specs = [(0.0, 0, 1.0), (1.0, 1, 1.0), (0.0, 0, 2.0), (1.0, 1, 2.0)]
    points = [LabeledPoint(label, self.scipy_matrix(2, {index: value}))
              for label, index, value in specs]
    rdd = self.sc.parallelize(points)
    vectors = [point.features for point in points]

    def verify(fitted):
        # Even-indexed points belong to class 0, odd-indexed ones to class 1.
        self.assertTrue(fitted.predict(vectors[0]) <= 0)
        self.assertTrue(fitted.predict(vectors[1]) > 0)
        self.assertTrue(fitted.predict(vectors[2]) <= 0)
        self.assertTrue(fitted.predict(vectors[3]) > 0)

    verify(LogisticRegressionWithSGD.train(rdd))
    verify(SVMWithSGD.train(rdd))
    verify(NaiveBayes.train(rdd))

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    verify(DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo))
# Build an all-string schema with one nullable column per comma-separated
# name in `title`.
all_types = [StructField(str(name), StringType(), True)
             for name in title.split(",")]
schema = StructType(all_types)

from pyspark.sql import Row
from pyspark.mllib.classification import LogisticRegressionWithSGD
from numpy import array
from pyspark.mllib.regression import LabeledPoint

# Hashed feature values are reduced modulo D.
D = 2 ** 24


def helper1(r):
    """Convert one row into a LabeledPoint of hashed feature values.

    The first element of ``r`` is treated as an id, the last as the target
    and everything in between as features. Rows that cannot be converted
    yield an all-zero point with label 0.0.
    """
    try:
        # NOTE(review): on Python 3 the built-in hash() of a str is salted
        # per process unless PYTHONHASHSEED is fixed -- confirm that the
        # workers produce consistent values.
        features = [float(abs(hash("VAR_" + str(i) + value))) % D
                    for i, value in enumerate(r[1:-1])]
        target = float(r[-1])
        # The id is not part of the point, but a non-numeric id still sends
        # the row to the fallback below, as in the original code.
        float(r[0])
        return LabeledPoint(target, features)
    except Exception:
        # Best-effort fallback for malformed rows: 1932 zero features, which
        # equals the 1934-element row length required below minus the id
        # and the target.
        return LabeledPoint(0.0, [0.0] * 1932)


new_rdd = rdd.filter(lambda i: len(i) == 1934)
df = new_rdd.map(helper1)
model = LogisticRegressionWithSGD.train(df)
df.take(1)
# Hold out 10% of the parsed data for evaluation; both splits are cached
# because each of them is scanned several times below.
splits = parsedData.randomSplit((0.9, 0.1))
train_set = splits[0]
train_set.cache()
test_set = splits[1]
test_set.cache()

#NBmodel = NaiveBayes.train(train_set)
#NB_socredLabel = numpy.array(test_set.map(lambda lp: (NBmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).values().collect())
#findCoveragePercent(NB_socredLabel, 0.4)

# For every model: clear the threshold, sort the test labels by descending
# score and record the coverage at 40%, 80% and 100%.
# RDD.values() replaces `.map(lambda (k,v): v)`, whose tuple parameter is a
# syntax error on Python 3.
SVMSGDmodel = SVMWithSGD.train(train_set)
SVMSGDmodel.clearThreshold()
SVM_scoredLabel = numpy.array(
    test_set.map(lambda lp: (SVMSGDmodel.predict(lp.features), lp.label))
    .sortByKey(ascending=False).values().collect())
SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.4))
SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.8))
SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 1.0))

LRSGDmodel = LogisticRegressionWithSGD.train(train_set)
LRSGDmodel.clearThreshold()
LRSGD_scoedLabel = numpy.array(
    test_set.map(lambda lp: (LRSGDmodel.predict(lp.features), lp.label))
    .sortByKey(ascending=False).values().collect())
LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.4))
LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.8))
LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 1.0))

LRLBFGSmodel = LogisticRegressionWithLBFGS.train(train_set)
LRLBFGSmodel.clearThreshold()
LRLBFGS_scoredLabel = numpy.array(
    test_set.map(lambda lp: (LRLBFGSmodel.predict(lp.features), lp.label))
    .sortByKey(ascending=False).values().collect())
LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 0.4))
LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 0.8))
LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 1.0))


def getAccumulatedPercentage(socredLabel):
    result = []
    total = socredLabel.sum()
import sys

from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint


def parsePoint(line):
    """
    Parse a line of text into an MLlib LabeledPoint object.

    The line holds space-separated numbers: the label first, then the
    features. A label of -1 is converted to 0.
    """
    values = [float(s) for s in line.split(' ')]
    if values[0] == -1:  # Convert -1 labels to 0 for MLlib
        values[0] = 0
    return LabeledPoint(values[0], values[1:])


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: logistic_regression <file> <iterations>", file=sys.stderr)
        # sys.exit is always available; the bare exit() helper is installed
        # by the site module and is meant for interactive use.
        sys.exit(-1)
    sc = SparkContext(appName="PythonLR")
    try:
        points = sc.textFile(sys.argv[1]).map(parsePoint)
        iterations = int(sys.argv[2])
        model = LogisticRegressionWithSGD.train(points, iterations)
        print("Final weights: " + str(model.weights))
        print("Final intercept: " + str(model.intercept))
    finally:
        # Stop the context even when parsing or training fails.
        sc.stop()
def create_model_libsvm(self, data, params):
    """Train SGD logistic regression on ``data``.

    The iteration count is read from ``params['numIterations']``, converted
    to ``int``, and defaults to 10 when the key is absent.
    """
    iteration_count = int(params.get('numIterations', 10))
    trained = LogisticRegressionWithSGD.train(data, iteration_count)
    return trained
def test_classification(self):
    """Check linear, Bayes and tree classifiers, plus tree model save/load.

    Every classifier must assign class 0 to the even-indexed points and
    class 1 to the odd-indexed ones. Each tree model is additionally saved
    to a temporary directory and reloaded, and the debug strings of the
    original and the reloaded model must match.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel

    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_predictions(model):
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    temp_dir = tempfile.mkdtemp()
    # The cleanup sits in `finally` so that a failing assertion does not
    # leave the temporary directory behind.
    try:
        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        check_predictions(lr_model)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        check_predictions(svm_model)

        nb_model = NaiveBayes.train(rdd)
        check_predictions(nb_model)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        check_predictions(dt_model)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10, maxBins=4, seed=1)
        check_predictions(rf_model)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        check_predictions(gbt_model)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())
    finally:
        try:
            rmtree(temp_dir)
        except OSError:
            pass
# remove header
header = ibm_rdd.first()
# Column 7 becomes the label; columns 8 and 9 become the two features.
ibm_data_rdd = ibm_rdd.filter(lambda x: x != header) \
    .map(lambda x: x.split(',')) \
    .map(lambda x: LabeledPoint(x[7], [x[8], x[9]]))
ibm_data_rdd.take(5)

# train and test model for 10 times
lst_score = []
for i in range(10):
    ibm_train_rdd, ibm_test_rdd = ibm_data_rdd.randomSplit([.6, .4])
    lrm = LogisticRegressionWithSGD.train(ibm_train_rdd,
                                          iterations=100,
                                          step=1.0,
                                          miniBatchFraction=1.0,
                                          initialWeights=None,
                                          regParam=0.01,
                                          regType='l2')
    # Collect label and prediction together in a single pass, so the test
    # split is evaluated once and both lists are aligned by construction.
    truth_and_predicted = ibm_test_rdd.map(
        lambda x: (x.label, lrm.predict(x.features))).collect()
    lst_truth = [pair[0] for pair in truth_and_predicted]
    lst_predicted = [pair[1] for pair in truth_and_predicted]
    score = metrics.accuracy_score(lst_truth, lst_predicted)
    lst_score.append(score)

# Function-call form prints the same text on Python 2 and Python 3.
print(np.mean(lst_score))

#################################################
## demo 3: recommender system using ALS #########
#################################################
# Turn the term-frequency vectors of `comment` into TF-IDF vectors.
tfVectors = tf.transform(comment)
idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)
print(tfIdfVectors.take(3))
# Use the RDD zip operator to join the two parts (scores and TF-IDF vectors),
# then convert each pair to the LabeledPoint type used by the classifier.
zip_score_comment = score.zip(tfIdfVectors)
final_data = zip_score_comment.map(lambda line:LabeledPoint(line[0],line[1]))
# 80/20 train/test split with a fixed seed.
train_data,test_data = final_data.randomSplit([0.8,0.2],seed =0)
print(train_data.take(1))
# Wall-clock timing around the training call only.
time_start = time.time()
print(time_start)
#SVMModel = SVMWithSGD.train(train_data,iterations=100)
lrm = LogisticRegressionWithSGD.train(train_data,iterations=1000)
time_end = time.time()
cost_time = time_end - time_start
print("spark_lr cost_time:",cost_time)
# Pairs are (prediction, label), the order handed to MulticlassMetrics below.
predictionAndLabels = test_data.map(lambda t:(float(lrm.predict(t.features)),t.label))
print(predictionAndLabels.take(5))
metrics = MulticlassMetrics(predictionAndLabels)
print('accuracy:',metrics.accuracy)
print('precision:',metrics.weightedPrecision)
print('recall:',metrics.weightedRecall)
print('FMeasure:',metrics.weightedFMeasure())
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
# Two training points, one per class.
data = [
    LabeledPoint(0.0, [0.0, 1.0]),
    LabeledPoint(1.0, [1.0, 0.0]),
]
lrm = LogisticRegressionWithSGD.train(sc.parallelize(data), iterations=10)
# Predict single feature vectors.
lrm.predict([1.0, 0.0])
lrm.predict([0.0, 1.0])
# Predict a whole RDD of feature vectors and bring the results to the driver.
lrm.predict(sc.parallelize([[1.0, 0.0], [0.0, 1.0]])).collect()
# NOTE(review): after clearThreshold() predict presumably returns a raw score
# rather than a 0/1 class -- confirm against the MLlib documentation.
lrm.clearThreshold()
lrm.predict([0.0, 1.0])
def buildModel(trainrdd):
    """Train SGD logistic regression on ``trainrdd`` with default settings."""
    # Alternative kept from the original:
    # LinearRegressionWithSGD.train(trainrdd)
    return LogisticRegressionWithSGD.train(trainrdd)
# Start each ensemble table with one single-element row per example that
# holds the true label; classifier predictions are appended as columns.
test_labels = test_labels_rdd.collect()
ensemble_test = [[label] for label in test_labels]

train_labels_rdd = train_data.map(lambda p: p.label)
train_labels = train_labels_rdd.collect()
ensemble_train = [[label] for label in train_labels]

# C1
# Build the Model
model = LogisticRegressionWithSGD.train(train_data)

# Predict Labels
c1_predict_labels_test_rdd = test_data.map(lambda p: (model.predict(p.features)))
c1_predict_labels_train_rdd = train_data.map(lambda p: (model.predict(p.features)))
c1_predict_labels_test = c1_predict_labels_test_rdd.collect()
c1_predict_labels_train = c1_predict_labels_train_rdd.collect()

# Append Labels
appendColumn(ensemble_test, c1_predict_labels_test)
appendColumn(ensemble_train, c1_predict_labels_train)

# C2
# Build the Model
import numpy as np
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark import SparkContext, SparkConf
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

conf = SparkConf().setMaster("local").setAppName("Test")
sc = SparkContext(conf=conf)
try:
    # Four dense training points. The list was previously assigned twice
    # with identical content; one definition is enough.
    sparse_data = [
        LabeledPoint(0.0, Vectors.dense([1.0, 0.0])),
        LabeledPoint(1.0, Vectors.dense([0.0, 1.0])),
        LabeledPoint(0.0, Vectors.dense([10.0, 9.0])),
        LabeledPoint(1.0, Vectors.dense([9.0, 10.0]))
    ]
    rdd = sc.parallelize(sparse_data)
    model = LogisticRegressionWithSGD.train(rdd, iterations=10)
    # Predict on the bare feature vectors and store the result as text.
    rdd = rdd.map(lambda x: x.features)
    model.predict(rdd).saveAsTextFile("result/hdfs")
finally:
    # Stop the context even when training or saving fails.
    sc.stop()
# NOTE(review): map() only defines the transformation and no action runs
# between the two timestamps, so this interval presumably does not include
# the actual vectorizing work -- confirm before reporting it.
vectorize_start = time.time()
vectorized_data = training_data.map(mapper_CF)
vectorized_testing_data = testing_data.map(mapper_CF)
"""
train_instances = vectorized_data.count()
test_instances = vectorized_testing_data.count()
total_instances = train_instances + test_instances
train_per = float(train_instances)/total_instances * 100
test_per = float(test_instances)/total_instances * 100
#"""
vectorize_end = time.time()
# The function-call form of print emits the same text on Python 2 and 3.
print("******************VECTORIZING: DONE********************")

#building a logistic regression training model
train_start = time.time()
model = LogisticRegressionWithSGD.train(vectorized_data)
train_end = time.time()
print("******************MODEL TRAINING: DONE********************")


#predicting classes for testing data and evaluating
def mapper_predict(x):
    """Return ``(actual label, predicted class)`` for one labeled point."""
    predicted_class = model.predict(x.features)
    #predicted_class = int(round(predicted_class))
    actual_class = x.label
    return (actual_class, predicted_class)


pred_start = time.time()
actual_and_predicted = vectorized_testing_data.map(mapper_predict)
# count() is the action that actually runs the predictions.
count = actual_and_predicted.count()
pred_end = time.time()
print("******************PREDICTION: DONE********************")
def spark_create_model(data_size, file_path, store=False):
    """Spark Model Creation.

    Loads Twitter, YouTube and Facebook data, attaches sentiment scores,
    converts the items to labeled points, trains a logistic-regression model
    on an 80% split of each source and prints the training and test error.

    Args:
        data_size: ``'small'`` selects the ``small_*.json`` files; any other
            value selects the full files.
        file_path: destination handed to ``model.save`` when ``store`` is set.
        store: when truthy, the trained model is saved to ``file_path``.
    """
    # Set this variable to distinguish between logistic and linear regression
    REGRESSION_TYPE = 'logistic'

    # The small data set uses the same file names with a "small_" prefix.
    data_root = "file:///root/mongoData/"
    if data_size == 'small':
        data_root += "small_"

    sc = SparkContext(appName="SparkCreateModel")
    try:
        # load Twitter data
        twitter_data = load_data_from_file(sc, data_root + "twitter.json")

        # load YouTube data
        youtube_data = load_data_from_file(sc, data_root + "youtube.json")
        youtube_data = youtube_data.filter(filter_youtube_data)

        # load Facebook data
        facebook_data = load_data_from_file(sc, data_root + "facebook.json")

        # Store the sentiment score for each data item
        sent_twitter_data = twitter_data.map(lambda x: get_sentiment(x, 'twitter'))
        sent_youtube_data = youtube_data.map(lambda x: get_sentiment(x, 'youtube'))
        sent_facebook_data = facebook_data.map(lambda x: get_sentiment(x, 'facebook'))

        #create MLLib LabeledPoints
        twitter_LP = sent_twitter_data.map(
            lambda x: create_labeled_points_twitter(x, REGRESSION_TYPE))
        youtube_LP = sent_youtube_data.map(
            lambda x: create_labeled_points_youtube(x, REGRESSION_TYPE))
        facebook_LP = sent_facebook_data.map(
            lambda x: create_labeled_points_facebook(x, REGRESSION_TYPE))

        # split data in to training (80%) and test(20%) sets
        train_twitter, test_twitter = twitter_LP.randomSplit([0.8, 0.2], seed=0)
        train_youtube, test_youtube = youtube_LP.randomSplit([0.8, 0.2], seed=0)
        train_facebook, test_facebook = facebook_LP.randomSplit([0.8, 0.2], seed=0)

        #combine all 3 datasets with the RDD.union command
        train_LP = train_twitter.union(train_facebook).union(train_youtube)
        test_LP = test_twitter.union(test_facebook).union(test_youtube)

        # Build logistic regression model
        model_log = LogisticRegressionWithSGD.train(train_LP)
        if store:
            model_log.save(sc, file_path)

        # Evaluate the model on training data. The (label, prediction) pair
        # is indexed instead of unpacked in the lambda signature, because
        # tuple parameters are a syntax error on Python 3.
        preds_train_log = train_LP.map(
            lambda p: (p.label, model_log.predict(p.features)))
        total_train = float(train_LP.count())
        trainErr_log = preds_train_log.filter(
            lambda pair: pair[0] != pair[1]).count() / total_train

        # Evaluate the model on test data
        preds_test_log = test_LP.map(
            lambda p: (p.label, model_log.predict(p.features)))
        total_test = float(test_LP.count())
        testErr_log = preds_test_log.filter(
            lambda pair: pair[0] != pair[1]).count() / total_test

        twitter_LP_count = twitter_LP.count()
        youtube_LP_count = youtube_LP.count()
        facebook_LP_count = facebook_LP.count()

        print('TWITTER LP COUNT %d' % (twitter_LP_count))
        print('YOUTUBE LP COUNT %d' % (youtube_LP_count))
        print('FACEBOOK LP COUNT %d' % (facebook_LP_count))
        print("Train Error = " + str(trainErr_log))
        print("Test Error = " + str(testErr_log))
        print(model_log)
    finally:
        # Stop the context even when a step above fails.
        sc.stop()
label = 0 values = [x if x < genre else x-1 for x in values] #shift the attributes by one index ones = [] ones = [1] * len(values) return LabeledPoint(label, SparseVector(column_num-1, values, ones)) #set hdfs path data = sc.sequenceFile("hdfs://nameservice1/user/geap/warehouse/camus/etl/rat/hourly/2015/06/01/00/*") data = sc.sequenceFile("hdfs://localhost:9000/test/*") parsedData = data.filter(filterPoint).map(parsePoint).reduceByKey(lambda x, y : x + y).map(lambda (k, v) : list(set(v))) parsedData.cache() #Calculate total number of columns in the dataset column_num = parsedData.flatMap(lambda _ : _ ).distinct().count() column_id = parsedData.flatMap(lambda _ : _ ).distinct().collect() column_id.sort() #choose a genre to test, default is 100th column as target variable genre = 1 sortedData = parsedData.map(sortPoint) labeledData = sortedData.map(lambda line : (line, genre)).map(labelData) LRSGDmodel = LogisticRegressionWithSGD.train(labeledData) print LRSGDmodel.weights
def parse_interaction_chi(line):
    """Parse one comma-separated record into a LabeledPoint.

    Columns 1, 2, 3, 19 and 20 are dropped from the features; column 41 is
    the tag. The label is 0.0 when the tag is ``'normal.'`` and 1.0 otherwise.
    """
    line_split = line.split(",")
    # leave_out = [1, 2, 3, 19, 20, 41]
    clean_line_split = line_split[0:1] + line_split[4:19] + line_split[21:41]
    attack = 0.0 if line_split[41] == 'normal.' else 1.0
    return LabeledPoint(attack, np.array([float(x) for x in clean_line_split]))


training_data_chi = raw_data.map(parse_interaction_chi)
test_data_chi = test_raw_data.map(parse_interaction_chi)

t0 = time()
logit_model_chi = LogisticRegressionWithSGD.train(training_data_chi)
tt = time() - t0
# The function-call form of print emits the same text on Python 2 and 3.
print("Classifier trained in {} seconds".format(round(tt, 3)))

labels_and_preds = test_data_chi.map(
    lambda p: (p.label, logit_model_chi.predict(p.features)))

t0 = time()
# The pair is indexed instead of unpacked in the lambda signature, because
# tuple parameters are a syntax error on Python 3.
test_accuracy = labels_and_preds.filter(
    lambda pair: pair[0] == pair[1]).count() / float(test_data_chi.count())
tt = time() - t0
print("Prediction made in {} seconds. Test accuracy is {}".format(
    round(tt, 3), round(test_accuracy, 4)))

# ------------- RDD basics ------------------
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD

conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf = conf)

spam = sc.textFile("/home/sakib/spark-1.3.1/spark_workspace/data/spam.txt")
normal = sc.textFile("/home/sakib/spark-1.3.1/spark_workspace/data/ham.txt")

# Create a HashingTF instance to map email text to vectors of 10,000 features.
tf = HashingTF(numFeatures = 10000)
# Each email is split into words, and each word is mapped to one feature.
spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
normalFeatures = normal.map(lambda email: tf.transform(email.split(" ")))

# Create LabeledPoint datasets for positive (spam) and negative (normal) examples.
positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
negativeExamples = normalFeatures.map(lambda features: LabeledPoint(0, features))
trainingData = positiveExamples.union(negativeExamples)
trainingData.cache()  # Cache since Logistic Regression is an iterative algorithm.

# Run Logistic Regression using the SGD algorithm.
model = LogisticRegressionWithSGD.train(trainingData)

# Test on a positive example (spam) and a negative one (normal). We first apply
# the same HashingTF feature transformation to get vectors, then apply the model.
posTest = tf.transform("O M G GET cheap stuff by sending money to ...".split(" "))
negTest = tf.transform("Hi Dad, I started studying Spark the other ...".split(" "))
# Parenthesised single-argument print: same output on Python 2, valid on 3.
print("Prediction for positive test example: %g" % model.predict(posTest))
print("Prediction for negative test example: %g" % model.predict(negTest))
sqlContext = SQLContext(sc)

### Prepare Undirected Data
undirected_relation_df = sqlContext.read.load(undirected_relation_json_file,
                                              format="json")  #.limit(100)
# parsePoint is passed directly; wrapping it in a lambda added nothing.
undirectedParsedData = undirected_relation_df.map(parsePoint)
print("@@@@@@@@@@@@@@@@@@@@@ 0 @@@@@@@@@@@@@@@@@@@@@@")
print("undirectedParsedData == " + str(undirectedParsedData.take(10)))
print("@@@@@@@@@@@@@@@@@@@@@ END0 @@@@@@@@@@@@@@@@@@@@@@")

### Build model (undirected)
undirectedModel = LogisticRegressionWithSGD.train(undirectedParsedData)
# Drop the decision threshold so predict() returns the raw score rather than
# a 0/1 label.
undirectedModel.clearThreshold()
undirectedModel.save(sc, mainPath + "undireced_relation_model")

undirectedLabelsAndPreds = undirectedParsedData.map(lambda point: (
    point.label, float(undirectedModel.predict(point.features))))
# zipWithIndex yields (item, index); swap to (index, item).  The pair is
# indexed because tuple-parameter lambdas are a SyntaxError on Python 3.
undirectedLabelsAndPredsIndexed = undirectedLabelsAndPreds.zipWithIndex().map(
    lambda pair: (pair[1], pair[0]))
print("@@@@@@@@@@@@@@@@@@@@@ 0 @@@@@@@@@@@@@@@@@@@@@@")
print("undirectedLabelsAndPredsIndexed == " +
      str(undirectedLabelsAndPredsIndexed.take(10)))
print("@@@@@@@@@@@@@@@@@@@@@ END0 @@@@@@@@@@@@@@@@@@@@@@")

######################################################
# join with productsIds
productsIds = undirected_relation_df.map(lambda point: getIds(point))
.map(lambda lp: len(lp.features.indices)) .sum()) Test.assertEquals(numNZVal, 372080, 'incorrect number of features') # ** CTR prediction and logloss evaluation ** from pyspark.mllib.classification import LogisticRegressionWithSGD # fixed hyperparameters numIters = 50 stepSize = 10. regParam = 1e-6 regType = 'l2' includeIntercept = True model0 = LogisticRegressionWithSGD.train(OHETrainData, numIters, stepSize, 1.0, None, regParam, regType, includeIntercept) sortedWeights = sorted(model0.weights) print sortedWeights[:5], model0.intercept # TEST Logistic regression Test.assertTrue(np.allclose(model0.intercept, 0.56455084025), 'incorrect value for model0.intercept') Test.assertTrue(np.allclose(sortedWeights[0:5], [-0.45899236853575609, -0.37973707648623956, -0.36996558266753304, -0.36934962879928263, -0.32697945415010637]), 'incorrect value for model0.weights') # ** Log loss ** from math import log def computeLogLoss(p, y):
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr,
          excluded_feat_cslist, sp_master, spark_rdd_compress,
          spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir,
          zipcode_dir, zip_file_name, mongo_tuples, labelnameflag, fromweb,
          training_fraction, jobname, random_seed=None):
    """Train, evaluate and persist an MLlib linear classifier for one dataset.

    Steps, in order: load the (LabeledPoint, hash) samples, split them into
    training and testing sets, fit the model named by the
    ``learning_algorithm`` option, store coefficients/intercept, write the
    false predictions and all predictions to ``local_out_dir``, save the
    model to HDFS, plot the prediction figures, and (for two labels) build
    the score graph and ROC files.  When ``fromweb == "1"`` the result is
    also written to the ``atdml_document`` table.

    Args:
        row_id_str: Record id as a string; used in file names, Mongo filters
            and the SQL ``where`` clause.
        ds_id: Dataset id passed to the label/feature lookup helpers.
        hdfs_feat_dir: Directory holding the ``libsvm_data`` feature file.
        local_out_dir: Local output directory; created if missing.
        ml_opts_jstr: JSON string of options, or None.  Keys read here:
            ``has_excluded_feat``, ``learning_algorithm``, ``iterations``,
            ``c`` and ``regularization``.  ``learning_algorithm`` is
            required; a missing key (or None input) raises KeyError.
        excluded_feat_cslist: Excluded features; fetched from Mongo when
            None and ``has_excluded_feat`` is 1.
        sp_master, spark_rdd_compress, spark_driver_maxResultSize,
        sp_exe_memory, sp_core_max, jobname: Forwarded unchanged to
            ``ml_util.ml_get_spark_context``.
        zipout_dir, zipcode_dir, zip_file_name: Forwarded unchanged to
            ``ml_util.ml_build_zip_file``.
        mongo_tuples: Opaque connection info handed to the Mongo helpers.
        labelnameflag: When equal to 1 the label names come from
            ``ml_util.ml_get_label_dict``; otherwise they are generated from
            the distinct label values.
        fromweb: The string "1" triggers the database update.
        training_fraction: Weight of the training split; the testing split
            gets ``1 - training_fraction``.
        random_seed: Optional seed for the split.  None (the default) lets
            Spark choose one.

    Returns:
        0 on completion, or None when ``learning_algorithm`` names no
        supported model.
    """
    ### generate data folder and out folder
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # create zip files for Spark workers
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir,
                                              zip_file_name,
                                              prefix='zip_feature_util')
    print("INFO: zip_file_path= %s" % (zip_file_path,))

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    t0 = time()

    # check if ml_opts.has_excluded_feat == 1
    has_excluded_feat = 0
    ml_opts = {}
    if ml_opts_jstr is not None:
        ml_opts = json.loads(ml_opts_jstr)
        if "has_excluded_feat" in ml_opts:
            has_excluded_feat = ml_opts["has_excluded_feat"]

    # get excluded feature list from mongo
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        excluded_feat_cslist = ml_util.ml_get_excluded_feat(
            row_id_str, mongo_tuples)
    print("INFO: excluded_feat_cslist= %s" % (excluded_feat_cslist,))

    # filename for featured data
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print("INFO: libsvm_data_file: %s" % (libsvm_data_file,))

    # load feature count file
    feat_count_file = libsvm_data_file + "_feat_count"
    feature_count = zip_feature_util.get_feature_count(sc, feat_count_file)
    print("INFO: feature_count= %s" % (feature_count,))

    # load sample RDD from text file, also excluding selected features
    # format (LabeledPoint,hash) from str2LabeledPoint_hash()
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, excluded_feat_cslist)

    # get distinct label list
    labels_list_all = samples_rdd.map(
        lambda p: p[0].label).distinct().collect()

    # split samples to training and testing data, format (LabeledPoint,hash)
    # int(None) raises TypeError, so the seed is converted only when given.
    split_seed = None if random_seed is None else int(random_seed)
    training_rdd, testing_rdd = samples_rdd.randomSplit(
        [training_fraction, 1 - training_fraction], seed=split_seed)
    training_rdd = training_rdd.map(lambda p: p[0])  # keep LabeledPoint only
    training_rdd.cache()
    training_sample_count = training_rdd.count()
    training_lbl_cnt_list = training_rdd.map(
        lambda p: (p.label, 1)).reduceByKey(add).collect()
    testing_rdd.cache()
    testing_sample_count = testing_rdd.count()
    testing_lbl_cnt_list = testing_rdd.map(
        lambda p: (p[0].label, 1)).reduceByKey(add).collect()
    sample_count = training_sample_count + testing_sample_count

    # random_seed testing: show the four smallest hashes of the testing set
    # so two runs with the same seed can be compared.  takeOrdered avoids
    # pulling the whole testing set to the driver.
    if random_seed is not None:
        for sample in testing_rdd.takeOrdered(4, key=lambda x: x[1]):
            print(sample[1])

    t1 = time()
    print("INFO: training sample count= %s , testing sample count= %s"
          % (training_sample_count, testing_sample_count))
    print("INFO: training label list= %s , testing label list= %s"
          % (training_lbl_cnt_list, testing_lbl_cnt_list))
    print("INFO: labels_list_all= %s" % (labels_list_all,))
    print("INFO: training and testing samples generated!")
    print('INFO: running time: %f' % (t1 - t0))
    t0 = t1

    ###############################################
    ###########build learning model################
    ###############################################

    ### get the parameters###
    print("INFO: ======Learning Algorithm and Parameters=============")
    # 1: linear_svm_with_sgd; 2: logistic_regression_with_lbfgs;
    # 3: logistic_regression_with_sgd
    model_name = ml_opts['learning_algorithm']
    iteration_num = ml_opts.get('iterations', 0)
    C = 0
    if 'c' in ml_opts:
        # SECURITY NOTE(review): eval() runs arbitrary code taken from the
        # options JSON.  Left in place because the accepted formats of 'c'
        # are not visible here; replace with a numeric parser once confirmed.
        C = eval(ml_opts['c'])
    regularization = ml_opts.get('regularization', "")

    print("INFO: Learning Algorithm:  %s" % (model_name,))
    print("INFO: C =  %s" % (C,))
    print("INFO: iterations =  %s" % (iteration_num,))
    print("INFO: regType =  %s" % (regularization,))

    # NOTE(review): comments elsewhere in this function describe regParam as
    # 1/(sample_number*C), but the value computed is C/sample_number.
    regP = C / float(training_sample_count)
    print("INFO: Calculated: regParam =  %s" % (regP,))

    ### generate label names (family names) #####
    if labelnameflag == 1:
        label_dic = ml_util.ml_get_label_dict(row_id_str, mongo_tuples, ds_id)
        print("INFO: label_dic: %s" % (label_dic,))
    else:
        # No stored names: use each distinct label value as its own name.
        label_dic = dict((int(label_value), str(int(label_value)))
                         for label_value in set(labels_list_all))
        print("INFO: generated label_dic: %s" % (label_dic,))

    labels_list = [label_dic[key] for key in sorted(label_dic)]
    print("INFO: labels: %s" % (labels_list,))
    class_num = len(labels_list)
    if class_num > 2:
        print("INFO: Multi-class classification! Number of classes =  %s"
              % (class_num,))

    ### build model ###
    if model_name == "linear_svm_with_sgd":
        ### 1: linearSVM
        print("INFO: ====================1: Linear SVM=============")
        model_classification = SVMWithSGD.train(
            training_rdd, regParam=regP, iterations=iteration_num,
            regType=regularization)
    elif model_name == "logistic_regression_with_lbfgs":
        ### 2: LogisticRegressionWithLBFGS
        print("INFO: ====================2: LogisticRegressionWithLBFGS=============")
        model_classification = LogisticRegressionWithLBFGS.train(
            training_rdd, regParam=regP, iterations=iteration_num,
            regType=regularization, numClasses=class_num)
    elif model_name == "logistic_regression_with_sgd":
        ### 3: LogisticRegressionWithSGD
        print("INFO: ====================3: LogisticRegressionWithSGD=============")
        model_classification = LogisticRegressionWithSGD.train(
            training_rdd, regParam=regP, iterations=iteration_num,
            regType=regularization)
    else:
        print("INFO: Training model selection error: no valid ML model selected!")
        return

    print("INFO: model type= %s" % (type(model_classification),))

    # create feature coefficient file
    coef_arr = None
    intercept = None
    if model_classification.weights is None:
        print("WARNING: model weights not found!")
    else:
        coef_weights = model_classification.weights
        coef_arr = coef_weights.toArray().tolist()
        # save coef_arr to mongo
        key = "coef_arr"
        ret = ml_util.save_json_t(row_id_str, key, coef_arr, mongo_tuples)
        # save coef_arr to local file
        # NOTE(review): the extent of this branch is reconstructed from a
        # flattened source; confirm which statements belong to `ret == 0`.
        if ret == 0:
            # drop old record in mongo
            mongo_filter = '{"rid":' + row_id_str + ',"key":"coef_arr"}'
            ret = query_mongo.delete_many(mongo_tuples, None, mongo_filter)
            if not os.path.exists(local_out_dir):
                os.makedirs(local_out_dir)
            fn_ca = os.path.join(local_out_dir, row_id_str,
                                 row_id_str + "_coef_arr.pkl")
            print(ml_util.ml_pickle_save(coef_arr, fn_ca))
        # save intercept to mongo
        intercept = model_classification.intercept
        key = "coef_intercept"
        ret = ml_util.save_json_t(row_id_str, key, intercept, mongo_tuples)

    # feature list + coef file
    feat_filename = os.path.join(local_out_dir,
                                 row_id_str + "_feat_coef.json")
    print("INFO: feat_filename= %s" % (feat_filename,))

    # create feature, coef & raw string file
    # expect a dict of {"fid":(coef, feature_raw_string)}
    jret = ml_util.build_feat_list_t(row_id_str, feat_filename, None, None,
                                     coef_arr, ds_id, mongo_tuples)
    # special featuring for IN or libsvm
    if jret is None:
        jret = ml_util.build_feat_coef_raw_list_t(
            row_id_str, feat_filename, coef_arr, ds_id, mongo_tuples)
    if jret is None:
        print("WARNING: Cannot create sample list for testing dataset. ")
        # Continue with an empty mapping (as the multi-class path does)
        # instead of failing on len(None) a few lines below.
        jret = {}

    jfeat_coef_dict = jret
    print("INFO: coef_arr len= %s , feature_count= %s"
          % (len(coef_arr), feature_count))
    # for multi-class
    if len(coef_arr) != feature_count:
        jfeat_coef_dict = {}
        print("WARNING: coef count didn't match feature count. multi-class classification was not supported")

    # Calculate prediction and Save testing dataset
    bt_coef_arr = sc.broadcast(coef_arr)
    bt_intercept = sc.broadcast(intercept)
    bt_jfeat_coef_dict = sc.broadcast(jfeat_coef_dict)

    ### Evaluating the model on testing dataset: label, predict label, score, feature list
    print("INFO: intercept= %s" % (intercept,))
    print("INFO: coef_arr len= %s %s" % (len(coef_arr), type(coef_arr)))
    print("INFO: jfeat_coef_dict len= %s" % (len(jfeat_coef_dict),))

    # get prediction of testing dataset:
    # (true label, predicted label, score, features, hash)
    if len(coef_arr) == feature_count:
        testing_pred_rdd = testing_rdd.map(lambda p: (
            p[0].label,
            model_classification.predict(p[0].features),
            zip_feature_util.calculate_hypothesis(
                p[0].features, bt_coef_arr.value, bt_intercept.value,
                model_name),
            p[0].features,
            p[1],
        )).cache()
    else:
        # for multi-class, no prediction score; TBD for better solution:
        # how to display multiple weights for each class
        testing_pred_rdd = testing_rdd.map(lambda p: (
            p[0].label,
            model_classification.predict(p[0].features),
            "-",
            p[0].features,
            p[1],
        )).cache()

    # save false prediction to local file
    false_pred_fname = os.path.join(local_out_dir,
                                    row_id_str + "_false_pred.json")
    print("INFO: false_pred_fname= %s" % (false_pred_fname,))
    false_pred_data = testing_pred_rdd.filter(lambda p: p[0] != p[1]).map(
        lambda p: (
            p[0],
            p[1],
            p[2],
            zip_feature_util.get_dict_coef_raw4feat(
                zip_feature_util.sparseVector2dict(p[3]),
                bt_jfeat_coef_dict.value),
            p[4],
        )).collect()
    print("INFO: false predicted count= %s" % (len(false_pred_data),))
    false_pred_arr = [
        {
            "tlabel": sp[0],
            "plabel": sp[1],
            "score": sp[2],
            "feat": sp[3],
            "hash": sp[4],
        }
        for sp in false_pred_data
    ]
    with open(false_pred_fname, "w") as fp:
        fp.write(json.dumps(false_pred_arr))

    # save prediction results, format: label, prediction, hash
    pred_ofname = os.path.join(local_out_dir,
                               row_id_str + "_pred_output.pkl")
    print("INFO: pred_ofname= %s" % (pred_ofname,))
    pred_out_arr = testing_pred_rdd.map(
        lambda p: (p[0], p[1], p[4])).collect()
    ml_util.ml_pickle_save(pred_out_arr, pred_ofname)

    # Calculate Accuracy. labelsAndPreds = (true_label, predict_label)
    labelsAndPreds = testing_pred_rdd.map(lambda p: (p[0], p[1]))
    labelsAndPreds.cache()
    testing_sample_number = testing_rdd.count()
    # Pairs are indexed: tuple-parameter lambdas are invalid on Python 3.
    testErr = labelsAndPreds.filter(
        lambda vp: vp[0] != vp[1]).count() / float(testing_sample_number)
    accuracy = 1 - testErr
    print("INFO: Accuracy =  %s" % (accuracy,))

    ### Save model
    save_dir = os.path.join(config.get('app', 'HADOOP_MASTER'),
                            config.get('app', 'HDFS_MODEL_DIR'), row_id_str)
    # Best-effort removal of a previous model at the same location; a
    # failure here is reported and training output is still saved.
    try:
        hdfs.ls(save_dir)
        hdfs.rmr(save_dir)
    except IOError as e:
        print("WARNING: I/O error({0}): {1} . At HDFS= {2}".format(
            e.errno, e.strerror, save_dir))
    except Exception:
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        print("WARNING: Unexpected error: %s . At HDFS= %s"
              % (sys.exc_info()[0], save_dir))

    model_classification.save(sc, save_dir)

    t1 = time()
    print('INFO: training run time: %f' % (t1 - t0))
    t0 = t1

    ###############################################
    ###########plot prediction result figure#######
    ###############################################
    labels = labelsAndPreds.collect()
    true_label_list = [x for x, _ in labels]
    pred_label_list = [x for _, x in labels]
    pred_fname = os.path.join(local_out_dir, row_id_str + "_1" + ".png")
    true_fname = os.path.join(local_out_dir, row_id_str + "_2" + ".png")
    pred_xlabel = 'Prediction (Single Run)'
    true_xlabel = 'True Labels (Single Run)'
    test_cnt_dic = ml_util.ml_plot_predict_figures(
        pred_label_list, true_label_list, labels_list, label_dic,
        testing_sample_count, pred_xlabel, pred_fname, true_xlabel,
        true_fname)
    print("INFO: figure files:  %s %s" % (pred_fname, true_fname))

    roc_auc = None
    perf_measures = None
    dataset_info = {
        "training_fraction": training_fraction,
        "class_count": class_num,
        "dataset_count": sample_count
    }

    #############################################################
    ###################for 2 class only (plot ROC curve)#########
    #############################################################
    if len(labels_list) == 2:
        do_ROC = True
        # Find which numeric label stands for the negative class.
        reverse_label_dic = dict((v, k) for k, v in label_dic.items())
        if 'clean' in reverse_label_dic:
            flag_clean = reverse_label_dic['clean']
        elif 'benign' in reverse_label_dic:
            flag_clean = reverse_label_dic['benign']
        elif '0' in reverse_label_dic:
            flag_clean = 0
        else:
            print("INFO: No ROC curve generated: 'clean','benign' or '0' must be a label for indicating negative class!")
            do_ROC = False

        # build data file for score graph
        score_graph_fname = os.path.join(local_out_dir,
                                         row_id_str + "_score_graph.json")
        print("INFO: score_graph_fname= %s" % (score_graph_fname,))

        # build score_arr_0, score_arr_1 from (true label, score) pairs
        graph_arr = testing_pred_rdd.map(
            lambda p: (int(p[0]), float(p[2]))).collect()
        score_arr_0 = []
        score_arr_1 = []
        # Both extremes start at 0, so the reported range always includes 0.
        max_score = 0
        min_score = 0
        for tlabel, score in graph_arr:
            if tlabel == 0:
                score_arr_0.append(score)
            else:
                score_arr_1.append(score)
            # save max,min score
            if score > max_score:
                max_score = score
            elif score < min_score:
                min_score = score

        ml_build_pred_score_graph(score_arr_0, score_arr_1, model_name,
                                  score_graph_fname, max_score, min_score)

        if do_ROC:
            perf_measures = ml_util.calculate_fscore(true_label_list,
                                                     pred_label_list)
            print("RESULT: perf_measures= %s" % (perf_measures,))

            # After clearThreshold() predict() returns raw scores, which
            # is what the ROC computation needs.
            model_classification.clearThreshold()
            scoreAndLabels = testing_rdd.map(lambda p: (
                model_classification.predict(p[0].features),
                int(p[0].label)))
            scoreAndLabels_list = scoreAndLabels.collect()

            if flag_clean == 0:
                scores = [x for x, _ in scoreAndLabels_list]
                s_labels = [x for _, x in scoreAndLabels_list]
                testing_N = test_cnt_dic[0]
                testing_P = test_cnt_dic[1]
            else:
                # Negative class is label 1: flip scores and labels so the
                # ROC helper still sees "positive" as 1.
                scores = [-x for x, _ in scoreAndLabels_list]
                s_labels = [1 - x for _, x in scoreAndLabels_list]
                testing_N = test_cnt_dic[1]
                testing_P = test_cnt_dic[0]

            # create ROC data file
            roc_auc = ml_create_roc_files(row_id_str, scores, s_labels,
                                          testing_N, testing_P,
                                          local_out_dir, row_id_str)

            perf_measures["roc_auc"] = roc_auc

    # only update db for web request
    if fromweb == "1":
        # SECURITY NOTE(review): this statement is assembled by string
        # concatenation (row_id_str and JSON text are embedded verbatim).
        # Switch to a parameterized query if exec_sqlite.exec_sql supports
        # bind parameters.
        str_sql = ("UPDATE atdml_document set " + "accuracy = '" + str(accuracy * 100) + "%"
                   + "', status = 'learned', processed_date ='" + str(datetime.datetime.now())
                   + "', perf_measures='" + json.dumps(perf_measures)
                   + "', dataset_info='" + json.dumps(dataset_info)
                   + "' where id=" + row_id_str)
        ret = exec_sqlite.exec_sql(str_sql)
        print("INFO: Data update done! ret= %s" % (str(ret),))
    else:
        print("INFO: accuracy = '" + str(accuracy * 100) + "%")

    print('INFO: Finished!')
    return 0
# Synthetic training set: nb_train rows of P features drawn uniformly from
# [0, 10), scored by a random linear function and labelled 1 when the score
# is above the mean score.
X_train = sc.parallelize(np.random.uniform(0, 10, [nb_train, P]))
w = np.random.uniform(size=[P + 1, ])
y_train = X_train.map(f=lambda a: w[0] + np.dot(a, w[1:]))
y_train_mean = y_train.mean()
y_train = y_train.map(f=lambda val: 1 if val > y_train_mean else 0)
data_train = y_train.zip(X_train).map(f=lambda tu: LabeledPoint(tu[0], tu[1]))

# Test set built the same way (same weights w, its own mean as threshold).
X_test = sc.parallelize(np.random.uniform(0, 10, [nb_test, P]))
y_test = X_test.map(f=lambda a: w[0] + np.dot(a, w[1:]))
y_test_mean = y_test.mean()
y_test = y_test.map(f=lambda val: 1 if val > y_test_mean else 0)

# SGD logistic regression: the timer covers training, prediction and the
# accuracy report.
t1 = time.time()
lrm = LogisticRegressionWithSGD.train(data_train, iterations=10)
y_pred = lrm.predict(X_test)
print("*******************************")
nb_corr = np.sum(np.array(y_test.collect()) == np.array(y_pred.collect()))
print(nb_corr)
# Single formatted argument: same text as the Python 2 print statement.
print("the accuracy is  %s" % (nb_corr / float(nb_test),))
print("*******************************")
t2 = time.time()
print("time elapsed spark logistic regression  %s" % (t2 - t1,))

# Same data with the L-BFGS optimiser.
lrm_bfgs = LogisticRegressionWithLBFGS.train(data_train)
y_pred = lrm_bfgs.predict(X_test)
print("*******************************")
return sc sc = getSparkContext() # Load and parse the data data = sc.textFile("/data.txt") def mapper(line): """ Mapper that converts an input line to a feature vector """ feats = line.strip().split(",") # labels must be at the beginning for LRSGD, it's in the end in our data, so # putting it in the right place label = feats[len(feats) - 1] feats = feats[: len(feats) - 1] feats.insert(0,label) features = [ float(feature) for feature in feats ] # need floats return LabeledPoint(label, features) parsedData = data.map(mapper) model = LogisticRegressionWithSGD.train(parsedData, iterations=100) labelsAndPreds = parsedData.map(lambda point: (point.label, model.predict(point.features))) trainErr = labelsAndPreds.filter(lambda p: p[0] != p[1]).count() / float(parsedData.count()) print("Training Error = " + str(trainErr))
# Imports
# SGD = Stochastic Gradient Descent. Convex optimization to optimize objective functions.
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext
from numpy import array

sc = SparkContext("local", "SVM")


# Loading and parsing data
def parsePoint(line):
    """Parse a space-separated line: first value is the label, the rest are features."""
    vals = [float(i) for i in line.split(' ')]
    return LabeledPoint(vals[0], vals[1:])


# Sample data provided by Spark 1.3.1 folder
data = sc.textFile("jingrong/sample_svm_data.txt")
parsedData = data.map(parsePoint)

# Building the model
model = LogisticRegressionWithSGD.train(parsedData)

# Evaluate the model based on training data
labelAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
# The (label, prediction) pair is indexed instead of unpacked in the lambda
# signature, which is a SyntaxError on Python 3.
trainingError = labelAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(
    parsedData.count())
# Two spaces after the colon reproduce the Python 2 print-statement output.
print("Training Error:  " + str(trainingError))
random_state=0) for train_index, test_index in ss: X_training, Y_training, X_test, Y_test = [], [], [], [] for i in train_index: X_training.append(X[i]) Y_training.append(Y[i]) for i in test_index: X_test.append(X[i]) Y_test.append(Y[i]) parsedData = [] for i in range(0, len(X_training)): parsedData.append(LabeledPoint(Y_training[i], X_training[i])) model = LogisticRegressionWithSGD.train(sc.parallelize(parsedData)) model.clearThreshold() probas = [] for i in range(0, len(X_test)): b = model.predict(X_test[i]) probas.append(b) # Compute ROC curve and area the curve tpr, fpr, thresholds = roc_curve(Y_test, probas) roc_auc = auc(fpr, tpr) plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % (roc_auc)) plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate')
if len(sys.argv) != 3:
    logger.error(USAGE)
    # A usage error is a failure: exit non-zero so shells and schedulers do
    # not treat the run as successful.
    sys.exit(1)

trainFile = sys.argv[1]
testFile = sys.argv[2]

sc = SparkContext("local", "SVM: Schizophrenia")

trainData = sc.textFile(trainFile)  # sc.parallelize( trainData )
train = trainData.map(parsePoint)
train.persist()

testData = sc.textFile(testFile)  # sc.parallelize( testData )
test = testData.map(parsePoint)
test.persist()

model = LogisticRegressionWithSGD.train(train)

labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))
# accuracy = labelsAndPreds.filter(lambda (v, p): True if p == 1.0 else False ).count() / float(test.count())
# Error rate on the test set; labels and predictions are compared as ints.
# The pair is indexed because tuple-parameter lambdas are invalid on Python 3.
error = labelsAndPreds.filter(
    lambda vp: int(vp[0]) != int(vp[1])).count() / float(test.count())

with open("error.txt", "w") as f:
    f.write("Accuracy: {0}\n".format(1 - error))
    f.write("Error: {0}\n".format(error))

# One output directory per run, named after the current timestamp.
labelsAndPreds.saveAsTextFile(str(time.time()) + ".txt")
#Test the accuracy of predicition and print the time taken start = timer() test_accuracy = trainingLabelAndPreds1.filter( lambda (v, p): v == p).count() / float(testData.count()) end = timer() elapsed = end - start print '\nPrediction made in: ', elapsed, 'seconds with LBFGS' print '\nTest Accuracy is: ', round(test_accuracy, 4) trainingError1 = trainingLabelAndPreds1.map( lambda (r1, r2): float(r1 != r2)).mean() print '\nLBFGS training error: ', trainingError1 elif modelSelection == 'sgd': start = timer() model2 = LogisticRegressionWithSGD.train(trainingData, iterations=50, intercept=True) end = timer() elapsed = end - start globalModel = model2 print '\nClassifier trained in ', elapsed, ' seconds with SGD' # Evaluate the training and test errors trainingLabelAndPreds2 = trainingData.map( lambda point: (point.label, model2.predict(point.features))) #Test the accuracy of predicition and print the time taken start = timer() test_accuracy = trainingLabelAndPreds2.filter( lambda (v, p): v == p).count() / float(testData.count()) end = timer()
from pyspark.mllib.regression import LabeledPoint
from numpy import array
import parse

# Load and parse the data
#def parsePoint(line):
#    Creating vector(array) with first input as y and others as xi's
#    values = [float(x) for x in line.split(',')]
#    return LabeledPoint(values[10], values[0:9])

sc = SparkContext("local[4]", "Logistic Regression")  # initialise the SparkContext

# RDD of raw text lines
data = sc.textFile("/home/ayush/Data /Data for Machine Learning/UCI Adult Data Set/UCI adult.data")
# Convert every input line with parse.parsePoint (defined in the local
# `parse` module; presumably it returns LabeledPoint objects, since the
# result is used as MLlib training data).
parsedData = data.map(parse.parsePoint)

# Build the model: train() takes the RDD of parsed points.
model = LogisticRegressionWithSGD.train(parsedData)

# Use model to create output
#model.predict().collect()   # "predict" needs a feature array as argument

# Read test data
Testdata = sc.textFile("/home/ayush/Data /Data for Machine Learning/UCI Adult Data Set/UCI adult.test")
parsedTestData = Testdata.map(parse.parsePoint)

# Predict every test point: pair its true label with the model's prediction.
labelsAndPreds = parsedTestData.map(lambda p: (p.label, model.predict(p.features)))

millis2 = int(round(time.time() * 1000))
print(labelsAndPreds.collect())

# Print testing Error
# 17 3.0 14075 0.042509
# 18 2.0 16520 0.049893
# 19 1.0 32130 0.097037
# 20 0.0 186485 0.563212

# Genre_Name|label| raw_Features|


def _row_to_labeled_point(row):
    """Build a LabeledPoint from a DataFrame row's 'label' and 'raw_Features' columns."""
    return LabeledPoint(row['label'], row['raw_Features'].toArray())


# Create LabeledPoints from the Spark DataFrames using PySpark
training = training_random.rdd.map(_row_to_labeled_point)
test = test_random.rdd.map(_row_to_labeled_point)

# label features
#========LogisticRegressionModel
# Run training algorithm to build the model
#lr_model = LogisticRegressionWithSGD.train(sc.parallelize(training), validateData=False)
lr_model = LogisticRegressionWithSGD.train(training, validateData=False)

# Score the test set as (prediction, label) pairs
predictionAndLabels = test.map(
    lambda lp: (float(lr_model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
metrics.confusionMatrix().toArray()

# Overall statistics
print("Recall = %s" % metrics.recall())
print("Precision = %s" % metrics.precision())
print("F1 measure = %s" % metrics.fMeasure())
print("Accuracy = %s" % metrics.accuracy)
# Recall = 0.09641896742635338
# Precision = 0.09641896742635338
print(BASE_DATA_PATH) conf = (SparkConf().setMaster("local[2]").setAppName("Testing MLLib With DataFrame SQL")) sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) # read the dataset df_test = sqlContext.read.format("com.databricks.spark.csv").options(delimiter=",").options(header="true").load( BASE_DATA_PATH + '/test.csv') training = df_test.map(lambda row: LabeledPoint(row.IsClick, [float(row.SearchID), float(row.AdID), float(row.Position), float(row.HistCTR), float(row.Price)])) (trainingData, testData) = training.randomSplit([0.7, 0.3]) model = LogisticRegressionWithSGD.train(trainingData,iterations = 100,step=0.4) # Build the model model1 = SVMWithSGD.train(trainingData, iterations=100) # Evaluate the model on training data model2 = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, numTrees=3, featureSubsetStrategy="auto",
# Create validation labeled points.
OHEValidationData = rawValidationData.map(lambda point: parseOHEPoint(
    point, ctrOHEDict, numCtrOHEFeats))
OHEValidationData.cache()

# running first model with fixed hyperparameters
numIters = 50
stepSize = 10.
regParam = 1e-6
regType = 'l2'
includeIntercept = True

# Parenthesised single-argument print: same output on Python 2, valid on 3.
print("-------------logistic regression with gradient descent---------")
# Train the model.
model0 = LogisticRegressionWithSGD.train(
    data=OHETrainData,
    iterations=numIters,
    step=stepSize,
    regParam=regParam,
    regType=regType,
    intercept=includeIntercept)
sortedWeights = sorted(model0.weights)
print("------------/logistic regression with gradient descent---------")


# NOTE(review): this function continues past the end of the excerpt; only
# the visible part (nudging p away from exactly 0 or 1) is reproduced.
def computeLogLoss(p, y):
    epsilon = 10e-12
    if (p == 0):
        p = p + epsilon
    elif (p == 1):
        p = p - epsilon
.map(lambda lp: len(lp.features.indices)) .sum()) Test.assertEquals(numNZVal, 372080, 'incorrect number of features') # CTR预估和对数损失函数评估,引用MLlib API from pyspark.mllib.classification import LogisticRegressionWithSGD numIters = 50 stepSize = 10. regParam = 1e-6 regType = 'l2' includeIntercept = True model0 = LogisticRegressionWithSGD.train(OHETrainData,iterations=numIters,step=stepSize,regParam=regParam,regType=regType,intercept=includeIntercept) sortedWeights = sorted(model0.weights) print sortedWeights[:5], model0.intercept Test.assertTrue(np.allclose(model0.intercept, 0.56455084025), 'incorrect value for model0.intercept') Test.assertTrue(np.allclose(sortedWeights[0:5], [-0.45899236853575609, -0.37973707648623956, -0.36996558266753304, -0.36934962879928263, -0.32697945415010637]), 'incorrect value for model0.weights') # log损失 from math import log def computeLogLoss(p, y): epsilon = 10e-12 if y == 1 :
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD


def parsePoint(line):
    """
    Parse a line of text into an MLlib LabeledPoint object.

    The line holds whitespace-separated numbers: the first is the label,
    the remaining ones are the features.  A label of -1 is rewritten to 0.
    """
    # split() without an argument tolerates repeated, leading and trailing
    # whitespace, which split(' ') turns into empty strings that float()
    # rejects.
    values = [float(s) for s in line.split()]
    if values[0] == -1:   # Convert -1 labels to 0 for MLlib
        values[0] = 0
    return LabeledPoint(values[0], values[1:])


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: logistic_regression <file> <iterations>", file=sys.stderr)
        # sys.exit is always available; the bare exit() helper is only
        # installed by the site module.
        sys.exit(-1)
    sc = SparkContext(appName="PythonLR")
    try:
        points = sc.textFile(sys.argv[1]).map(parsePoint)
        iterations = int(sys.argv[2])
        model = LogisticRegressionWithSGD.train(points, iterations)
        print("Final weights: " + str(model.weights))
        print("Final intercept: " + str(model.intercept))
    finally:
        # Release the context even when parsing or training fails.
        sc.stop()
OHETrainData = rawTrainData.map(lambda point: parseOHEPoint(point, ctrOHEDict, numCtrOHEFeats)) ##create train labeled points OHETrainData.cache() ##cache OHEValidationData = rawValidationData.map(lambda point: parseOHEPoint(point, ctrOHEDict, numCtrOHEFeats)) ##create validation labeled points OHEValidationData.cache() # running first model with fixed hyperparameters numIters = 50 stepSize = 10. regParam = 1e-6 regType = 'l2' includeIntercept = True print "-------------logistic regression with gradient descent---------" model0 = LogisticRegressionWithSGD.train(data=OHETrainData, iterations=numIters, step=stepSize,regParam=regParam, regType=regType, intercept=includeIntercept) ##train model sortedWeights = sorted(model0.weights) print "------------/logistic regression with gradient descent---------" def computeLogLoss(p, y): epsilon = 10e-12 if (p==0): p = p + epsilon elif (p==1): p = p - epsilon if y == 1: z = -log(p) elif y == 0:
def _is_match(pair):
    """Return True when the (label, prediction) pair agrees."""
    return pair[0] == pair[1]


# Keep-probability applied to every point whose label is not 1.0.
cutoff = float(nrock) / (nrock + nxrock)

# recombine (this union is immediately replaced by the filtered labeledData)
equalSampleData = labeledRock.union(labeledNotRock)
# Points labelled 1.0 are always kept; all others survive with
# probability `cutoff`.
equalSampleData = labeledData.filter(
    lambda point: point.label == 1.0 or random.random() < cutoff)

# split data
trainData, testData = randomSplit(equalSampleData, [0.9, 0.1])
trainData.map(lambda point: (point.label, point.features)).take(3)

# train model
model = LogisticRegressionWithSGD.train(
    trainData, iterations=10000, intercept=False)
# model = LinearRegressionWithSGD.train(trainData, step = 0.1, iterations=1000)
# model = SVMWithSGD.train(trainData, step=1, iterations=1000, intercept=True)

# evaluate model
# labelsAndPreds = testData.map(lambda p: (p.label, 1 if model.predict(p.features) > 0.5 else 0))
labelsAndPreds = testData.map(
    lambda point: (point.label, model.predict(point.features)))
accuracy = labelsAndPreds.filter(_is_match).count() / float(testData.count())

# Precision: share of predicted-1 pairs whose label agrees.
guess1 = labelsAndPreds.filter(lambda pair: pair[1] == 1)
precision1 = guess1.filter(_is_match).count() / float(guess1.count())

# Recall: share of label-1 pairs whose prediction agrees.
act1 = labelsAndPreds.filter(lambda pair: pair[0] == 1)
recall1 = act1.filter(_is_match).count() / float(act1.count())
def main():
    """Train four classifiers on the pickled data set and print F1/accuracy.

    The data is loaded from '/pickleData', standardized, split 70/30 with a
    fixed seed, and used to fit logistic regression (LBFGS and SGD), a
    decision tree and a random forest.  Each model is scored through the
    file's test(), fone() and accuracy() helpers.
    """
    appName = "BadOrGood;zl"
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            # The documented property is plural; the singular spelling is
            # not a Spark setting and is silently ignored.
            .set("spark.executor.instances", "3")
            )
    sc = SparkContext(conf=conf)
    try:
        hc = HiveContext(sc)  # only needed by the disabled fetch step below

        #fetch data
        #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
        #fetchDataToFile(hc, filepath)

        #load data
        # AllDataRawrdd = sc.pickleFile(filepath) \
        #     .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
        #     .repartition(10)
        AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)

        #standardizer for train and test data
        model = StandardScaler(True, True) \
            .fit(AllDataRawrdd
                 .map(lambda _: Vectors.dense(_['feature'])))
        labels = AllDataRawrdd.map(lambda _: _['label'])
        featureTransformed = model.transform(
            AllDataRawrdd.map(lambda _: _['feature']))
        # Re-attach each label to its standardized feature vector.
        AllDataRawrdd = labels \
            .zip(featureTransformed) \
            .map(lambda _: {'label': _[0], 'feature': _[1]})

        #sampling
        trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(
            weights=[0.7, 0.3], seed=100)
        trainDatardd = trainDataRawrdd.map(
            lambda _: LabeledPoint(_['label'], _['feature'])).persist()
        testDatardd = testDataRawrdd.map(
            lambda _: {'label': _['label'],
                       'feature': list(_['feature'])}).persist()

        #prediction & test
        lrmLBFGS = LogisticRegressionWithLBFGS.train(
            trainDatardd, iterations=3000, regParam=0.01, regType="l1")
        resultrdd = test(lrmLBFGS, testDatardd)
        lrmLBFGSFone = fone(resultrdd)
        lrmLBFGSac = accuracy(resultrdd)

        lrmSGD = LogisticRegressionWithSGD.train(
            trainDatardd, iterations=3000, step=0.1, regParam=0.01,
            regType="l1")
        resultrdd = test(lrmSGD, testDatardd)
        lrmSGDFone = fone(resultrdd)
        lrmSGDac = accuracy(resultrdd)

        dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
        resultrdd = test(dt, testDatardd)
        dtFone = fone(resultrdd)
        dtac = accuracy(resultrdd)

        rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
        resultrdd = test(rf, testDatardd)
        rfFone = fone(resultrdd)
        rfac = accuracy(resultrdd)

        # Single-argument print(...) behaves the same as the print statement
        # on Python 2 and is also valid on Python 3.
        print("LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac))
        print("LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac))
        print("Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac))
        print("Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac))

        print(lrmLBFGS.weights)
        print(lrmSGD.weights)
    finally:
        # Stop the context even if loading, training or scoring raises.
        sc.stop()
label = int(trimmed[-1]) features = [convert_na_nb(r) for r in trimmed[4:-1]] return LabeledPoint(label, Vectors.dense(features)) data = records.map(dealwith) total_count = data.count() #为了朴素贝叶斯 nbdata = records.map(dealwithNB) #print data.first() numIterations = 10 maxTreeDepth = 5 #训练逻辑回归模型 lrModel = LogisticRegressionWithSGD.train(data, numIterations) #训练支持向量机模型 svmModel = SVMWithSGD.train(data, numIterations) #训练朴素贝叶斯模型 nbModel = NaiveBayes.train(nbdata) #训练决策树模型 #dtModel=DecisionTree.train(data, Algo.Classification, Entropy, maxDepth) dtModel = DecisionTree.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={}, impurity='entropy', maxDepth=maxTreeDepth, maxBins=32)
return log_loss # In[10]: # try fixed hyperparameters numIters = 500 stepSize = 1 regParam = 1e-6 regType = 'l2' includeIntercept = True model0 = LogisticRegressionWithSGD.train(rawTrainData, iterations=numIters, step=stepSize, miniBatchFraction=1.0, initialWeights=None, regParam=regParam, regType=regType, intercept=includeIntercept) print model0.weights, model0.intercept # In[11]: classOneFracTrain = (rawTrainData.map(lambda x: x.label) .reduce(lambda x, y: x+y))/rawTrainData.count() print classOneFracTrain logLossTrBase = (rawTrainData.map(lambda x: x.label) .map(lambda x: computeLogLoss(classOneFracTrain, x)) .reduce(lambda x, y: x+y))/rawTrainData.count()
def train_logistic_regression(trainRDD, iterations=10):
    """Train a logistic regression model with SGD on *trainRDD*.

    Args:
        trainRDD: training data passed straight to
            ``LogisticRegressionWithSGD.train``.
        iterations: number of SGD iterations; defaults to 10, the value
            that used to be hard-coded here.

    Returns:
        Whatever ``LogisticRegressionWithSGD.train`` returns.
    """
    return LogisticRegressionWithSGD.train(trainRDD, iterations=iterations)
def train(self, num_iterations=10):
    """Fit an SGD logistic regression and wrap it with the feature columns.

    The training input comes from ``self._labeled_feature_vector_rdd()``;
    the fitted model is returned inside a ``LogisticRegressionModel``
    together with ``self.feature_cols``.
    """
    training_rdd = self._labeled_feature_vector_rdd()
    fitted = LogisticRegressionWithSGD.train(training_rdd, num_iterations)
    return LogisticRegressionModel(fitted, self.feature_cols)
hashValidationData.cache()
hashTestData = rawTestData.map(
    lambda rawPoint: parseHashPoint(rawPoint, numBucketsCTR)).cache()

# ===================================================
# train logistic regression model
# ===================================================
numIters = 100
stepSize = 10.
regParam = 0.  # no regularization
regType = 'l2'
includeIntercept = True

model = LogisticRegressionWithSGD.train(
    hashTrainData,
    iterations=numIters,
    step=stepSize,
    regParam=regParam,
    regType=regType,
    intercept=includeIntercept,
)

# Weights in ascending order; the first five are reported on stderr.
sortedWeights = sorted(model.weights)
sys.stderr.write('\n Model Intercept: {0}'.format(model.intercept))
sys.stderr.write('\n Model Weights (Top 5): {0}\n'.format(
    sortedWeights[:5]))

# One metrics entry per split, in TRAIN / VALIDATE / TEST order.
l_metrics = [
    evaluateMetrics(model, dataset, tag)
    for dataset, tag in ((hashTrainData, 'TRAIN'),
                         (hashValidationData, 'VALIDATE'),
                         (hashTestData, 'TEST'))
]
sc.parallelize(l_metrics).saveAsTextFile(sys.argv[4])
def create_model(self, data, params):
    """Train an SGD logistic regression on *data*.

    ``params['numIterations']`` (default 10) is converted to int and used
    as the iteration count; every record of *data* is first mapped through
    ``self.parsePoint``.
    """
    requested = params.get('numIterations', 10)
    iteration_count = int(requested)
    labeled = data.map(self.parsePoint)
    return LogisticRegressionWithSGD.train(labeled, iteration_count)
return None #set hdfs path #data = sc.sequenceFile("hdfs://nameservice1/user/geap/warehouse/camus/etl/rat/hourly/2015/06/01/00/*") data = sc.textFile( "hdfs://nameservice1/user/geap/warehouse/geap.db/user_hist_plain/year=2015/*/*/*/*" ) parsedData = data.filter(filterPoint).map(parsePoint).filter( lambda kv: kv != None).reduceByKey(lambda x, y: x + y).map( lambda (k, v): list(set(v))) parsedData.cache() #Calculate total number of columns in the dataset column_num = parsedData.flatMap(lambda _: _).distinct().count() column_id = parsedData.flatMap(lambda _: _).distinct().collect() column_id.sort() #choose a genre to test, default is 100th column as target variable genre = 1 sortedData = parsedData.map(sortPoint).filter(lambda p: p != None) labeledData = sortedData.map(lambda line: (line, genre)).map(labelData).filter( lambda p: p != None) LRSGDmodel = LogisticRegressionWithSGD.train(labeledData) print LRSGDmodel.weights
def anom_with_lr():
    """Train an SGD logistic regression on anomalous vs. sampled benign patients.

    Reads the patient/procedure CSV, keeps all rows flagged anomalous plus a
    random sample of the benign rows, trains and evaluates a logistic
    regression on a 50/50 split, then scores the benign rows left out of the
    sample and writes the 10,000 highest-scoring ones to CSV.

    Returns:
        The RDD of (features, predicted score, label) tuples for the
        left-out benign rows, or None when an exception occurred before
        that RDD was built.  Exceptions are printed, not re-raised.
    """
    # Bound up-front so the final return cannot raise UnboundLocalError when
    # an early step fails and the handler below swallows the exception.
    for_finding_more = None
    try:
        plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv") #69.2 MB
        pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep = ",")

        anom = pat_proc.filter(pat_proc.is_anomalous == 1)
        benign = pat_proc.filter(pat_proc.is_anomalous == 0)
        n_benign = benign.count()

        #Take a random sample of 50K from the unlabeled 100K
        sqlContext.registerFunction("my_random", lambda x: x - x + random())
        sqlContext.registerDataFrameAsTable(benign, "benign")
        benign = sqlContext.sql("SELECT *, my_random(is_anomalous) as random_number FROM benign")
        # Float literal forces true division; with integer division the
        # threshold collapses to 0 whenever n_benign > 50000.
        threshold = 50000.0 / n_benign
        into_model = benign.filter(benign.random_number <= threshold)
        for_finding_more = benign.filter(benign.random_number > threshold)

        for_modeling = anom.unionAll(into_model.drop(into_model.random_number))
        for_finding_more = for_finding_more.drop(for_finding_more.random_number)
        #Try to pull this from a much larger sample, or, the entire data, because the ones with lowest probabilities, among
        #the selected 10,000, have probabilities around 0.05
        print("anom.count() = " + str(anom.count()) + ", benign.count() = " + str(benign.count()) + ", into_model.count() = " + str(into_model.count()) + ", for_modeling.count() = " + str(for_modeling.count()) + ", for_finding_more.count() = " + str(for_finding_more.count()))

        all_columns = for_modeling.columns
        features = [x for x in all_columns if (x not in ["patient_id", "is_anomalous"])]
        #We are listing these 3 as categorical features only as the procedure features have 0-1 values anyway
        categorical_features = ["age_group", "gender", "income_range"]
        procedure_features = [x for x in features if (x not in categorical_features)]

        #Unlike decision tree, logistic regression does not need the map categoricalFeaturesInfo, just an RDD of LabeledPoint objects.
        #Create a dictionary where the key-value pairs are as follows: key is the name of the categorical feature, and value is a list with the following entries:
        #1) an id of the feature that is incremented sequentially, 2) no. of distinct values of the feature, 3) a list of the distinct values of the feature.
        dict_cat_features = {}
        for cat_feature_number, feature in enumerate(categorical_features):
            agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect()
            # Each row carries a single column; take its value and drop any
            # non-ASCII characters.
            distinct_values = sorted(
                list(row.asDict().values())[0].encode('ascii', 'ignore')
                for row in agvalues)
            dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]

        def to_labeled_point(row):
            # Shared by the training, test and left-out conversions below.
            return create_labeled_point(row, features, categorical_features, dict_cat_features, procedure_features)

        for_modeling = for_modeling.rdd
        print("for_modeling.getNumPartitions() = " + str(for_modeling.getNumPartitions())) #4 partitions: the default should be the number of logical cores, which is 8
        (train, test) = for_modeling.randomSplit([0.5, 0.5])
        test_data_size = test.count()
        print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
        training_data = train.map(to_labeled_point)
        print("training_data.count() = " + str(training_data.count()))

        t0 = time()
        #model = LogisticRegressionWithLBFGS.train(training_data) #LBFGS took 66.766 seconds
        model = LogisticRegressionWithSGD.train(training_data) #SGCD took 69.261 seconds
        tt = time() - t0
        print("Classifier trained in {} seconds".format(round(tt,3)))

        test_data = test.map(to_labeled_point)
        t0 = time()
        # Kept only for the timing below; the result itself is never read.
        predictions = model.predict(test_data.map(lambda p: p.features))
        tt = time() - t0
        #Reports as 0.0 seconds (presumably because no action runs on the RDD here)
        print("Prediction made in {} seconds".format(round(tt,3)))

        # Index-based lambdas replace tuple-parameter lambdas, which are
        # Python 2-only syntax.
        labelsAndPreds = test_data.map(lambda p: (p.label, model.predict(p.features)))
        test_accuracy = labelsAndPreds.filter(lambda vp: vp[0] == vp[1]).count() / float(test_data_size)
        # float() keeps the rates fractional regardless of the division mode.
        fpr = (labelsAndPreds.filter(lambda vp: vp[0] == 0 and vp[1] == 1).count()
               / float(labelsAndPreds.filter(lambda vp: vp[0] == 0).count()))
        fnr = (labelsAndPreds.filter(lambda vp: vp[0] == 1 and vp[1] == 0).count()
               / float(labelsAndPreds.filter(lambda vp: vp[0] == 1).count()))
        #Test accuracy is 0.9057, fpr is 0.1634, fnr is 0.0282
        print("Test accuracy is {}, fpr is {}, fnr is {}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)))

        # With the threshold cleared, predict() yields raw scores instead of
        # 0/1 labels; they are stored as 'predicted_prob' below.
        model.clearThreshold()
        for_finding_more = for_finding_more.map(to_labeled_point) #OK
        for_finding_more = for_finding_more.map(lambda p: (p.features, model.predict(p.features), p.label)) #OK
        try:
            for_finding_more.first() #We perform an action here because otherwise the output will be a PipelinedRDD.
            #Reverse-sort the additional patients by their predicted probabilities of being anomalous and take the top 10,000
            #for_finding_more.take(5)
        except EOFError:
            print("EOF handled")

        df = sqlContext.createDataFrame(for_finding_more.collect(), ['features', 'predicted_prob', 'is_anom'])
        #The orderBy is not actually called if collect() is not called. Can be also triggered by calling take(). We are triggering it by the writing in the next statement.
        df = df.orderBy(df.predicted_prob.desc())
        #Top one has probability of 0.86818, last one has probability 0.5928958
        df.select('is_anom', 'predicted_prob').limit(10000).write.format('com.databricks.spark.csv').save('file:///Users/blahiri/healthcare/data/cloudera_challenge/additional_10000_from_spark.csv')
    except Exception:
        # Best-effort boundary: report the failure and fall through to return.
        print("Exception in user code:")
        traceback.print_exc(file = sys.stdout)
    return for_finding_more
pos_file = "data/training_positif_clean.csv" neg_file = "data/training_negatif_clean.csv" training_idf = training_set(pos_file, neg_file) training = training_idf[0] idf = training_idf[1] test_file = "data/test_clean" + str(part) + ".csv" test = test_set(test_file, idf) print("\nDone : Tf-IDF training and test sets") ########################################################################### ######### Model Training ######### model_regression = LogisticRegressionWithSGD.train(training) print("Done : regression training ") ########################################################################### ######### Model Testing ######### #regression predictions_regression = model_regression.predict(test) num_pos_regression = predictions_regression.countByValue()[1.0] num_neg_regression = predictions_regression.countByValue()[0.0] print("\n== PREDICTION REGRESSION : ==\n") print("- Positive : ", num_pos_regression) print("- Negative : ", num_neg_regression) file.write(
# 6. Build LabeledPoint datasets: label 1 for the positive (spam) feature
#    vectors, label 0 for the negative (ham) ones.
positive_examples = spam_features.map(lambda vec: LabeledPoint(1, vec))
negative_examples = ham_features.map(lambda vec: LabeledPoint(0, vec))

# 7. Combine both sets into the training data and cache it, since Logistic
#    Regression is an iterative algorithm.  collect() lets us inspect it.
training_data = positive_examples.union(negative_examples).cache()
training_data.collect()

# 8. Fit Logistic Regression with the SGD optimizer, then look at the model.
model = LogisticRegressionWithSGD.train(training_data)
model

# 9. Prepare one positive (spam) and one negative (ham) test example with the
#    same HashingTF transformation that produced the training features.
pos_example = tf.transform("No investment required".split(" "))
neg_example = tf.transform(
    "Data Science courses recommended for you".split(" "))

# 10. Use the learned model to predict spam/ham for the new emails.
# Prediction for positive test: 1
print("Prediction for positive test: %g" % model.predict(pos_example))
# Prediction for negative test: 0
print("Prediction for negative test: %g" % model.predict(neg_example))
train_data_formatted = train_data.map(create_labeled_point)

# Element-wise minimum of every feature across the training set.
train_min_feat = (train_data_formatted
                  .map(lambda point: point.features)
                  .reduce(lambda left, right: np.minimum(left, right)))

# Subtract that minimum so every training feature is non-negative (>= 0).
train_data_formatted_pos = train_data_formatted.map(
    lambda point: LabeledPoint(point.label, point.features - train_min_feat))

# Classification using two models.
from pyspark.mllib.classification import NaiveBayes, LogisticRegressionWithSGD
modelNB = NaiveBayes.train(train_data_formatted_pos)
modelLR = LogisticRegressionWithSGD.train(train_data_formatted_pos)

# Read the test data, dropping every row equal to the training header.
test_full = sc.textFile('file:///home/cloudera/bank-additional.csv')
test_data_raw = test_full.filter(lambda row: row != train_header)

# Strip quotes and line terminators, split on ';', keep 21-field rows only.
test_data = (test_data_raw
             .map(lambda line: line.replace('"', '')
                                   .replace('\n', '')
                                   .replace('\r', '')
                                   .split(';'))
             .filter(lambda fields: len(fields) == 21))
test_data_formatted = test_data.map(create_labeled_point)

# Shift the test features by the training minimum as well.
test_data_features = test_data_formatted.map(
    lambda point: point.features - train_min_feat)
test_data_true_label = test_data_formatted.map(
    lambda point: point.label).collect()
print(test_data_true_label[:100])
def _clean_fields(line):
    """Drop parentheses, turn 'None' into '0', and split on commas."""
    text = str(line)
    for old, new in (('(', ''), (')', ''), ('None', '0')):
        text = text.replace(old, new)
    return text.split(',')


def f1(line):
    """Return the first comma-separated field of a cleaned record."""
    return _clean_fields(line)[0]


def parsePoint(line):
    """Build a LabeledPoint from fields 2 onward: label, then features."""
    values = [float(x) for x in _clean_fields(line)[2:]]
    return LabeledPoint(values[0], values[1:])


def testParsePoint(line):
    """Build a LabeledPoint from fields 1 onward: label, then features."""
    values = [float(x) for x in _clean_fields(line)[1:]]
    return LabeledPoint(values[0], values[1:])


# Users of the validation data, in file order.
table1 = sc.textFile("/user/team322/junli_testFeature/*")
user = table1.map(f1).collect()

# Load and parse the training data, then build the model.
result6 = sc.textFile("/user/team322/junli_trainFeature/*")
parsedData = result6.map(parsePoint)
model = LogisticRegressionWithSGD.train(parsedData)

# Predict on the test features and bring the predictions back as a list.
result7 = sc.textFile("/user/team322/junli_testFeature/*")
parsedData2 = result7.map(testParsePoint)
preds = parsedData2.map(lambda p: model.predict(p.features))
preds = preds.collect()

# Keep the user at each position whose prediction is 1.
userID = [user[i] for i, pred in enumerate(preds) if pred == 1]

# Create a parallelized collection from the selected users and save it.
sc.parallelize(userID).saveAsTextFile('/user/team322/solution_v')
t2 = time.ctime()
def test_classification(self):
    """Train each classifier on four points and check its predictions.

    Points 0 and 2 are labelled 0.0, points 1 and 3 are labelled 1.0; every
    model must predict <= 0 for the former and > 0 for the latter.  The tree
    models are additionally saved, reloaded and compared via toDebugString().
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest, \
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_predictions(model):
        # Same four assertions the test applies to every model.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    def check_round_trip(model, model_cls, subdir):
        # Save under temp_dir, reload, and compare the debug strings.
        model_dir = os.path.join(temp_dir, subdir)
        model.save(self.sc, model_dir)
        reloaded = model_cls.load(self.sc, model_dir)
        self.assertEqual(reloaded.toDebugString(), model.toDebugString())

    temp_dir = tempfile.mkdtemp()
    try:
        check_predictions(LogisticRegressionWithSGD.train(rdd, iterations=10))
        check_predictions(SVMWithSGD.train(rdd, iterations=10))
        check_predictions(NaiveBayes.train(rdd))

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        check_predictions(dt_model)
        check_round_trip(dt_model, DecisionTreeModel, "dt")

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10, maxBins=4, seed=1)
        check_predictions(rf_model)
        check_round_trip(rf_model, RandomForestModel, "rf")

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        check_predictions(gbt_model)
        check_round_trip(gbt_model, GradientBoostedTreesModel, "gbt")
    finally:
        # Clean up even when an assertion or training call fails, so failed
        # runs do not leave temp directories behind.
        try:
            rmtree(temp_dir)
        except OSError:
            pass