def main():
    """Driver program for a spam filter using Spark and MLlib.

    Loads the spam and ham text files, hashes every line into a
    10,000-feature vector, trains a logistic regression model on a 70%
    split, prints the error rate measured on the remaining 30% and pickles
    the trained model to ``spamFilter.pkl``.
    """
    # Create the Spark Context for parallel processing
    sc = SparkContext(appName="Spam Filter")
    try:
        # Load the spam and ham data files into RDDs
        spam = sc.textFile(
            "E:\\Personal\\Imp Docs\\Spark Projects\\Spam-Ham\\20050311_spam_2.tar\\20050311_spam_2\\spam.txt"
        )
        ham = sc.textFile(
            "E:\\Personal\\Imp Docs\\Spark Projects\\Spam-Ham\\20030228_easy_ham.tar\\20030228_easy_ham\\ham.txt"
        )

        # Create a HashingTF instance to map email text to vectors of 10,000 features.
        tf = HashingTF(numFeatures=10000)

        # Each email is split into words, and each word is mapped to one feature.
        spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
        hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

        # Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
        positiveExamples = spamFeatures.map(
            lambda features: LabeledPoint(1, features))
        negativeExamples = hamFeatures.map(
            lambda features: LabeledPoint(0, features))

        # Combine positive and negative datasets into one
        data = positiveExamples.union(negativeExamples)

        # Split the data into 70% for training and 30% test data sets
        (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # Cache the training data to optimize the Logistic Regression
        trainingData.cache()

        # Train the model with Logistic Regression using the SGD algorithm.
        model = LogisticRegressionWithSGD.train(trainingData)

        # Create tuples of actual and predicted values
        labels_and_predictions = testData.map(
            lambda email: (email.label, model.predict(email.features)))

        # Calculate the error rate as number wrong / total number.
        # The pair is indexed instead of unpacked in the lambda signature:
        # tuple parameters are a SyntaxError on Python 3.
        error_rate = labels_and_predictions.filter(
            lambda pair: pair[0] != pair[1]).count() / float(testData.count())
    finally:
        # End the Spark Context even when loading or training fails
        sc.stop()

    # Print out the error rate
    print("*********** SPAM FILTER RESULTS **********")
    print("\n")
    print("Error Rate: " + str(error_rate))
    print("\n")

    # Serialize the model for persistence; the context manager closes the file
    with open("spamFilter.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
def test_classification(self):
    """Check three MLlib classifiers on features built with ``self.scipy_matrix``.

    Each classifier is trained on the same four labelled points and must
    predict a value <= 0 for the points labelled 0.0 and > 0 for the points
    labelled 1.0.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes

    # (label, {index: value}) for every training point
    samples = [
        (0.0, {0: 1.0}),
        (1.0, {1: 1.0}),
        (0.0, {0: 2.0}),
        (1.0, {1: 2.0}),
    ]
    data = [LabeledPoint(label, self.scipy_matrix(2, entries))
            for label, entries in samples]
    rdd = self.sc.parallelize(data)
    features = [point.features for point in data]

    # Train and verify one classifier at a time, in the original order.
    for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
        trained = trainer.train(rdd)
        self.assertTrue(trained.predict(features[0]) <= 0)
        self.assertTrue(trained.predict(features[1]) > 0)
        self.assertTrue(trained.predict(features[2]) <= 0)
        self.assertTrue(trained.predict(features[3]) > 0)
def test_classification(self):
    """Check three MLlib classifiers on plain Python list features.

    Each classifier is trained on the same four labelled points and must
    predict a value <= 0 for the points labelled 0.0 and > 0 for the points
    labelled 1.0.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes

    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1]),
    ]
    rdd = self.sc.parallelize(data)
    # Predictions are requested on plain lists, not on the stored vectors.
    features = [point.features.tolist() for point in data]

    # Train and verify one classifier at a time, in the original order.
    for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
        trained = trainer.train(rdd)
        self.assertTrue(trained.predict(features[0]) <= 0)
        self.assertTrue(trained.predict(features[1]) > 0)
        self.assertTrue(trained.predict(features[2]) <= 0)
        self.assertTrue(trained.predict(features[3]) > 0)
def trainevaluatemodel_logit(model, traindata, validationdata, iterations, step, minibatchfraction, regparam):
    """Train an L2-regularised logistic regression and time its evaluation.

    The incoming ``model`` argument is never read; a fresh model is always
    trained. The model is scored with ``evaluation2`` on ``validationdata``
    and a parameter summary is printed.

    Returns:
        Tuple ``(iterations, step, minibatchfraction, regparam, duration,
        index)`` where ``duration`` covers training plus evaluation and
        ``index`` is whatever ``evaluation2`` returned.
    """
    started = time()
    trained = LogisticRegressionWithSGD.train(
        traindata,
        iterations=iterations,
        step=step,
        miniBatchFraction=minibatchfraction,
        initialWeights=None,
        regParam=regparam,
        regType='l2',
        intercept=False,
        validateData=True,
        convergenceTol=0.001)
    index = evaluation2(trained, validationdata)
    duration = time() - started

    # One "name:value" row per reported quantity, under a "Param:" header.
    rows = [
        ('iterations', iterations),
        ('step', step),
        ('minibatchfraction', minibatchfraction),
        ('regparam', regparam),
        ('time', duration),
        ('index', index),
    ]
    print('Param:' + '\n' + '\n'.join(
        name + ':' + str(value) for name, value in rows))
    return (iterations, step, minibatchfraction, regparam, duration, index)
def main():
    """Train and evaluate a logistic-regression image classifier with Spark.

    Runs ``MakePixelFileFromImages`` on the training images, reads
    ``pos.csv`` and ``neg.csv``, trains on a 70% split, prints the error
    rate on the remaining 30% and pickles the model to ``imageModel.pk1``.
    """
    MakePixelFileFromImages("./CarData/TrainImages/*pgm")
    sc = SparkContext(appName="Image Classifier 01")
    try:
        p = sc.textFile("pos.csv")
        n = sc.textFile("neg.csv")

        # Every CSV row becomes the feature list of one example.
        pFeatures = p.map(lambda image: image.split(","))
        nFeatures = n.map(lambda image: image.split(","))
        pExamples = pFeatures.map(lambda features: LabeledPoint(1, features))
        nExamples = nFeatures.map(lambda features: LabeledPoint(0, features))

        data = pExamples.union(nExamples)
        (trainingData, testData) = data.randomSplit([0.7, 0.3])
        trainingData.cache()

        model = LogisticRegressionWithSGD.train(trainingData)
        labels_and_predictions = testData.map(
            lambda image: (image.label, model.predict(image.features)))

        # Index the pair instead of unpacking it in the lambda signature:
        # tuple parameters are a SyntaxError on Python 3.
        error_rate = labels_and_predictions.filter(
            lambda pair: pair[0] != pair[1]).count() / float(testData.count())

        print("************* RESULTS *******************")
        print("Error Rate: " + str(error_rate))

        # The context manager guarantees the pickle file is flushed and closed.
        with open("imageModel.pk1", "wb") as model_file:
            pickle.dump(model, model_file)
    finally:
        # Release the Spark context even when a step above fails.
        sc.stop()
def modelWithLogisticRegression(trainingData, validationData):
    """Select the best SGD logistic-regression model over a regularisation grid.

    A model is trained for every regularisation value (200 iterations, step
    size 1.0) and scored by accuracy on ``validationData``.

    Args:
        trainingData: RDD used to fit every candidate model.
        validationData: RDD whose elements expose ``label`` and ``features``.

    Returns:
        Tuple ``(bestLRModel, visualizationData)``. ``visualizationData`` is
        a list of ``(regularizer, accuracy)`` pairs in grid order.
        ``bestLRModel`` is ``None`` when no candidate scores above 0.
    """
    regularizationParamater = [.00000001, .0000005, 1., 1000., 100000.]
    numOfIterations = 200

    # The validation size does not depend on the candidate model, so the
    # Spark count job runs once instead of once per grid value.
    totalValidationAds = validationData.count()

    bestLRModel = None
    bestAccuracy = 0
    visualizationData = []
    for regularizer in regularizationParamater:
        model = LogisticRegressionWithSGD.train(
            trainingData, numOfIterations, 1.0, regParam=regularizer)
        predict = validationData.map(
            lambda ad: (ad.label, model.predict(ad.features)))
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds
        visualizationData.append((regularizer, accuracy))
        # Strict comparison: on ties the earliest grid value is kept.
        if accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestLRModel = model
    return bestLRModel, visualizationData
def trainAndTestLG(train_lbl_vec, test_lbl_vec, regParam, lastTime):
    """Train an L1-regularised logistic regression and score it on both sets.

    Args:
        train_lbl_vec: RDD of 2-tuples; element 0 is passed to ``LabeledPoint``
            as the label and element 1 as the features.
        test_lbl_vec: RDD with the same layout, used for testing.
        regParam: Regularisation parameter forwarded to the trainer.
        lastTime: Reference timestamp; the elapsed time is measured from it.

    Returns:
        List ``[elapsedTime, trainAccuracy, testAccuracy]``; both accuracies
        come from the ``accuracy`` helper applied to ``countByValue`` results.
    """
    def to_labeled_point(pair):
        # Index the pair instead of unpacking it in the signature: tuple
        # parameters are a SyntaxError on Python 3.
        return LabeledPoint(pair[0], pair[1])

    # create LabeledPoints for training
    lblPnt = train_lbl_vec.map(to_labeled_point)

    # train the model
    model = LogisticRegressionWithSGD.train(
        lblPnt, miniBatchFraction=0.1, regType='l1', intercept=True,
        regParam=regParam)

    # evaluate training
    resultsTrain = lblPnt.map(
        lambda lp: (lp.label, model.predict(lp.features)))
    trainAccuracy = accuracy(resultsTrain.countByValue())

    # test the model
    data = test_lbl_vec.map(to_labeled_point)
    resultsTest = data.map(
        lambda lp: (lp.label, model.predict(lp.features)))
    testAccuracy = accuracy(resultsTest.countByValue())

    elapsedTime = time() - lastTime
    return [elapsedTime, trainAccuracy, testAccuracy]
def modelWithLogisticRegression(trainingData, validationData):
    """Select the best SGD logistic-regression model over a regularisation grid.

    A model is trained for every regularisation value (200 iterations, step
    size 1.0) and scored by accuracy on ``validationData``.

    Args:
        trainingData: RDD used to fit every candidate model.
        validationData: RDD whose elements expose ``label`` and ``features``.

    Returns:
        Tuple ``(bestLRModel, visualizationData)``. ``visualizationData`` is
        a list of ``(regularizer, accuracy)`` pairs in grid order.
        ``bestLRModel`` is ``None`` when no candidate scores above 0.
    """
    regularizationParamater = [.00000001, .0000005, 1., 1000., 100000.]
    numOfIterations = 200

    # The validation size does not depend on the candidate model, so the
    # Spark count job runs once instead of once per grid value.
    totalValidationAds = validationData.count()

    bestLRModel = None
    bestAccuracy = 0
    visualizationData = []
    for regularizer in regularizationParamater:
        model = LogisticRegressionWithSGD.train(
            trainingData, numOfIterations, 1.0, regParam=regularizer)
        predict = validationData.map(
            lambda ad: (ad.label, model.predict(ad.features)))
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds
        visualizationData.append((regularizer, accuracy))
        # Strict comparison: on ties the earliest grid value is kept.
        if accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestLRModel = model
    return bestLRModel, visualizationData
def train_committee(train_features, test_features, size=5):
    """Collect up to ``size`` logistic-regression models with ROC AUC above 0.7.

    Every attempt trains a model on a with-replacement sample (fraction 1)
    of the labelled training pairs and scores it on the labelled test pairs.
    At most ``size * 4`` attempts are made.

    Args:
        train_features: RDD fed through ``process_batch`` and
            ``to_labeled_point`` to build training points.
        test_features: RDD processed the same way for evaluation.
            NOTE(review): ``is_train=True`` is used for the test pairs as
            well; confirm this is intended.
        size: Desired number of committee members.

    Returns:
        List of accepted models; may hold fewer than ``size`` entries.
    """
    roc_threshold = 0.7
    max_attempts = size * 4
    committee = []

    test_labeled_pairs = test_features.map(
        lambda p: process_batch(p, is_train=True)).map(to_labeled_point)

    for attempts in range(1, max_attempts + 1):
        if len(committee) >= size:
            break
        labeled_points = (train_features
                          .map(lambda p: process_batch(p, is_train=True))
                          .map(to_labeled_point)
                          .sample(True, 1))
        model = LogisticRegressionWithSGD.train(labeled_points)
        # Clear the threshold so predict() yields raw scores for the metrics.
        model.clearThreshold()
        scores_and_labels = test_labeled_pairs.map(
            lambda p: (model.predict(p.features), p.label))
        metrics = BinaryClassificationMetrics(scores_and_labels)
        if metrics.areaUnderROC > roc_threshold:
            print(attempts, metrics.areaUnderROC)
            committee.append(model)
    return committee
def processData(sc):
    """Train a logistic regression on ``DATA_FILE`` and pickle test predictions.

    The raw file is parsed with ``mapper``, a model is trained on all parsed
    points, the training error is printed, and predictions for the
    module-level ``parsed_test_data`` RDD are pickled to disk.

    Args:
        sc: SparkContext used to read the input file.
    """
    # load and parse the data
    raw_data = sc.textFile(DATA_FILE)
    raw_data.persist()
    print("Train data size {}".format(raw_data.count()))

    # map data to a format needed for logistic regression
    parsedData = raw_data.map(mapper)
    # Two spaces reproduce the output of the former two-argument print.
    print("Sample of input to algorithm  {}".format(parsedData.take(10)))

    # Train model
    t0 = time()
    model = LogisticRegressionWithSGD.train(parsedData)
    t1 = time() - t0
    print("Classifier trained in {} seconds".format(round(t1, 3)))

    labelsAndPreds = parsedData.map(
        lambda point: (point.label, model.predict(point.features)))

    # Evaluating the model on training data. An error rate counts the
    # mismatches; the previous `v == p` filter reported accuracy under the
    # "Training Error" label.
    trainErr = labelsAndPreds.filter(
        lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))

    print("*************************** TESTING NOW ***********************")
    # NOTE(review): parsed_test_data is not defined in this function and its
    # elements are handed to predict() as-is; confirm they are feature vectors.
    preds = parsed_test_data.map(lambda point: model.predict(point))
    with open('/home/ashish/Desktop/preds.pickle', 'wb') as f:
        pickle.dump(preds.collect(), f)
def trainEvaluateModel(trainData, validationData, numIterationsParm, stepSizeParm, miniBatchFractionParm):
    '''
    Train one logistic-regression model and evaluate it on validation data.

    The model is trained with LogisticRegressionWithSGD.train using the given
    iteration count, step size and mini-batch fraction, then scored with
    evaluateModel on validationData. The parameters, elapsed time and AUC are
    printed. The returned tuple is shaped so that a caller can collect one
    result per parameter combination and compare AUC against run time.

    :param trainData: training data passed to the trainer
    :param validationData: data passed to evaluateModel together with the model
    :param numIterationsParm: number of SGD iterations
    :param stepSizeParm: SGD step size
    :param miniBatchFractionParm: mini-batch fraction
    :return: (AUC, duration, numIterationsParm, stepSizeParm,
              miniBatchFractionParm, model)
    '''
    print('======================= 训练评估模型 =======================')
    began = time()
    model = LogisticRegressionWithSGD.train(
        trainData, numIterationsParm, stepSizeParm, miniBatchFractionParm)
    AUC = evaluateModel(model, validationData)
    # The duration covers both training and evaluation.
    duration = time() - began

    pieces = [
        '========== [trainEvaluateModel] >>>> 训练评估模型:使用参数:numIterations=',
        str(numIterationsParm),
        ', stepSize=', str(stepSizeParm),
        ', miniBatchFraction=', str(miniBatchFractionParm),
        '\n',
        '\t\t==>> 所需时间=', str(duration),
        ', 结果AUC=', str(AUC),
    ]
    print(''.join(pieces))
    return (AUC, duration, numIterationsParm, stepSizeParm,
            miniBatchFractionParm, model)
def trainEvaluateModel(trainData, validationData, numIterations, stepSize, miniBatchFraction):
    """Train a logistic regression with SGD and report its validation AUC.

    Returns:
        Tuple ``(AUC, duration, numIterations, stepSize, miniBatchFraction,
        model)``; ``duration`` covers training plus evaluation.
    """
    began = time()
    model = LogisticRegressionWithSGD.train(
        trainData,
        iterations=numIterations,
        step=stepSize,
        miniBatchFraction=miniBatchFraction)
    AUC = evaluateModel(model, validationData)
    duration = time() - began

    print("训练评估:numIterations->", numIterations,
          ", stepSize->", stepSize,
          ", miniBatchFraction->", miniBatchFraction)
    print("==> 所需时间:", duration, "s ,AUC=", AUC)

    outcome = (AUC, duration, numIterations, stepSize, miniBatchFraction, model)
    return outcome
def trian_model(spam, nospam):
    """Train a logistic regression that separates spam from non-spam.

    Both inputs are vectorised with the module-level ``tf`` transformer;
    spam vectors are labelled 1 and non-spam vectors 0.

    Returns:
        The model trained on the union of both labelled sets.
    """
    positives = tf.transform(spam).map(lambda vec: LabeledPoint(1, vec))
    negatives = tf.transform(nospam).map(lambda vec: LabeledPoint(0, vec))
    return LogisticRegressionWithSGD.train(positives.union(negatives))
def logistic_l2_accuracy(x_train, x_test, regParam):
    """Return the test accuracy of an L2-regularised logistic regression.

    Args:
        x_train: RDD used for training; it is cached before training.
        x_test: RDD whose elements expose ``label`` and ``features``.
        regParam: Regularisation parameter forwarded to the trainer.

    Returns:
        Fraction of ``x_test`` points whose prediction equals the label.
    """
    # cache data to get reasonable speeds for methods like LogisticRegression and SVM
    xc = x_train.cache()

    # training logistic regression with L2 regularization
    model = LogisticRegressionWithSGD.train(xc, regParam=regParam, regType="l2")

    # making prediction on x_test
    yhat = x_test.map(lambda p: (p.label, model.predict(p.features)))

    # Index the pair instead of unpacking it in the lambda signature: tuple
    # parameters are a SyntaxError on Python 3.
    correct = yhat.filter(lambda pair: pair[0] == pair[1]).count()
    return correct / float(x_test.count())
def main(sc):
    """Time SGD logistic-regression training on a sample of the CTC data.

    Args:
        sc: SparkContext used to read ``input/ctc_data.txt``.
    """
    train_data = sc.textFile("input/ctc_data.txt").map(parsePoint)

    # randomSplit returns a list of RDDs, one per weight; the trainer needs a
    # single RDD, so passing the list through failed. The first (0.2) split
    # is used here.
    # NOTE(review): confirm the 20% split, not the 80% one, was intended.
    parsedTrainData, _ = train_data.randomSplit(weights=[0.2, 0.8])

    start = time.time()
    # Only the training time is of interest; the model itself is not used.
    model = LogisticRegressionWithSGD.train(parsedTrainData)
    end = time.time()

    time_elapsed = end - start
    output = "\nusing SGD " + str(time_elapsed)
    print(output)
def trainEvaluateModel(trainData, validationData, numInterations, stepSize, minibatchFaction):
    """Train a logistic regression with SGD and evaluate it on validation data.

    Args:
        trainData: Training data passed to the trainer.
        validationData: Data passed to ``evaluateModel`` with the model.
        numInterations: Number of SGD iterations.
        stepSize: SGD step size.
        minibatchFaction: Mini-batch fraction.

    Returns:
        Tuple ``(AUC, numInterations, stepSize, minibatchFaction, model)``.
        The elapsed time is printed but is not part of the result.
    """
    startTime = time()
    model = LogisticRegressionWithSGD.train(
        trainData, numInterations, stepSize, minibatchFaction)
    AUC = evaluateModel(model, validationData)
    durintation = time() - startTime
    # print() with a single argument behaves identically on Python 2 and 3.
    print('durintation' + str(durintation))
    return (AUC, numInterations, stepSize, minibatchFaction, model)
def lr(trainingData, testData, trainingSize, testSize):
    """Grid-search an SGD logistic regression and print train/test error.

    Every combination of iteration count, regularisation parameter, step
    size and regularisation type is trained and scored on the training
    data. The combination with the lowest training error is retrained and
    its training and test error rates are printed.

    NOTE(review): parameters are selected on training error, not on a
    held-out validation set.

    Args:
        trainingData: RDD whose elements expose ``label`` and ``features``.
        testData: RDD with the same layout, used for the final test error.
        trainingSize: Number of training points (error-rate denominator).
        testSize: Number of test points (error-rate denominator).
    """
    def error_rate(model, dataset, size):
        # Share of points whose prediction differs from the label. The pair
        # is indexed because tuple parameters are invalid on Python 3.
        labelsAndPreds = dataset.map(
            lambda p: (p.label, model.predict(p.features)))
        return labelsAndPreds.filter(
            lambda vp: vp[0] != vp[1]).count() / float(size)

    def show(*values):
        # Space-separated output, matching the former print statements.
        print(" ".join(str(value) for value in values))

    numIterValList = [100, 200]
    regParamValList = [0.01, 0.1, 1, 10, 100]
    stepSizeValList = [0.1, 0.5, 1]
    regTypeValList = ['l2', 'l1']

    # variables for the best parameters
    bestNumIterVal = 200
    bestRegParamVal = 0.01
    bestStepSizeVal = 1
    bestRegTypeVal = 'l2'
    bestTrainErr = 100

    grid = itertools.product(
        numIterValList, regParamValList, stepSizeValList, regTypeValList)
    for numIterVal, regParamVal, stepSizeVal, regTypeVal in grid:
        model = LogisticRegressionWithSGD.train(
            trainingData, iterations=numIterVal, regParam=regParamVal,
            step=stepSizeVal, regType=regTypeVal)
        trainErr = error_rate(model, trainingData, trainingSize)
        if trainErr < bestTrainErr:
            bestNumIterVal = numIterVal
            bestRegParamVal = regParamVal
            bestStepSizeVal = stepSizeVal
            bestRegTypeVal = regTypeVal
            bestTrainErr = trainErr
        show(numIterVal, regParamVal, stepSizeVal, regTypeVal, trainErr)
    show(bestNumIterVal, bestRegParamVal, bestStepSizeVal, bestRegTypeVal,
         bestTrainErr)

    model = LogisticRegressionWithSGD.train(
        trainingData, iterations=bestNumIterVal, regParam=bestRegParamVal,
        step=bestStepSizeVal, regType=bestRegTypeVal)

    # Evaluating the model on training data
    print(error_rate(model, trainingData, trainingSize))

    # Evaluating the model on test data
    print(error_rate(model, testData, testSize))
def LR_train(data):
    """Train a 10-iteration SGD logistic regression and report its accuracy.

    The input is passed through ``split_data``; every resulting record is
    turned into a ``LabeledPoint`` using element 1 as the label and the last
    element as the features. The points are split 80/20 with seed 0.

    Returns:
        Tuple ``(model_LR, accuracy)``, with accuracy measured on the 20%
        split.
    """
    data_train = split_data(data)
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)

    model_LR = LogisticRegressionWithSGD.train(training, 10)
    predictionAndlabel = test.map(
        lambda x: (float(model_LR.predict(x.features)), x.label))

    # Index the pair instead of unpacking it in the lambda signature: tuple
    # parameters are a SyntaxError on Python 3.
    matches = predictionAndlabel.filter(
        lambda pair: pair[0] == pair[1]).count()
    accuracy = 1.0 * matches / test.count()
    print("accuracy of model_LR:%f" % accuracy)
    return model_LR, accuracy
def main():
    """Driver program for a spam filter using Spark and MLlib.

    Consolidates the individual email files, hashes every line of the
    resulting spam and ham files into a 10,000-feature vector, trains a
    logistic regression on a 70% split, prints the error rate measured on
    the remaining 30% and pickles the model to ``spamFilter.pkl``.
    """
    # Consolidate the individual email files into a single spam file
    # and a single ham file
    makeDataFileFromEmails("data/spam_2/", "data/spam.txt")
    makeDataFileFromEmails("data/easy_ham_2/", "data/ham.txt")

    # Create the Spark Context for parallel processing
    sc = SparkContext(appName="Spam Filter")
    try:
        # Load the spam and ham data files into RDDs
        spam = sc.textFile("data/spam.txt")
        ham = sc.textFile("data/ham.txt")

        # Create a HashingTF instance to map email text to vectors of 10,000 features.
        tf = HashingTF(numFeatures=10000)

        # Each email is split into words, and each word is mapped to one feature.
        spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
        hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

        # Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
        positiveExamples = spamFeatures.map(
            lambda features: LabeledPoint(1, features))
        negativeExamples = hamFeatures.map(
            lambda features: LabeledPoint(0, features))

        # Combine positive and negative datasets into one
        data = positiveExamples.union(negativeExamples)

        # Split the data into 70% for training and 30% test data sets
        (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # Cache the training data to optimize the Logistic Regression
        trainingData.cache()

        # Train the model with Logistic Regression using the SGD algorithm.
        model = LogisticRegressionWithSGD.train(trainingData)

        # Create tuples of actual and predicted values
        labels_and_predictions = testData.map(
            lambda email: (email.label, model.predict(email.features)))

        # Calculate the error rate as number wrong / total number.
        # The pair is indexed instead of unpacked in the lambda signature:
        # tuple parameters are a SyntaxError on Python 3.
        error_rate = labels_and_predictions.filter(
            lambda pair: pair[0] != pair[1]).count() / float(testData.count())

        print("*********** SPAM FILTER RESULTS **********")
        print("\n")
        print("Error Rate: " + str(error_rate))
        print("\n")

        # Serialize the model for persistence; the context manager closes the file
        with open("spamFilter.pkl", "wb") as model_file:
            pickle.dump(model, model_file)
    finally:
        # Release the Spark context even when a step above fails.
        sc.stop()
def task2():
    """Classify titles as programming or non-programming and print them.

    Titles are taken from the lines of the module-level ``data`` that
    contain ``title: ``. A logistic regression is trained on hashed word
    features from the ``fileProgramming`` (positive) and ``fileOther``
    (negative) text files. All non-programming titles are printed first,
    then all programming titles, followed by the elapsed time.
    """
    print("-------------------------------------------")
    startTitle = time.time()

    regex1 = re.compile(".*(title:).*")
    find1 = [m.group(0) for m in (regex1.search(l) for l in data) if m]
    # Lines matching "title:" without the following space have no text
    # after the split; skip them instead of raising IndexError.
    title = [i.split('title: ', 1)[1] for i in find1 if 'title: ' in i]

    Programming = sc.textFile(fileProgramming)
    Other = sc.textFile(fileOther)

    # Create a HashingTF instance to map title text to vectors of 100,000 features.
    tf = HashingTF(numFeatures=100000)

    # Each title is split into words, and each word is mapped to one feature.
    programmingFeatures = Programming.map(
        lambda title: tf.transform(title.split(" ")))
    otherFeatures = Other.map(lambda title: tf.transform(title.split(" ")))

    # Create LabeledPoint datasets for positive (programming) and negative (other) examples.
    positiveExamples = programmingFeatures.map(
        lambda features: LabeledPoint(1, features))
    negativeExamples = otherFeatures.map(
        lambda features: LabeledPoint(0, features))
    trainingData = positiveExamples.union(negativeExamples)
    trainingData.cache()

    # Run Logistic Regression using the SGD algorithm.
    model = LogisticRegressionWithSGD.train(trainingData)

    # Keep the two classes in separate lists. Searching the joined string
    # for the label text misfiled titles that contain that text themselves.
    programmingResults = []
    otherResults = []
    for row in title:
        test = tf.transform(row.split(" "))
        if model.predict(test) == 1:
            programmingResults.append(row + " = " + "Programmings")
        else:
            otherResults.append(row + " = " + "Non-Programming")

    for line in otherResults:
        print(line)
    for line in programmingResults:
        print(line)

    endTitle = time.time()
    elapsedTitle = endTitle - startTitle
    print(elapsedTitle)
    print("-------------------------------------------")
def test_classification(self):
    """Check linear, Bayes and tree-based MLlib classifiers on list features.

    Every model is trained on the same four labelled points and must
    predict a value <= 0 for the points labelled 0.0 and > 0 for the points
    labelled 1.0.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1]),
    ]
    rdd = self.sc.parallelize(data)
    features = [point.features.tolist() for point in data]

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories

    # Deferred trainers: each model is built only when its turn comes, so
    # the train/assert interleaving matches the straight-line version.
    trainers = [
        lambda: LogisticRegressionWithSGD.train(rdd),
        lambda: SVMWithSGD.train(rdd),
        lambda: NaiveBayes.train(rdd),
        lambda: DecisionTree.trainClassifier(
            rdd, numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo),
        lambda: RandomForest.trainClassifier(
            rdd, numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100),
        lambda: GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo),
    ]
    for build in trainers:
        trained = build()
        self.assertTrue(trained.predict(features[0]) <= 0)
        self.assertTrue(trained.predict(features[1]) > 0)
        self.assertTrue(trained.predict(features[2]) <= 0)
        self.assertTrue(trained.predict(features[3]) > 0)
def update(rdd):
    """Retrain ``self._model`` on ``rdd``, warm-starting from its weights.

    ``self`` comes from the enclosing scope. Empty RDDs are skipped.
    """
    # LogisticRegressionWithSGD.train raises an error for an empty RDD.
    if rdd.isEmpty():
        return
    self._model = LogisticRegressionWithSGD.train(
        rdd,
        self.numIterations,
        self.stepSize,
        self.miniBatchFraction,
        self._model.weights,
        regParam=self.regParam,
        convergenceTol=self.convergenceTol)
def main(sc):
    """Train a tiny logistic regression, then save, reload and re-query it.

    Predictions for the two probe vectors are printed from the freshly
    trained model and again from the copy loaded from ``lrsgd``.

    Args:
        sc: SparkContext used for training, saving and loading.
    """
    data = [
        LabeledPoint(0.0, [0.0, 1.0]),
        LabeledPoint(1.0, [1.0, 0.0]),
    ]
    probes = ([1.0, 0.0], [0.0, 1.0])

    lrm = LogisticRegressionWithSGD.train(sc.parallelize(data), iterations=10)
    for probe in probes:
        print(lrm.predict(probe))

    # Save and load model
    lrm.save(sc, "lrsgd")
    sameModel = LogisticRegressionModel.load(sc, "lrsgd")
    for probe in probes:
        print(sameModel.predict(probe))
def ml_lost():
    """Fit SVM and logistic-regression models for user churn; print weights.

    One ``LabeledPoint`` is built per user in ``role_detail_dict`` (users
    with ``roleid == 0`` are skipped) from the normalised attribute list.
    The learned weight of every attribute is printed for both models,
    followed by the overall lost rate.
    """
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.classification import SVMWithSGD
    from pyspark.mllib.classification import LogisticRegressionWithSGD
    from pyspark import SparkContext

    lost_array = []
    attr_list = get_attr_list()
    print("get attr list=%s" % str(attr_list))
    lost_sum = 0
    for user in role_detail_dict.values():
        if user.roleid == 0:
            continue
        # list of the individual attribute values of this user
        ratio_array = normalize_detail(user, attr_list)
        # MLlib's binary classifiers validate that labels are 0 or 1, so a
        # lost user is encoded as 0; the former -1 is rejected at training.
        unlost_value = 1
        if user.is_lost:
            unlost_value = 0
            lost_sum += 1
        lost_array.append(LabeledPoint(unlost_value, ratio_array))

    sc = SparkContext(appName="lost_statis")
    try:
        sc.setLogLevel('ERROR')
        parall = sc.parallelize(lost_array)

        svm = SVMWithSGD.train(parall, iterations=10)
        print(svm)
        svm_weight = list(svm.weights)
        print("======svm weight len==%d" % len(svm_weight))
        # Pair every attribute with the weight at the same position.
        svm_weight_dict = dict(zip(attr_list, svm_weight))
        print(svm_weight_dict)

        lrm = LogisticRegressionWithSGD.train(parall, iterations=10)
        print(lrm)
        lrm_weight = list(lrm.weights)
        print("======lrm weight len==%d" % len(lrm_weight))
        lrm_weight_dict = dict(zip(attr_list, lrm_weight))
        print(lrm_weight_dict)

        all_detail_user = len(role_detail_dict) - 1
        # `create_role_dict.__dict__()` could never be called; the size of
        # the mapping is reported instead.
        # NOTE(review): assumes create_role_dict supports len() - confirm.
        print("lost_rate=%f:all user num=%d:create_role_dict=%d" % (
            lost_sum / float(all_detail_user), all_detail_user,
            len(create_role_dict)))
    finally:
        # Release the Spark context even when training or reporting fails.
        sc.stop()
def TrainLRModel(trainData, iterations, step, miniBatchFraction):
    """Standardise the features and train an SGD logistic regression.

    The features are centred and scaled to unit standard deviation with
    ``StandardScaler`` and re-paired with their labels before training. The
    first raw and the first scaled feature vector are printed.

    Args:
        trainData: RDD whose elements expose ``label`` and ``features``.
        iterations: Number of SGD iterations.
        step: SGD step size.
        miniBatchFraction: Mini-batch fraction.

    Returns:
        The model trained on the scaled data.
    """
    # Logistic Regression
    srcFeatures = trainData.map(lambda line: line.features)
    print(srcFeatures.first())

    scaler = StandardScaler(withMean=True, withStd=True).fit(srcFeatures)
    srcLabel = trainData.map(lambda line: line.label)
    scaledFeature = scaler.transform(srcFeatures)
    print(scaledFeature.first())

    scaledData = srcLabel.zip(scaledFeature)
    # Index the pair instead of unpacking it in the lambda signature: tuple
    # parameters are a SyntaxError on Python 3.
    trainData = scaledData.map(lambda pair: LabeledPoint(pair[0], pair[1]))

    model = LogisticRegressionWithSGD.train(
        data=trainData, iterations=iterations, step=step,
        miniBatchFraction=miniBatchFraction)
    return model
def trainEvaluateModel(trainData, validationData, numIterations, stepSize, miniBatchFraction):
    """Train a logistic regression with SGD and report its validation AUC.

    Args:
        trainData: Training data passed to the trainer.
        validationData: Data passed to ``evaluateModel`` with the model.
        numIterations: Number of SGD iterations.
        stepSize: SGD step size.
        miniBatchFraction: Mini-batch fraction.

    Returns:
        Tuple ``(AUC, duration, numIterations, stepSize, miniBatchFraction,
        model)``; ``duration`` covers training plus evaluation.
    """
    startTime = time()
    model = LogisticRegressionWithSGD.train(
        trainData, numIterations, stepSize, miniBatchFraction)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    # print() with a single argument behaves identically on Python 2 and 3.
    print("訓練評估:使用參數" +
          " numIterations=" + str(numIterations) +
          " stepSize=" + str(stepSize) +
          " miniBatchFraction=" + str(miniBatchFraction) +
          " 所需時間=" + str(duration) +
          " 結果AUC = " + str(AUC))
    return (AUC, duration, numIterations, stepSize, miniBatchFraction, model)
def TrainEvaluateModel(trainData, validationData, numIterations, stepSize, miniBatchFraction):
    """Train a logistic regression with SGD and report its validation AUC.

    Args:
        trainData: Training data passed to the trainer.
        validationData: Data passed to ``EvaluateModel`` with the model.
        numIterations: Number of SGD iterations.
        stepSize: SGD step size.
        miniBatchFraction: Mini-batch fraction.

    Returns:
        Tuple ``(AUC, duration, numIterations, stepSize, miniBatchFraction,
        model)``; ``duration`` covers training plus evaluation.
    """
    startTime = time()
    model = LogisticRegressionWithSGD.train(
        trainData, numIterations, stepSize, miniBatchFraction)
    AUC = EvaluateModel(model, validationData)
    duration = time() - startTime
    # " stepSize=" now carries the "=" the other fields already had; the
    # value used to be glued directly onto the name.
    print("Evaluate the model: use the params: " +
          "numIterations=" + str(numIterations) +
          " stepSize=" + str(stepSize) +
          " miniBatchFraction=" + str(miniBatchFraction) + "\n" +
          "====> duration time = " + str(duration) +
          " result AUC = " + str(AUC))
    return (AUC, duration, numIterations, stepSize, miniBatchFraction, model)
def trainEvaluationModel(trainData, validationData, numIterations, stepSize, maxBatchFraction):
    """Train a logistic regression with SGD and report its validation AUC.

    Args:
        trainData: Training data passed to the trainer.
        validationData: Data passed to ``evaluateModel`` with the model.
        numIterations: Number of stochastic-gradient-descent iterations.
        stepSize: Gradient-descent step size.
        maxBatchFraction: Fraction of samples used per iteration; per the
            original author's note, a value between 0 and 1, default 1.

    Returns:
        Tuple ``(AUC, duration, numIterations, stepSize, maxBatchFraction,
        model)``; ``duration`` covers training plus evaluation.
    """
    began = time()
    model = LogisticRegressionWithSGD.train(
        trainData, numIterations, stepSize, maxBatchFraction)
    AUC = evaluateModel(model, validationData)
    duration = time() - began

    fragments = [
        "训练评估:使用参数 ",
        " numIterations = ", str(numIterations),
        " stepSize = ", str(stepSize),
        " maxBatchFraction = ", str(maxBatchFraction),
        " ==> 所需时间 = ", str(duration),
        " 秒",
        " 结果 AUC = ", str(AUC),
    ]
    print("".join(fragments))
    return AUC, duration, numIterations, stepSize, maxBatchFraction, model
def trainEvaluateModel(trainData, validationData, numIterations, stepSize, miniBatchFraction):
    """Train a logistic regression with SGD and report its validation AUC.

    Returns:
        Tuple ``(AUC, duration, numIterations, stepSize, miniBatchFraction,
        model)``; ``duration`` covers training plus evaluation.
    """
    began = time()
    # Original author's note: this API is outdated; remember to switch to
    # the newer one.
    trainer_options = dict(
        data=trainData,
        iterations=numIterations,
        step=stepSize,
        miniBatchFraction=miniBatchFraction,
    )
    model = LogisticRegressionWithSGD.train(**trainer_options)
    AUC = evaluateModel(model, validationData)
    duration = time() - began

    print("训练评估使用参数:\n",
          "numIterations=", numIterations,
          "\n stepSize=", stepSize,
          "\n miniBatchFraction=", miniBatchFraction,
          "====>用时=", duration,
          "\n 结果AUC=", AUC)

    outcome = (AUC, duration, numIterations, stepSize, miniBatchFraction, model)
    return outcome
def getLogisticRegressionModel(Train_Data):
    """Train an SGD logistic regression with a fixed set of hyperparameters.

    Uses 10 iterations, step size 10.0, mini-batch fraction 0.1, L2
    regularisation with parameter 1e-6, and fits an intercept.

    Returns:
        The trained model.
    """
    hyperparameters = {
        'iterations': 10,
        'miniBatchFraction': 0.1,
        'step': 10.,
        'regParam': 1e-6,
        'regType': 'l2',
        'intercept': True,
    }
    return LogisticRegressionWithSGD.train(data=Train_Data, **hyperparameters)
def main(input_file_path):
    """Train on labelled rows and write predictions for the unlabelled rows.

    Rows whose fourth comma-separated field is non-empty (excluding the
    ``INDEX`` header row) form the training set; rows whose fourth field is
    empty are the unseen set. Training accuracy is printed and one
    ``INDEX,GENDER`` line per unseen row is written to ``result.csv``.

    Args:
        input_file_path: Path of the CSV file read through ``sc.textFile``.
    """
    print('=====>>>>>')
    print('ddd')
    data = sc.textFile(input_file_path)
    traning_data_RDD = data.filter(
        lambda line: line.split(',')[3] != '' and line.split(',')[0] != 'INDEX')
    unseen_data_RDD = data.filter(lambda line: line.split(',')[3] == '')

    traning_data_pddf = create_pddf(traning_data_RDD)
    traning_data_df = sqlContext.createDataFrame(traning_data_pddf)
    print(traning_data_df.head())

    parsed_data = rdd_to_labeled_point(traning_data_df.rdd)
    parsed_data.persist()
    # Correct print: [LabeledPoint(1.0, [1.0,8.6662186586,6.98047693487])]

    logisticRegressionWithSGD = LogisticRegressionWithSGD.train(
        parsed_data, iterations=100)
    labels_and_preds = parsed_data.map(
        lambda lp: [lp.label, logisticRegressionWithSGD.predict(lp.features)])
    Accuracy = labels_and_preds.filter(
        lambda ele: int(ele[0]) == int(ele[1])
    ).count() / float(parsed_data.count())
    print("Training Accuracy on training data = " + str(Accuracy))

    unseen_data_pddf = create_pddf(unseen_data_RDD)
    unseen_data_df = sqlContext.createDataFrame(unseen_data_pddf)
    unseen_parsed_data = rdd_to_index_featurs(unseen_data_df.rdd)
    unseen_parsed_data.persist()

    # The context manager closes the result file; it used to be left open,
    # so buffered rows could be lost. Local names no longer shadow the
    # `file` builtin or rebind the `data` RDD.
    with open(
            '/Users/1002720/Documents/workspace/SNU-project/data/BDA2Project/1-GenderPrediction/result.csv',
            'w', encoding='utf-8') as result_file:
        result_file.write('INDEX,GENDER\n')
        for row in unseen_parsed_data.collect():
            # The written value is the predicted class plus one.
            result_file.write(
                str(row[0]) + ',' +
                str(logisticRegressionWithSGD.predict(row[1]) + 1) + '\n')

    parsed_data.unpersist()
    unseen_parsed_data.unpersist()
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
def getLogisticRegressionModel(Train_Data):
    """Train an SGD logistic regression with a fixed set of hyperparameters.

    Uses 10 iterations, step size 10.0, mini-batch fraction 0.1, L2
    regularisation with parameter 1e-6, and fits an intercept.

    Returns:
        The trained model.
    """
    hyperparameters = {
        'iterations': 10,
        'miniBatchFraction': 0.1,
        'step': 10.,
        'regParam': 1e-6,
        'regType': 'l2',
        'intercept': True,
    }
    return LogisticRegressionWithSGD.train(data=Train_Data, **hyperparameters)
def logisticRegression(trainingRDD, trainingRDDHashed, testRDDHashed, iterations, minibatch, stepsize):
    """Train a logistic regression and report accuracy and F-score.

    Args:
        trainingRDD: RDD used to fit the model.
        trainingRDDHashed: RDD of pairs; element 0 is the expected label and
            element 1 the features handed to ``predict``.
        testRDDHashed: RDD with the same layout, used as the test set.
        iterations: Number of SGD iterations.
        minibatch: Mini-batch fraction.
        stepsize: SGD step size.

    Returns:
        Tuple ``(AccuracyV, fScoreV, AccuracyT, fScoreT)`` as computed by
        ``getAccuracy`` for the training-hashed and test-hashed sets.
    """
    # Train a logistic regression model
    trainedModel = LogisticRegressionWithSGD.train(
        trainingRDD, iterations=iterations, miniBatchFraction=minibatch,
        regType="l2", intercept=True, regParam=0.1, step=stepsize)

    def tally(hashedRDD):
        # Count the outcomes: every (label, prediction) pair is mapped to a
        # state by checkState and the states are summed up.
        counts = (hashedRDD
                  .map(lambda l_v: ((l_v[0], trainedModel.predict(l_v[1])), 1))
                  .map(lambda x_y: (checkState(x_y[0]), x_y[1]))
                  .reduceByKey(add)
                  .collectAsMap())
        # States that never occurred read as 0.
        return defaultdict(int, counts)

    # Test on Validation and Test Sets
    resultsValidation = tally(trainingRDDHashed)
    resultsTest = tally(testRDDHashed)

    # Get Counts
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()

    # Get F-Score and Accuracy Values
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)

    # Print Results. The % formatting is applied to the string before it is
    # printed; `print(...) % (...)` evaluates `None % tuple` on Python 3 and
    # raises TypeError.
    print(' Results for Logistic Regression')
    print(' Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print(' Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))

    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
def main(sc):
    """Train on the Criteo training set and report validation/test metrics.

    For the validation and the test file, accuracy, false-positive rate and
    area under ROC are computed, the growing report is printed after every
    step, and the final report is saved as a text file named ``str``.

    Args:
        sc: SparkContext used to read the inputs and save the report.
    """
    train_data = sc.textFile(
        "/data/scratch/vw/criteo-display-advertising-dataset/train.txt").map(
            parsePoint)
    model = LogisticRegressionWithSGD.train(
        train_data, iterations=1000, miniBatchFraction=0.0001, step=.001,
        regType="l2")

    def report(name, dataset, output):
        # Append the metrics of one dataset to `output` and return it.
        # Pairs are (prediction, label) and are indexed because tuple
        # parameters are a SyntaxError on Python 3.
        labelsAndPreds = dataset.map(
            lambda p: (float(model.predict(p.features)), p.label))
        # Scanned several times below; caching avoids re-running predict().
        labelsAndPreds.cache()

        Accuracy = labelsAndPreds.filter(
            lambda pl: pl[1] == pl[0]).count() / float(dataset.count())
        FP = labelsAndPreds.filter(
            lambda pl: pl[1] == 0 and pl[0] == 1).count()
        N = float(labelsAndPreds.filter(lambda pl: pl[1] == 0).count())
        # Without negative examples the rate is undefined, not a crash.
        FPR = FP / N if N else float("nan")

        separator = "\n" if output else ""
        output += (separator + "Accuracy " + name + " = " + str(Accuracy) +
                   "\nFPR " + name + " = " + str(FPR))
        print(output)

        metrics = BinaryClassificationMetrics(labelsAndPreds)
        # The leading newline keeps this line from being glued to the FPR value.
        output += "\nArea under ROC " + name + " = " + str(metrics.areaUnderROC)
        print(output)

        labelsAndPreds.unpersist()
        return output

    valid_data = sc.textFile("input/valid_data.txt").map(parsePoint)
    output = report("valid", valid_data, "")

    test_data = sc.textFile(
        "/data/scratch/vw/criteo-display-advertising-dataset/test.txt").map(
            parsePoint)
    output = report("test", test_data, output)

    output = sc.parallelize([output])
    output.saveAsTextFile("str")
def train_trend_model(self, model, data, i):
    """Train the direction model chosen by ``self.trend_prediction_method``.

    Args:
        model: Previously trained model or ``None``. Its ``weights`` seed
            the logistic regression and SVM trainers when it is not ``None``.
        data: Local collection that is parallelised into an RDD.
        i: Not used in this method.

    Returns:
        The newly trained model, or ``model`` unchanged when the configured
        method matches none of the known constants.
    """
    self.logger.info('Start to train the direction model')
    rdd_data = self.sc.parallelize(data)
    method = self.trend_prediction_method

    def train_with_sgd(trainer):
        # Warm-start from the previous model's weights when one exists.
        start = None if model is None else model.weights
        return trainer.train(rdd_data, iterations=10000, step=0.001,
                             initialWeights=start)

    if method == self.RANDOM_FOREST:
        return RandomForest.trainClassifier(rdd_data,
                                            numClasses=2,
                                            categoricalFeaturesInfo={},
                                            numTrees=40,
                                            featureSubsetStrategy="auto",
                                            impurity='gini',
                                            maxDepth=20,
                                            maxBins=32)
    if method == self.NAIVE_BAYES:
        return NaiveBayes.train(rdd_data)
    if method == self.LOGISTIC_REGRESSION:
        return train_with_sgd(LogisticRegressionWithSGD)
    if method == self.SVM:
        return train_with_sgd(SVMWithSGD)
    return model
def predict_LogisticRegressionWithSGD(iterations, step, regParam, regType):
    """Train ``LogisticRegressionWithSGD`` and return its training accuracy.

    The model is fitted on, and evaluated against, the module-level RDD
    ``scaledData`` (an RDD of points exposing ``label`` and ``features``).

    Args:
        iterations: Number of SGD iterations.
        step: SGD step size.
        regParam: Regularisation parameter.
        regType: Regulariser type forwarded to the trainer (e.g. ``'l2'``).

    Returns:
        Fraction of ``scaledData`` points whose predicted label equals the
        actual label.
    """
    lrModel = LogisticRegressionWithSGD.train(scaledData,
                                              iterations=iterations,
                                              step=step,
                                              regParam=regParam,
                                              regType=regType)
    labelsAndPreds = scaledData.map(
        lambda p: (p.label, lrModel.predict(p.features)))
    # Index into the pair: tuple-parameter lambdas are Python-2-only syntax.
    correct = labelsAndPreds.filter(lambda pair: pair[0] == pair[1]).count()
    # Divide by the size of the RDD that was actually scored.
    lrAccuracy = correct * 1.0 / scaledData.count()
    return lrAccuracy
def main(input_file_path):
    """Fit a logistic regression on labelled rows and score the unlabelled.

    Rows whose fourth comma-separated field is non-empty (excluding the
    ``INDEX`` header row) are used for training; rows whose fourth field is
    empty are scored and written to ``result.csv`` as ``INDEX,GENDER``.

    Args:
        input_file_path: Path of the comma-separated input file.
    """
    print('=====>>>>>')
    print('ddd')
    data = sc.textFile(input_file_path)
    traning_data_RDD = data.filter(
        lambda line: line.split(',')[3] != '' and line.split(',')[0] != 'INDEX')
    unseen_data_RDD = data.filter(lambda line: line.split(',')[3] == '')

    traning_data_pddf = create_pddf(traning_data_RDD)
    traning_data_df = sqlContext.createDataFrame(traning_data_pddf)
    print(traning_data_df.head())

    parsed_data = rdd_to_labeled_point(traning_data_df.rdd)
    parsed_data.persist()
    # Correct print: [LabeledPoint(1.0, [1.0,8.6662186586,6.98047693487])]

    logisticRegressionWithSGD = LogisticRegressionWithSGD.train(
        parsed_data, iterations=100)

    labels_and_preds = parsed_data.map(
        lambda lp: [lp.label, logisticRegressionWithSGD.predict(lp.features)])
    Accuracy = labels_and_preds.filter(
        lambda ele: int(ele[0]) == int(ele[1])).count() / float(parsed_data.count())
    print("Training Accuracy on training data = " + str(Accuracy))

    unseen_data_pddf = create_pddf(unseen_data_RDD)
    unseen_data_df = sqlContext.createDataFrame(unseen_data_pddf)
    unseen_parsed_data = rdd_to_index_featurs(unseen_data_df.rdd)
    unseen_parsed_data.persist()

    result_path = '/Users/1002720/Documents/workspace/SNU-project/data/BDA2Project/1-GenderPrediction/result.csv'
    # ``with`` guarantees the file is flushed and closed even if a
    # prediction raises part-way through.
    with open(result_path, 'w', encoding='utf-8') as result_file:
        result_file.write('INDEX,GENDER\n')
        for row in unseen_parsed_data.collect():
            # NOTE(review): the +1 presumably maps the 0/1 prediction onto
            # the dataset's gender codes -- confirm.
            prediction = logisticRegressionWithSGD.predict(row[1]) + 1
            result_file.write(str(row[0]) + ',' + str(prediction) + '\n')

    # print(labels_and_preds.collect())
    parsed_data.unpersist()
    unseen_parsed_data.unpersist()
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
def test_classification(self):
    """Check four classifiers on a tiny dataset built from scipy matrices.

    Every trained model must predict ``<= 0`` for the two points labelled
    0.0 and ``> 0`` for the two points labelled 1.0.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree

    samples = [
        (0.0, {0: 1.0}),
        (1.0, {1: 1.0}),
        (0.0, {0: 2.0}),
        (1.0, {1: 2.0}),
    ]
    data = [LabeledPoint(label, self.scipy_matrix(2, entries))
            for label, entries in samples]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    def check(model):
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories

    # Each model is trained immediately before it is checked.
    builders = (
        lambda: LogisticRegressionWithSGD.train(rdd),
        lambda: SVMWithSGD.train(rdd),
        lambda: NaiveBayes.train(rdd),
        lambda: DecisionTree.trainClassifier(
            rdd, numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo),
    )
    for build in builders:
        check(build())
def anom_with_lr():
    """Rank unlabeled patients by predicted probability of being anomalous.

    Trains logistic regression on the anomalous patients plus a random
    sample of the remaining ones, reports accuracy / FPR / FNR on a held-out
    half, then scores the patients left out of the model and writes the
    10,000 highest-probability rows to CSV.

    Returns:
        The RDD of ``(features, predicted_prob, label)`` tuples for the
        left-out patients. If the pipeline fails, the traceback is printed
        and whatever was assigned so far is returned (``None`` when the
        failure happened before the first assignment).
    """
    # Bound up front so the final ``return`` cannot raise UnboundLocalError
    # when the try-block fails early.
    for_finding_more = None
    try:
        plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv")  # 69.2 MB
        pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep=",")

        anom = pat_proc.filter(pat_proc.is_anomalous == 1)
        benign = pat_proc.filter(pat_proc.is_anomalous == 0)
        n_benign = benign.count()

        # Take a random sample of 50K from the unlabeled 100K.
        sqlContext.registerFunction("my_random", lambda x: x - x + random())
        sqlContext.registerDataFrameAsTable(benign, "benign")
        benign = sqlContext.sql("SELECT *, my_random(is_anomalous) as random_number FROM benign")
        # Must be a true ratio: integer division would floor it to 0.
        threshold = 50000 / float(n_benign)
        into_model = benign.filter(benign.random_number <= threshold)
        for_finding_more = benign.filter(benign.random_number > threshold)
        for_modeling = anom.unionAll(into_model.drop(into_model.random_number))
        for_finding_more = for_finding_more.drop(for_finding_more.random_number)
        # Try to pull this from a much larger sample, or the entire data,
        # because the ones with lowest probabilities, among the selected
        # 10,000, have probabilities around 0.05.
        print("anom.count() = " + str(anom.count()) +
              ", benign.count() = " + str(benign.count()) +
              ", into_model.count() = " + str(into_model.count()) +
              ", for_modeling.count() = " + str(for_modeling.count()) +
              ", for_finding_more.count() = " + str(for_finding_more.count()))

        all_columns = for_modeling.columns
        features = [x for x in all_columns if (x not in ["patient_id", "is_anomalous"])]
        # Listed as categorical only because the procedure features have
        # 0-1 values anyway.
        categorical_features = ["age_group", "gender", "income_range"]
        procedure_features = [x for x in features if (x not in categorical_features)]

        # Unlike decision tree, logistic regression does not need the map
        # categoricalFeaturesInfo, just an RDD of LabeledPoint objects.
        # dict_cat_features maps each categorical feature name to
        # [sequential feature id, number of distinct values, sorted list of
        # the distinct values].
        dict_cat_features = {}
        for cat_feature_number, feature in enumerate(categorical_features):
            agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect()
            # Each row holds the single selected column.
            distinct_values = sorted(
                row[0].encode('ascii', 'ignore') for row in agvalues)
            dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]

        def to_point(x):
            return create_labeled_point(x, features, categorical_features,
                                        dict_cat_features, procedure_features)

        for_modeling = for_modeling.rdd
        # 4 partitions: the default should be the number of logical cores,
        # which is 8.
        print("for_modeling.getNumPartitions() = " + str(for_modeling.getNumPartitions()))
        (train, test) = for_modeling.randomSplit([0.5, 0.5])
        test_data_size = test.count()
        print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))

        training_data = train.map(to_point)
        print("training_data.count() = " + str(training_data.count()))

        t0 = time()
        # model = LogisticRegressionWithLBFGS.train(training_data)  # LBFGS took 66.766 seconds
        model = LogisticRegressionWithSGD.train(training_data)  # SGD took 69.261 seconds
        tt = time() - t0
        print("Classifier trained in {} seconds".format(round(tt, 3)))

        test_data = test.map(to_point)
        t0 = time()
        # Only builds a lazy RDD, which is why the timing below reports as
        # 0.0 seconds.
        predictions = model.predict(test_data.map(lambda p: p.features))
        tt = time() - t0
        print("Prediction made in {} seconds".format(round(tt, 3)))

        labelsAndPreds = test_data.map(lambda p: (p.label, model.predict(p.features)))
        test_accuracy = labelsAndPreds.filter(lambda vp: vp[0] == vp[1]).count() / float(test_data_size)
        fpr = (labelsAndPreds.filter(lambda vp: vp[0] == 0 and vp[1] == 1).count() /
               float(labelsAndPreds.filter(lambda vp: vp[0] == 0).count()))
        fnr = (labelsAndPreds.filter(lambda vp: vp[0] == 1 and vp[1] == 0).count() /
               float(labelsAndPreds.filter(lambda vp: vp[0] == 1).count()))
        # Test accuracy is 0.9057, fpr is 0.1634, fnr is 0.0282
        print("Test accuracy is {}, fpr is {}, fnr is {}".format(
            round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)))

        # Without a threshold predict() returns raw scores, not 0/1 labels.
        model.clearThreshold()
        # Go through .rdd explicitly, as done for for_modeling above.
        for_finding_more = for_finding_more.rdd.map(to_point)
        for_finding_more = for_finding_more.map(lambda p: (p.features, model.predict(p.features), p.label))
        try:
            # We perform an action here because otherwise the output will be
            # a PipelinedRDD.
            for_finding_more.first()
        except EOFError:
            print("EOF handled")

        # Reverse-sort the additional patients by their predicted
        # probabilities of being anomalous and take the top 10,000.
        df = sqlContext.createDataFrame(for_finding_more.collect(), ['features', 'predicted_prob', 'is_anom'])
        # The orderBy is lazy; it is triggered by the write below.
        df = df.orderBy(df.predicted_prob.desc())
        # Top one has probability of 0.86818, last one has probability 0.5928958
        df.select('is_anom', 'predicted_prob').limit(10000).write.format('com.databricks.spark.csv').save('file:///Users/blahiri/healthcare/data/cloudera_challenge/additional_10000_from_spark.csv')
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return for_finding_more
return log_loss # In[10]: # try fixed hyperparameters numIters = 500 stepSize = 1 regParam = 1e-6 regType = 'l2' includeIntercept = True model0 = LogisticRegressionWithSGD.train(rawTrainData, iterations=numIters, step=stepSize, miniBatchFraction=1.0, initialWeights=None, regParam=regParam, regType=regType, intercept=includeIntercept) print model0.weights, model0.intercept # In[11]: classOneFracTrain = (rawTrainData.map(lambda x: x.label) .reduce(lambda x, y: x+y))/rawTrainData.count() print classOneFracTrain logLossTrBase = (rawTrainData.map(lambda x: x.label) .map(lambda x: computeLogLoss(classOneFracTrain, x)) .reduce(lambda x, y: x+y))/rawTrainData.count()
label = 0 values = [x if x < genre else x-1 for x in values] #shift the attributes by one index ones = [] ones = [1] * len(values) return LabeledPoint(label, SparseVector(column_num-1, values, ones)) #set hdfs path data = sc.sequenceFile("hdfs://nameservice1/user/geap/warehouse/camus/etl/rat/hourly/2015/06/01/00/*") data = sc.sequenceFile("hdfs://localhost:9000/test/*") parsedData = data.filter(filterPoint).map(parsePoint).reduceByKey(lambda x, y : x + y).map(lambda (k, v) : list(set(v))) parsedData.cache() #Calculate total number of columns in the dataset column_num = parsedData.flatMap(lambda _ : _ ).distinct().count() column_id = parsedData.flatMap(lambda _ : _ ).distinct().collect() column_id.sort() #choose a genre to test, default is 100th column as target variable genre = 1 sortedData = parsedData.map(sortPoint) labeledData = sortedData.map(lambda line : (line, genre)).map(labelData) LRSGDmodel = LogisticRegressionWithSGD.train(labeledData) print LRSGDmodel.weights
def test_classification(self):
    """Check seven classifiers on a tiny dataset, plus tree save/load.

    Every trained model must predict ``<= 0`` for the two points labelled
    0.0 and ``> 0`` for the two points labelled 1.0. The tree-based models
    are additionally saved and reloaded, and their debug strings compared.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel

    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_predictions(model):
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    def check_round_trip(model, model_cls, model_dir):
        model.save(self.sc, model_dir)
        reloaded = model_cls.load(self.sc, model_dir)
        self.assertEqual(reloaded.toDebugString(), model.toDebugString())

    temp_dir = tempfile.mkdtemp()
    # try/finally so the directory is removed even when an assertion fails.
    try:
        check_predictions(LogisticRegressionWithSGD.train(rdd, iterations=10))
        check_predictions(SVMWithSGD.train(rdd, iterations=10))
        check_predictions(NaiveBayes.train(rdd))

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories

        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
            maxBins=4)
        check_predictions(dt_model)
        check_round_trip(dt_model, DecisionTreeModel,
                         os.path.join(temp_dir, "dt"))

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10, maxBins=4, seed=1)
        check_predictions(rf_model)
        check_round_trip(rf_model, RandomForestModel,
                         os.path.join(temp_dir, "rf"))

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4)
        check_predictions(gbt_model)
        check_round_trip(gbt_model, GradientBoostedTreesModel,
                         os.path.join(temp_dir, "gbt"))
    finally:
        try:
            rmtree(temp_dir)
        except OSError:
            pass
print(BASE_DATA_PATH) conf = (SparkConf().setMaster("local[2]").setAppName("Testing MLLib With DataFrame SQL")) sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) # read the dataset df_test = sqlContext.read.format("com.databricks.spark.csv").options(delimiter=",").options(header="true").load( BASE_DATA_PATH + '/test.csv') training = df_test.map(lambda row: LabeledPoint(row.IsClick, [float(row.SearchID), float(row.AdID), float(row.Position), float(row.HistCTR), float(row.Price)])) (trainingData, testData) = training.randomSplit([0.7, 0.3]) model = LogisticRegressionWithSGD.train(trainingData,iterations = 100,step=0.4) # Build the model model1 = SVMWithSGD.train(trainingData, iterations=100) # Evaluate the model on training data model2 = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, numTrees=3, featureSubsetStrategy="auto",
OHETrainData = rawTrainData.map(lambda point: parseOHEPoint(point, ctrOHEDict, numCtrOHEFeats)) ##create train labeled points OHETrainData.cache() ##cache OHEValidationData = rawValidationData.map(lambda point: parseOHEPoint(point, ctrOHEDict, numCtrOHEFeats)) ##create validation labeled points OHEValidationData.cache() # running first model with fixed hyperparameters numIters = 50 stepSize = 10. regParam = 1e-6 regType = 'l2' includeIntercept = True print "-------------logistic regression with gradient descent---------" model0 = LogisticRegressionWithSGD.train(data=OHETrainData, iterations=numIters, step=stepSize,regParam=regParam, regType=regType, intercept=includeIntercept) ##train model sortedWeights = sorted(model0.weights) print "------------/logistic regression with gradient descent---------" def computeLogLoss(p, y): epsilon = 10e-12 if (p==0): p = p + epsilon elif (p==1): p = p - epsilon if y == 1: z = -log(p) elif y == 0:
.map(lambda lp: len(lp.features.indices)) .sum()) Test.assertEquals(numNZVal, 372080, 'incorrect number of features') # CTR预估和对数损失函数评估,引用MLlib API from pyspark.mllib.classification import LogisticRegressionWithSGD numIters = 50 stepSize = 10. regParam = 1e-6 regType = 'l2' includeIntercept = True model0 = LogisticRegressionWithSGD.train(OHETrainData,iterations=numIters,step=stepSize,regParam=regParam,regType=regType,intercept=includeIntercept) sortedWeights = sorted(model0.weights) print sortedWeights[:5], model0.intercept Test.assertTrue(np.allclose(model0.intercept, 0.56455084025), 'incorrect value for model0.intercept') Test.assertTrue(np.allclose(sortedWeights[0:5], [-0.45899236853575609, -0.37973707648623956, -0.36996558266753304, -0.36934962879928263, -0.32697945415010637]), 'incorrect value for model0.weights') # log损失 from math import log def computeLogLoss(p, y): epsilon = 10e-12 if y == 1 :
.map(lambda lp: len(lp.features.indices)) .sum()) Test.assertEquals(numNZVal, 372080, 'incorrect number of features') # ** CTR prediction and logloss evaluation ** from pyspark.mllib.classification import LogisticRegressionWithSGD # fixed hyperparameters numIters = 50 stepSize = 10. regParam = 1e-6 regType = 'l2' includeIntercept = True model0 = LogisticRegressionWithSGD.train(OHETrainData, numIters, stepSize, 1.0, None, regParam, regType, includeIntercept) sortedWeights = sorted(model0.weights) print sortedWeights[:5], model0.intercept # TEST Logistic regression Test.assertTrue(np.allclose(model0.intercept, 0.56455084025), 'incorrect value for model0.intercept') Test.assertTrue(np.allclose(sortedWeights[0:5], [-0.45899236853575609, -0.37973707648623956, -0.36996558266753304, -0.36934962879928263, -0.32697945415010637]), 'incorrect value for model0.weights') # ** Log loss ** from math import log def computeLogLoss(p, y):
def _split_fields(line):
    """Strip tuple parentheses, map 'None' to '0' and split on commas."""
    cleaned = str(line).replace('(', '').replace(')', '').replace('None', '0')
    return cleaned.split(',')


table1 = sc.textFile("/user/team322/junli_testFeature/*")


def f1(line):
    """Return the first field (the user id) of a test-feature line."""
    return _split_fields(line)[0]


# User ids of the rows to be scored, in file order.
user = table1.map(f1).collect()

result6 = sc.textFile("/user/team322/junli_trainFeature/*")


# Load and parse the data
def parsePoint(line):
    """Parse a training line: fields from index 2 on are label, features."""
    values = [float(x) for x in _split_fields(line)[2:]]
    return LabeledPoint(values[0], values[1:])


parsedData = result6.map(parsePoint)

# Build the model
model = LogisticRegressionWithSGD.train(parsedData)

result7 = sc.textFile("/user/team322/junli_testFeature/*")


def testParsePoint(line):
    """Parse a test line: fields from index 1 on; the first is the label slot."""
    values = [float(x) for x in _split_fields(line)[1:]]
    return LabeledPoint(values[0], values[1:])


parsedData2 = result7.map(testParsePoint)
# Use the model to predict parsedData2, then bring the results to the driver.
preds = parsedData2.map(lambda p: model.predict(p.features))
preds = preds.collect()

# Keep the users whose prediction is 1. This relies on ``user`` and
# ``preds`` being collected in the same order from the same input path.
userID = [uid for uid, pred in zip(user, preds) if pred == 1]

# Create a parallelized collection and save it.
sc.parallelize(userID).saveAsTextFile('/user/team322/solution_v')
t2 = time.ctime()
# One nullable string column per comma-separated name in ``title``.
all_types = [StructField(str(name), StringType(), True)
             for name in title.split(",")]
schema = StructType(all_types)

from pyspark.sql import Row
from pyspark.mllib.classification import LogisticRegressionWithSGD
from numpy import array
from pyspark.mllib.regression import LabeledPoint

D = 2 ** 24

# A usable row has an id, N_FEATURES feature fields and a trailing target.
N_COLUMNS = 1934
N_FEATURES = N_COLUMNS - 2


def helper1(r):
    """Turn one raw row into a LabeledPoint of hashed feature values.

    Each field between the first and last is hashed together with its
    position and reduced modulo ``D``; the last field is the target. Rows
    that fail to convert yield an all-zero point labelled 0.0.
    """
    try:
        fe = r[1:-1]
        # NOTE(review): the builtin hash() of str is salted per process on
        # Python 3 unless PYTHONHASHSEED is fixed, so these values may
        # differ between workers -- confirm the runtime.
        features = [float(abs(hash("VAR_" + str(i) + value))) % D
                    for i, value in enumerate(fe)]
        target = float(r[-1])
        # The id is not used, but a non-numeric id still sends the row to
        # the fallback below.
        float(r[0])
        return LabeledPoint(target, features)
    except Exception:
        # Deliberate best effort; ``Exception`` rather than a bare except so
        # KeyboardInterrupt/SystemExit still propagate.
        return LabeledPoint(0.0, [0.0] * N_FEATURES)


new_rdd = rdd.filter(lambda i: len(i) == N_COLUMNS)
df = new_rdd.map(helper1)
model = LogisticRegressionWithSGD.train(df)
df.take(1)
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD


def parsePoint(line):
    """
    Parse a line of text into an MLlib LabeledPoint object.

    The line holds space-separated numbers; the first is the label and the
    rest are the features. A label of -1 is converted to 0 for MLlib.
    """
    values = [float(s) for s in line.split(' ')]
    if values[0] == -1:
        values[0] = 0
    return LabeledPoint(values[0], values[1:])


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: logistic_regression <file> <iterations>", file=sys.stderr)
        # sys.exit is always available; the bare ``exit`` helper is only
        # installed by the ``site`` module.
        sys.exit(-1)

    sc = SparkContext(appName="PythonLR")
    try:
        points = sc.textFile(sys.argv[1]).map(parsePoint)
        iterations = int(sys.argv[2])
        model = LogisticRegressionWithSGD.train(points, iterations)
        print("Final weights: " + str(model.weights))
        print("Final intercept: " + str(model.intercept))
    finally:
        # Release the context even if parsing or training fails.
        sc.stop()
def main():
    """Compare four classifiers on the pickled, standardised dataset.

    Loads ``/pickleData``, standardises the features, splits 70/30 with a
    fixed seed, trains logistic regression (L-BFGS and SGD), a decision tree
    and a random forest, then prints each model's F1 and accuracy followed
    by the two logistic regression weight vectors.
    """
    appName = "BadOrGood;zl"
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            # The key is plural; "spark.executor.instance" is not a Spark
            # setting and was silently ignored.
            .set("spark.executor.instances", "3"))
    sc = SparkContext(conf=conf)
    try:
        hc = HiveContext(sc)

        # fetch data
        # filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
        # fetchDataToFile(hc, filepath)

        # load data
        # AllDataRawrdd = sc.pickleFile(filepath) \
        #     .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
        #     .repartition(10)
        AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)

        # standardizer for train and test data
        scaler = StandardScaler(True, True).fit(
            AllDataRawrdd.map(lambda _: Vectors.dense(_['feature'])))
        labels = AllDataRawrdd.map(lambda _: _['label'])
        featureTransformed = scaler.transform(
            AllDataRawrdd.map(lambda _: _['feature']))
        AllDataRawrdd = labels \
            .zip(featureTransformed) \
            .map(lambda _: {'label': _[0], 'feature': _[1]})

        # sampling
        trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(
            weights=[0.7, 0.3], seed=100)
        trainDatardd = trainDataRawrdd.map(
            lambda _: LabeledPoint(_['label'], _['feature'])).persist()
        testDatardd = testDataRawrdd.map(
            lambda _: {'label': _['label'],
                       'feature': list(_['feature'])}).persist()

        def score(trained):
            """Return (F1, accuracy) of ``trained`` on the test split."""
            resultrdd = test(trained, testDatardd)
            return fone(resultrdd), accuracy(resultrdd)

        # prediction & test
        lrmLBFGS = LogisticRegressionWithLBFGS.train(
            trainDatardd, iterations=3000, regParam=0.01, regType="l1")
        lrmLBFGSFone, lrmLBFGSac = score(lrmLBFGS)

        lrmSGD = LogisticRegressionWithSGD.train(
            trainDatardd, iterations=3000, step=0.1, regParam=0.01,
            regType="l1")
        lrmSGDFone, lrmSGDac = score(lrmSGD)

        dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
        dtFone, dtac = score(dt)

        rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
        rfFone, rfac = score(rf)

        print("LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac))
        print("LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac))
        print("Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac))
        print("Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac))

        print(lrmLBFGS.weights)
        print(lrmSGD.weights)
    finally:
        # Always release the cluster resources, even after a failure.
        sc.stop()
from pyspark.mllib.regression import LabeledPoint
from numpy import array
import parse

# Load and parse the data
#def parsePoint(line):
    # Creating vector(array) with first input as y and others as xi's
#    values = [float(x) for x in line.split(',')]
#    return LabeledPoint(values[10], values[0:9])

TRAIN_PATH = "/home/ayush/Data /Data for Machine Learning/UCI Adult Data Set/UCI adult.data"
TEST_PATH = "/home/ayush/Data /Data for Machine Learning/UCI Adult Data Set/UCI adult.test"

# Initialise the SparkContext.
sc = SparkContext("local[4]", "Logistic Regression")

# Read the training file into an RDD of strings, then map every line
# through parse.parsePoint to obtain the RDD the trainer consumes.
data = sc.textFile(TRAIN_PATH)
parsedData = data.map(parse.parsePoint)

# Build the model by passing the parsed RDD to the trainer's ``train``.
model = LogisticRegressionWithSGD.train(parsedData)

# Use model to create output
#model.predict().collect()  # "predict" needs an array argument

# Read the test data and parse it exactly like the training data.
Testdata = sc.textFile(TEST_PATH)
parsedTestData = Testdata.map(parse.parsePoint)

# Pair each test point's label with the model's prediction for its
# features.
labelsAndPreds = parsedTestData.map(
    lambda p: (p.label, model.predict(p.features)))
millis2 = int(round(time.time() * 1000))

# Print the (label, prediction) pairs for the test data.
print(labelsAndPreds.collect())
cutoff = float(nrock) / (nrock + nxrock)

# Keep every point labelled 1.0 and a random ``cutoff`` fraction of the
# others. (A union of labeledRock and labeledNotRock used to be assigned to
# this name first, but it was overwritten before ever being read.)
equalSampleData = labeledData.filter(
    lambda p: True if p.label == 1.0 else random.random() < cutoff)

# split data
# NOTE(review): randomSplit is called as a free function here; it must be
# defined elsewhere in the file -- confirm.
trainData, testData = randomSplit(equalSampleData, [0.9, 0.1])
trainData.map(lambda p: (p.label, p.features)).take(3)

# train model
model = LogisticRegressionWithSGD.train(trainData, intercept=False, iterations=10000)
# model = LinearRegressionWithSGD.train(trainData, step = 0.1, iterations=1000)
# model = SVMWithSGD.train(trainData, step=1, iterations=1000, intercept=True)

# evaluate model
# labelsAndPreds = testData.map(lambda p: (p.label, 1 if model.predict(p.features) > 0.5 else 0))
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))


def _hit_rate(pairs):
    """Return the fraction of (label, prediction) pairs that agree.

    An empty RDD yields 0.0 instead of raising ZeroDivisionError.
    """
    total = pairs.count()
    if not total:
        return 0.0
    return pairs.filter(lambda vp: vp[0] == vp[1]).count() / float(total)


accuracy = _hit_rate(labelsAndPreds)

# Precision: among points predicted 1, how many are labelled 1.
guess1 = labelsAndPreds.filter(lambda vp: vp[1] == 1)
precision1 = _hit_rate(guess1)

# Recall: among points labelled 1, how many are predicted 1.
act1 = labelsAndPreds.filter(lambda vp: vp[0] == 1)
recall1 = _hit_rate(act1)
def train(self, num_iterations=10):
    """Fit logistic regression with SGD and wrap the result.

    Args:
        num_iterations: Iteration count passed positionally to
            ``LogisticRegressionWithSGD.train``.

    Returns:
        A ``LogisticRegressionModel`` built from the fitted model and
        ``self.feature_cols``.
    """
    training_rdd = self._labeled_feature_vector_rdd()
    fitted = LogisticRegressionWithSGD.train(training_rdd, num_iterations)
    return LogisticRegressionModel(fitted, self.feature_cols)
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD

conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)

spam = sc.textFile("/home/sakib/spark-1.3.1/spark_workspace/data/spam.txt")
normal = sc.textFile("/home/sakib/spark-1.3.1/spark_workspace/data/ham.txt")

# HashingTF maps email text to vectors of 10,000 features.
tf = HashingTF(numFeatures=10000)


def _featurize(email):
    """Split an email on single spaces and hash the words into a vector."""
    return tf.transform(email.split(" "))


# Each email is split into words, and each word is mapped to one feature.
spamFeatures = spam.map(_featurize)
normalFeatures = normal.map(_featurize)

# Label spam as positive (1) and normal mail as negative (0).
positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
negativeExamples = normalFeatures.map(lambda features: LabeledPoint(0, features))

trainingData = positiveExamples.union(negativeExamples)
# Cached because logistic regression iterates over the data repeatedly.
trainingData.cache()

# Run logistic regression using the SGD algorithm.
model = LogisticRegressionWithSGD.train(trainingData)

# Try one spam-like and one normal-looking message. Both go through the
# same HashingTF transformation before the model is applied.
posTest = tf.transform("O M G GET cheap stuff by sending money to ...".split(" "))
negTest = tf.transform("Hi Dad, I started studying Spark the other ...".split(" "))
print("Prediction for positive test example: %g" % model.predict(posTest))
print("Prediction for negative test example: %g" % model.predict(negTest))
splits = parsedData.randomSplit((0.9, 0.1)) train_set = splits[0] train_set.cache() test_set = splits[1] test_set.cache() #NBmodel = NaiveBayes.train(train_set) #NB_socredLabel = numpy.array(test_set.map(lambda lp: (NBmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect()) #findCoveragePercent(NB_socredLabel, 0.4) SVMSGDmodel = SVMWithSGD.train(train_set) SVMSGDmodel.clearThreshold() SVM_scoredLabel = numpy.array(test_set.map(lambda lp: (SVMSGDmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect()) SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.4)) SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.8)) SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 1.0)) LRSGDmodel = LogisticRegressionWithSGD.train(train_set) LRSGDmodel.clearThreshold() LRSGD_scoedLabel = numpy.array(test_set.map(lambda lp: (LRSGDmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect()) LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.4)) LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.8)) LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 1.0)) LRLBFGSmodel = LogisticRegressionWithLBFGS.train(train_set) LRLBFGSmodel.clearThreshold() LRLBFGS_scoredLabel = numpy.array(test_set.map(lambda lp: (LRLBFGSmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect()) LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 0.4)) LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 0.8)) LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 1.0)) def getAccumulatedPercentage(socredLabel): result = [] total = socredLabel.sum()