def train_single_SVM_model(self, dataset):
    """
    Train a single model using the SVM (Support Vector Machine) algorithm.

    ``self.model_training`` selects how the rows of ``dataset`` are turned into
    training points and which trainer is used:

    * ``"imp"`` - one ``UserLabeledPoint`` per row, keyed by the user id and
      trained with ``LTRSVMWithSGD``;
    * ``"cmp"`` - ``dataset`` is first joined with ``self.user_clusters`` on
      ``self.userId_col``; each point is keyed by the cluster id and trained
      with ``LTRSVMWithSGD``;
    * any other value - plain ``LabeledPoint`` rows trained with ``SVMWithSGD``.

    :param dataset: data frame holding the training data points; it is read
        through ``dataset.rdd`` (and ``dataset.join`` in ``"cmp"`` mode)
    :return: the trained SVM model
    """
    Logger.log("train_single_SVM_model")
    mode = self.model_training

    if mode == "imp":
        def to_user_point(line):
            # Row layout per the original notes:
            # peer_paper_id | paper_id | user_id | features | label
            # UserLabeledPoint(userId, label, features)
            return UserLabeledPoint(int(line[2]), line[4], line[3])

        # convert the data points data frame to an RDD of labeled points
        labeled_data_points = dataset.rdd.map(to_user_point)
        Logger.log("Number of partitions for labeled data points: "
                   + str(labeled_data_points.getNumPartitions()))
        model = LTRSVMWithSGD().train(labeled_data_points,
                                      intercept=False, validateData=False)
    elif mode == "cmp":
        # attach the cluster of each user to the training rows
        cluster_dataset = dataset.join(self.user_clusters, self.userId_col)

        def to_cluster_point(line):
            # Row layout per the original notes:
            # user_id | peer_paper_id | paper_id | features | label | cluster_id
            # UserLabeledPoint(clusterId, label, features)
            return UserLabeledPoint(int(line[-1]), line[4], line[3])

        labeled_data_points = cluster_dataset.rdd.map(to_cluster_point)
        model = LTRSVMWithSGD().train(labeled_data_points,
                                      validateData=False, intercept=False)
    else:
        def to_label_point(line):
            # Row layout per the original notes:
            # paper_id | peer_paper_id | user_id | citeulike_paper_id | features | label
            # LabeledPoint(label, features)
            return LabeledPoint(line[-1], line[-2])

        labeled_data_points = dataset.rdd.map(to_label_point)
        model = SVMWithSGD().train(labeled_data_points,
                                   validateData=False, intercept=False)

    # Single exit point: every branch used to return early, which left this
    # completion message unreachable.
    Logger.log("Training LTRModel finished.")
    return model
# Featurize a sample "ham" sentence with the same transformer as the training data.
hamExample = tf.transform(
    "Spark is really good at big data processing".split(" ")
)

# Classify the spam and ham samples with the model trained so far.
print(model.predict(spamExample))
print(model.predict(hamExample))

# Logistic regression trained with the L-BFGS optimizer.
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

algo = LogisticRegressionWithLBFGS()
model = algo.train(training_data)
score(model)

# Support Vector Machines: another popular algorithm, same train/score flow.
from pyspark.mllib.classification import SVMWithSGD

algo = SVMWithSGD()
model = algo.train(training_data)
score(model)

# Trees: three variants of tree-based classification follow.
# Their API is slightly different from the previous algorithms.
from pyspark.mllib.tree import DecisionTree, GradientBoostedTrees, RandomForest

algo = DecisionTree()
# Split the samples 80% / 20% into training and test sets and cache both,
# since they are reused by every classifier below.
training_data, test_data = samples.randomSplit([0.8, 0.2])
training_data.cache()
test_data.cache()

# Logistic regression optimized with SGD.
algorithm = LogisticRegressionWithSGD()
model = algorithm.train(training_data)
print('logistic regression sgd:', score(model))

# Logistic regression optimized with L-BFGS.
algorithm = LogisticRegressionWithLBFGS()
model = algorithm.train(training_data)
print('logistic regression with lbfgs:', score(model))

# Tree-based classifiers are currently disabled:
# algorithm = DecisionTree()
# model = algorithm.trainClassifier(training_data, numClasses=2,categoricalFeaturesInfo={})
# print('decision tree: ',score(model))
#
# algorithm = RandomForest()
# model = algorithm.trainClassifier(training_data,numClasses=2,categoricalFeaturesInfo={},numTrees=16)
# print('random forest: ',score(model))

# Naive Bayes.
algorithm = NaiveBayes()
model = algorithm.train(training_data)
print('naive bayes: ', score(model))

# Linear SVM, limited to 10 SGD iterations.
algorithm = SVMWithSGD()
model = algorithm.train(training_data, iterations=10)
print('svm with sgd: ', score(model))

# model.save(sc,"classifierModelPLOS")

if __name__ == "__main__":
    pass
idf = IDF().fit(tf) tfidf = idf.transform(tf).collect() #data2 =tfidf.collect() # a = lists.collect() # b = tfidf.collect() # print "type tfidf {} len {}".format(type(b),len(b)) # c=b[0] # print "type c {} len {}".format(type(c),len(c)) ## cross-validaton/Grid-search: cv = ShuffleSplit(len(tfidf), n_iter=3, test_size=0.3, random_state=42) nb = NaiveBayes() lr = LogisticRegressionWithLBFGS() svm = SVMWithSGD() models = [lr, nb, svm] scores = {model.__name__: [] for model in models} grids = [ParamGridBuilder()\ .baseOn({lr.labelCol: 'l'})\ .baseOn([lr.predictionCol, 'p'])\ .addGrid(lr.regParam, [1.0, 2.0]) \ .addGrid(lr.maxIter, [0, 1]).build(),\ ParamGridBuilder()\ .addGrid(nb.lambda_, [0, 1]) \ .addGrid(nb.maxIter, [0, 1]).build() ,\ ParamGridBuilder()\ .baseOn({svm.labelCol: 'l'})\ .baseOn([svm.predictionCol, 'p'])\ .addGrid(svm.regParam, [1.0, 2.0]) \ .addGrid(svm.maxIter, [0, 1]).build()
# Standardization: fit a scaler on the raw features (withMean / withStd both
# enabled) and apply it to the same feature RDD.
stdscaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
scaledFeature = stdscaler.transform(featureRDD)

# Pair every label with its scaled feature vector and wrap it in a LabeledPoint.
labelPoint = labelRDD.zip(scaledFeature)
labelPointRDD = labelPoint.map(lambda x: LabeledPoint(x[0], x[1]))

# Logistic regression model. Keyword arguments spell out what the former
# positional arguments meant (iterations, step, miniBatchFraction).
model = LogisticRegressionWithSGD.train(
    labelPointRDD,
    iterations=num_iter,
    step=learning_rate,
    miniBatchFraction=batch_size,
)

# SVM: training goes through the `train` classmethod; calling the class itself
# with training arguments does not build a model.
from pyspark.mllib.classification import SVMWithSGD

model = SVMWithSGD.train(
    trainData,
    iterations=num_iter,
    step=learning_rate,
    regParam=regParam,
)

# Naive Bayes; lambdaParam is the smoothing parameter (`lambda_`).
from pyspark.mllib.classification import NaiveBayes

model = NaiveBayes.train(trainData, lambda_=lambdaParam)

# DataFrame: despite its name, `sqlContext` holds a SparkSession.
sqlContext = SparkSession.builder.getOrCreate()

from pyspark.sql import Row

# Split each raw "|"-separated user record into its fields.
# (The original used Scala's `x => ...` lambda syntax, which is a SyntaxError in Python.)
userRDD = rawUserRDD.map(lambda x: x.split("|"))
user_row = userRDD.map(
    lambda p: Row(userid=int(p[0]), age=int(p[1]), gender=p[2])
)
# schema
return LabeledPoint(label, Vectors.dense(features)) nb_data = records.map(lambda r: labeled_point_nb(r)) print("the first data of nb data and the count of nb data:") print(nb_data.first()) #start train model num_iterations = 10 max_tree_depth = 5 lr_model = LogisticRegressionWithLBFGS().train(data, num_iterations) print("logistic regression model :") print(lr_model) svm_model = SVMWithSGD().train(data, num_iterations) print("svm model :") print(svm_model) nb_model = NaiveBayes().train(nb_data) print("naive bayes model :") print(nb_model) dt_model = DecisionTree().trainClassifier(data, 2, {}) print("decision tree model :") print(dt_model) #start predict data_point = data.first() lr_prediction = lr_model.predict(data_point.features) print("logistic model prediction :" + str(lr_prediction))
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD from pyspark.ml.tuning import ParamGridBuilder model_list = { "Linear-SVM": SVMWithSGD(), "LogisticRegression": LogisticRegressionWithSGD() } params_list = { "LogisticRegression" : ParamGridBuilder()\ .addGrid(LogisticRegressionWithSGD.regParam, [0.1, 0.01]) \ .addGrid(LogisticRegressionWithSGD.step, [0.1, 0.01])\ .addGrid(LogisticRegressionWithSGD.miniBatchFraction, [0.1, 0.5, 1.0])\ .addGrid(LogisticRegressionWithSGD.regType, ['l1', 'l2', None])\ .addGrid(LogisticRegressionWithSGD.convergenceTol, [0.001, 0.0001]) .build(), "Linear-SVM": ParamGridBuilder()\ .addGrid(SVMWithSGD.regParam, [0.1, 0.01]) \ .addGrid(SVMWithSGD.step, [0.1, 0.01])\ .addGrid(SVMWithSGD.miniBatchFraction, [0.1, 0.5, 1.0])\ .addGrid(SVMWithSGD.regType, ['l1', 'l2', None])\ .addGrid(SVMWithSGD.convergenceTol, [0.001, 0.0001]) .build() } if __name__ == "__main__": # SVMWithSGD.train(training_data, estimatorParamMaps=grid) # SVMWithSGD.predict(test_data) # LogisticRegressionWithSGD.train(data, estimatorParamMaps=grid) # LogisticRegressionWithSGD.predict(test_data)