def ctr_gbdt(file_dir):
    """Train, evaluate and save a gradient-boosted-trees CTR classifier.

    Reads the ``part*`` text files under ``file_dir + CTR_TRAINING_DATA``,
    splits the lines 70/30 into training and test sets, parses both with
    ``_parse_point``, trains a ``GradientBoostedTrees`` classifier with 100
    iterations, logs the misclassification rate on the held-out split and
    saves the model to ``file_dir + CTR_GBDT_DATA``.

    Args:
        file_dir: Path prefix to which the module-level ``CTR_TRAINING_DATA``
            and ``CTR_GBDT_DATA`` suffixes are appended.
    """
    logger = logging.getLogger()
    sc = SparkContext(appName="CTRGBDTRegression")
    try:
        data = sc.textFile(file_dir + CTR_TRAINING_DATA + "/part*")
        (training_data, test_data) = data.randomSplit([0.7, 0.3])
        parsed_train_data = training_data.map(_parse_point)
        parsed_test_data = test_data.map(_parse_point)

        # Train a GradientBoostedTrees model.
        # An empty categoricalFeaturesInfo means all features are continuous.
        model = GradientBoostedTrees.trainClassifier(
            parsed_train_data,
            categoricalFeaturesInfo={},
            numIterations=100)

        # Evaluate on the held-out split: fraction of wrong predictions.
        predictions = model.predict(
            parsed_test_data.map(lambda x: x.features))
        labels_and_predictions = parsed_test_data.map(
            lambda lp: lp.label).zip(predictions)
        wrong = labels_and_predictions.filter(
            lambda vp: vp[0] != vp[1]).count()
        test_err = wrong / float(parsed_test_data.count())

        # The rate is measured on the test split, so label it as such.
        logger.debug('GBDT Test Error = %s', test_err)
        logger.debug('Learned classification GBT model:')
        logger.debug(model.toDebugString())
        logger.debug("Tree totalNumNodes%s", model.totalNumNodes())

        # Persist the trained model.
        ctr_gbdt_data = file_dir + CTR_GBDT_DATA
        model.save(sc, ctr_gbdt_data)
        logger.info("GBDT training finished")
    finally:
        # The context is created here and never handed out, so release it;
        # a leftover active context would block the next SparkContext(...).
        sc.stop()
def train_model(cls, trianData, cateFeaInfo=None, iterTimes=3):
    """Train a gradient-boosted-trees classifier.

    Args:
        cls: Not used by this function.
        trianData: Training data passed unchanged to
            ``GradientBoostedTrees.trainClassifier``.
        cateFeaInfo: Value for ``categoricalFeaturesInfo``. ``None`` (the
            default) means an empty dict; a fresh dict is created per call
            so no mutable default is shared between calls.
        iterTimes: Value for ``numIterations``.

    Returns:
        The model returned by ``GradientBoostedTrees.trainClassifier``.
    """
    if cateFeaInfo is None:
        cateFeaInfo = {}
    return GradientBoostedTrees.trainClassifier(
        trianData,
        categoricalFeaturesInfo=cateFeaInfo,
        numIterations=iterTimes)
def trainevaluatemodel_gbdt(traindata, validationdata, loss, numiterations,
                            learningrate, maxdepth, maxbins):
    """Train one GBT classifier, score it and report the timing.

    Trains on ``traindata`` with the given hyper-parameters, passes the model
    and ``validationdata`` to ``evaluation``, prints a one-item-per-line
    summary and returns the same values as a tuple.

    Returns:
        ``(loss, numiterations, learningrate, maxdepth, maxbins, duration,
        index)`` where ``duration`` is the elapsed wall-clock time in seconds
        (training plus evaluation) and ``index`` is whatever ``evaluation``
        returned.
    """
    began = time()
    gbt = GradientBoostedTrees.trainClassifier(
        traindata,
        categoricalFeaturesInfo={},
        loss=loss,
        numIterations=numiterations,
        learningRate=learningrate,
        maxDepth=maxdepth,
        maxBins=maxbins)
    index = evaluation(gbt, validationdata)
    duration = time() - began

    summary = [
        'Param:',
        'loss:' + str(loss),
        'numiterations:' + str(numiterations),
        'learningrate:' + str(learningrate),
        'maxdepth:' + str(maxdepth),
        'maxbins:' + str(maxbins),
        'time:' + str(duration),
        'index:' + str(index),
    ]
    print('\n'.join(summary))
    return (loss, numiterations, learningrate, maxdepth, maxbins,
            duration, index)
def run_GBDT(input_file, output_file, iterations):
    """Train a GBT classifier on encoded input and save its debug dump.

    Tabs in every input line are replaced by commas. The lines are parsed
    with ``parsePoint``, flattened, mapped through ``maaro`` and summed per
    key; the 26 keys with the largest sums get indices 0..25 in that order.
    Each comma-separated line is then encoded with ``parseOHEPoint`` using
    that index table, a classifier is trained, and ``toDebugString()`` is
    written as a single-partition text file at ``output_file``.

    NOTE(review): ``iterations`` is never read -- training always runs with
    ``numIterations=2``. Confirm whether it should be wired through.
    """
    comma_lines = sc.textFile(input_file).map(
        lambda x: x.replace('\t', ','))
    parsed = comma_lines.map(parsePoint).cache()

    # Per-key totals, then the 26 largest (negated value => descending).
    totals = parsed.flatMap(lambda x: x).map(maaro).reduceByKey(
        lambda a, b: a + b)
    top_keys = totals.takeOrdered(26, lambda kv: -kv[1])

    # Rank in the top list becomes the key's index.
    ohe_index = {entry[0]: rank for rank, entry in enumerate(top_keys)}

    encoded = comma_lines.map(
        lambda point: parseOHEPoint(point, ohe_index, 39))
    model = GradientBoostedTrees.trainClassifier(
        encoded,
        loss='logLoss',
        numIterations=2,
        categoricalFeaturesInfo={},
        learningRate=0.1,
        maxDepth=7,
        maxBins=2)

    dump = sc.parallelize([model.toDebugString()])
    dump.coalesce(1).saveAsTextFile(output_file)
def train(self):
    """Train a GBT classifier from the CSV files and score it.

    Loads ``neg.csv`` and ``pos.csv`` as training data and ``ntest.csv`` and
    ``ptest.csv`` as test data (all with a header row), converts each row
    with ``featureExtraction`` into a ``LabeledPoint`` (first element is the
    label, the rest are features), trains with 75 iterations, saves the
    model to ``'.'`` and returns ``score`` applied to the (label, prediction)
    pairs of the test set.
    """
    csv_format = 'org.apache.spark.sql.execution.datasources.csv.CSVFileFormat'

    def read_csv(path):
        # Every input uses the same reader settings.
        return spark.read.format(csv_format).option(
            'header', 'true').load(path)

    def to_labeled_points(frame):
        rows = frame.rdd.map(featureExtraction)
        return rows.map(lambda x: LabeledPoint(x[0], x[1:])).cache()

    neg_df = read_csv('neg.csv')
    pos_df = read_csv('pos.csv')
    test_pos_df = read_csv('ptest.csv')
    test_neg_df = read_csv('ntest.csv')

    train_points = to_labeled_points(neg_df.union(pos_df))
    test_points = to_labeled_points(test_neg_df.union(test_pos_df))

    gbt = GradientBoostedTrees.trainClassifier(
        train_points, categoricalFeaturesInfo={}, numIterations=75)

    predicted = gbt.predict(test_points.map(lambda x: x.features))
    labels_and_predictions = test_points.map(
        lambda lp: lp.label).zip(predicted)

    # save model
    gbt.save(sc, '.')
    return score(labels_and_predictions)
def main():
    """Predict review ratings from averaged word vectors and save the RMSE.

    Reviews read from ``inputs`` are cleaned with ``clean_reviewf``; those
    whose ``reviewTime.tm_year`` is before 2014 form the training set and
    those from 2014 the test set. Each review text is expanded with ``myf``,
    every resulting word is joined with its word2vec vector, and the vectors
    are averaged per review. ``get_lp`` turns each (rating, average vector)
    join result into the input for a GBT classifier (20 iterations). The
    root-mean-square error on the test set is written as text to ``output``.
    """
    def index_reviews(dated_reviews):
        # ((rating, text), index) entries, cached for reuse.
        pairs = dated_reviews.map(lambda r: (r[0], r[1]))
        return pairs.zipWithIndex().cache()

    def ratings_by_index(indexed):
        return indexed.map(lambda e: (e[1], e[0][0]))

    def words_by_review(indexed):
        # (word, review_index) so it can be joined on the word.
        texts = indexed.map(lambda e: (e[1], e[0][1]))
        return texts.flatMapValues(myf).map(lambda kv: (kv[1], kv[0]))

    def labeled_points(vectors, ratings, words):
        # word -> (vector, review_index), regrouped per review with a count.
        per_word = vectors.join(words).map(
            lambda e: (e[1][1], (e[1][0], 1)))
        summed = per_word.reduceByKey(
            lambda a, b: (np.sum([a[0], b[0]], axis=0), a[1] + b[1]))
        averaged = summed.map(
            lambda e: (e[0], np.array(e[1][0]) / float(e[1][1])))
        return ratings.join(averaged).map(get_lp)

    text = sc.textFile(inputs)
    # maybe changed to the sfu server path
    nltk_data_path = "[change yo your own nltk_data location]"
    nltk.data.path.append(nltk_data_path)

    cleaned = text.map(clean_reviewf).cache()
    review_texts = cleaned.map(lambda review: review['reviewText'])
    reviews = cleaned.map(
        lambda review: (review['overall'], review['reviewText'],
                        review['reviewTime'])).cache()

    training_data = index_reviews(
        reviews.filter(lambda r: r[2].tm_year < 2014))
    testing_data = index_reviews(
        reviews.filter(lambda r: r[2].tm_year == 2014))

    training_rating = ratings_by_index(training_data)
    training_words = words_by_review(training_data)
    testing_rating = ratings_by_index(testing_data)
    testing_words = words_by_review(testing_data)

    word2vec_model = generate_word2vec_model(review_texts)
    word_vectors = word2vec_model.getVectors()
    # this step seems redundant but necessary
    vector_pairs = [(word, [f for f in vec])
                    for word, vec in word_vectors.items()]
    dct_rdd = sc.parallelize(vector_pairs)

    training_lps = labeled_points(dct_rdd, training_rating, training_words)
    testing_lps = labeled_points(dct_rdd, testing_rating, testing_words)

    gbt_model = GradientBoostedTrees.trainClassifier(
        training_lps, categoricalFeaturesInfo={}, numIterations=20)
    predictions = gbt_model.predict(testing_lps.map(lambda x: x.features))
    labels_and_predictions = testing_lps.map(
        lambda lp: lp.label).zip(predictions)

    squared_error = labels_and_predictions.map(
        lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum()
    mse = squared_error / float(testing_lps.count())
    rmse = math.sqrt(mse)

    sc.parallelize([str(rmse)]).saveAsTextFile(output)
def testClassification(trainingData, testData):
    """Train a GBT classifier and print its test error and structure.

    Args:
        trainingData: Training data for ``trainClassifier``.
        testData: RDD whose elements expose ``features`` and ``label``.
    """
    # An empty categoricalFeaturesInfo means all features are continuous.
    gbt = GradientBoostedTrees.trainClassifier(
        trainingData,
        categoricalFeaturesInfo={},
        numIterations=30,
        maxDepth=4)

    # Pair each true label with its prediction and count disagreements.
    predicted = gbt.predict(testData.map(lambda x: x.features))
    pairs = testData.map(lambda lp: lp.label).zip(predicted)
    wrong = pairs.filter(lambda v_p: v_p[0] != v_p[1]).count()
    test_error = wrong / float(testData.count())

    print("Test Error = " + str(test_error))
    print("Learned classification ensemble model:")
    print(gbt.toDebugString())
def test_classification(self):
    """Check that six MLlib classifiers separate a four-point toy dataset.

    Points 0 and 2 are labelled 0.0 and points 1 and 3 are labelled 1.0;
    every trained model must predict ``<= 0`` for the former and ``> 0``
    for the latter.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1]),
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check(model):
        # Same four assertions, in the same order, for every model.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    check(LogisticRegressionWithSGD.train(rdd))
    check(SVMWithSGD.train(rdd))
    check(NaiveBayes.train(rdd))

    # feature 0 has 3 categories
    categoricalFeaturesInfo = {0: 3}
    check(DecisionTree.trainClassifier(
        rdd, numClasses=2,
        categoricalFeaturesInfo=categoricalFeaturesInfo))
    check(RandomForest.trainClassifier(
        rdd, numClasses=2,
        categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
    check(GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
def GBDT_train(data, filename):
    """Train a GBT classifier on a split of ``data`` and report accuracy.

    ``split_data(data, [0.5, 0.5])`` selects the rows to use; each row
    becomes ``LabeledPoint(row[1], row[-1])``. The points are split 80/20
    (seed 0) into training and test sets, a model is trained with 20
    iterations, its accuracy on the test set is printed, and
    ``pre_all(data, model, filename)`` is called.

    Args:
        data: Input passed to ``split_data`` and ``pre_all``.
        filename: Passed through to ``pre_all``.

    Returns:
        ``(model, accuracy)``.

    Raises:
        ZeroDivisionError: If the test split is empty.
    """
    data_train = split_data(data, [0.5, 0.5])
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)
    model_GBDT = GradientBoostedTrees.trainClassifier(
        training, {}, numIterations=20)

    # predict() must be called on the RDD itself: the Python model wraps a
    # JVM object and cannot be used inside an RDD transformation.
    predictions = model_GBDT.predict(test.map(lambda x: x.features))
    label_and_prediction = test.map(lambda x: x.label).zip(predictions)
    hits = label_and_prediction.filter(lambda lp: lp[0] == lp[1]).count()
    accuracy = 1.0 * hits / test.count()

    print("accuracy of model_GBDT:%f" % accuracy)
    pre_all(data, model_GBDT, filename)
    return model_GBDT, accuracy
def testClassification(trainingData, testData):
    """Fit a GBT classifier, then print its test error and debug dump.

    Args:
        trainingData: Training data for ``trainClassifier``.
        testData: RDD whose elements expose ``features`` and ``label``.
    """
    # An empty categoricalFeaturesInfo means all features are continuous.
    ensemble = GradientBoostedTrees.trainClassifier(
        trainingData,
        categoricalFeaturesInfo={},
        numIterations=30,
        maxDepth=4)

    # Test error = share of test points whose prediction differs from the label.
    guesses = ensemble.predict(testData.map(lambda x: x.features))
    label_guess = testData.map(lambda lp: lp.label).zip(guesses)
    misses = label_guess.filter(lambda vp: vp[0] != vp[1]).count()
    testErr = misses / float(testData.count())

    print('Test Error = ' + str(testErr))
    print('Learned classification ensemble model:')
    print(ensemble.toDebugString())
def fit(model, df, param):
    """Train a GBT classifier on ``df`` and store it in ``model['model']``.

    Args:
        model: Dict holding the Spark session under ``'spark'``; the trained
            classifier is written back under ``'model'``.
        df: Data passed to ``spark.createDataFrame``.
        param: Dict with ``'feature_variables'`` (column names used as
            features) and ``'target_variables'`` (its first entry is the
            label column). ``param['options']['params']['iterations']``,
            when present, overrides the default of 10 iterations.

    Returns:
        ``{"message": "model trained"}``.
    """
    spark = model['spark']
    sc = spark.sparkContext
    feature_variables = param['feature_variables']
    target_variable = param['target_variables'][0]

    # Use the override only when the whole options/params/iterations path exists.
    has_override = (
        'options' in param
        and 'params' in param['options']
        and 'iterations' in param['options']['params']
    )
    if has_override:
        iterations = int(param['options']['params']['iterations'])
    else:
        iterations = 10

    points = spark.createDataFrame(df).rdd.map(
        lambda row: LabeledPoint(
            row[target_variable],
            [row[x] for x in feature_variables]))
    model['model'] = GradientBoostedTrees.trainClassifier(
        points, categoricalFeaturesInfo={}, numIterations=iterations)

    return {"message": "model trained"}
def Gradient_BoostedTrees(filename, sc):
    """Train a 3-iteration GBT classifier on the sample LibSVM data.

    Loads the LibSVM file, holds out 30% for testing, trains on the rest and
    prints the test error followed by the model's debug string.

    NOTE(review): ``filename`` is not used; the dataset path below is
    hard-coded. Confirm whether the argument was meant to replace it.

    Args:
        filename: Currently ignored.
        sc: SparkContext used to load the data.
    """
    dataset = MLUtils.loadLibSVMFile(
        sc, "/Users/Jacob/SparkService/data/sample_libsvm_data.txt")
    # 70% training / 30% held out for testing.
    (train_part, test_part) = dataset.randomSplit([0.7, 0.3])

    # Empty categoricalFeaturesInfo: all features are treated as continuous.
    gbt = GradientBoostedTrees.trainClassifier(
        train_part, categoricalFeaturesInfo={}, numIterations=3)

    predicted = gbt.predict(test_part.map(lambda x: x.features))
    label_pred = test_part.map(lambda lp: lp.label).zip(predicted)
    wrong = label_pred.filter(lambda vp: vp[0] != vp[1]).count()
    testErr = wrong / float(test_part.count())

    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(gbt.toDebugString())
def crossValidator(IterNums, dataset_rdd, rate):
    """Evaluate a GBT classifier on five random class-wise folds.

    Rows with ``e[1] > 0.5`` are positives and rows with ``e[1] < 0.5`` are
    negatives. Each class is split into five equally weighted random parts;
    fold *i* tests on the union of the *i*-th positive and negative parts
    and trains on ``dataset_rdd`` minus that test set. Rows become
    ``LabeledPoint(e[1], e[2:])``. Per fold the tuple ``(precision, recall,
    accuracy, F_Value, hitPositive, positive, testset_positive,
    testset_count)`` is appended to a local list.

    NOTE(review): no ``return`` is visible here, so the collected metrics
    are discarded as written -- confirm against the full file.

    Args:
        IterNums: ``numIterations`` for training.
        dataset_rdd: RDD of indexable rows as described above.
        rate: ``learningRate`` for training.
    """
    positives = dataset_rdd.filter(lambda e: e[1] > 0.5)
    negatives = dataset_rdd.filter(lambda e: e[1] < 0.5)
    positive_folds = positives.randomSplit([1, 1, 1, 1, 1])
    negative_folds = negatives.randomSplit([1, 1, 1, 1, 1])

    result = []
    for fold in range(5):
        pos_part = positive_folds[fold]
        testset_positive = pos_part.count()
        testset_rdd = pos_part.union(negative_folds[fold])
        testset_count = testset_rdd.count()
        trainset_rdd = dataset_rdd.subtract(testset_rdd)

        trainset = trainset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
        testset = testset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
        model = GradientBoostedTrees.trainClassifier(
            trainset, {}, numIterations=IterNums, learningRate=rate)

        # (label, prediction) pairs for the fold's test set.
        guessed = model.predict(testset.map(lambda x: x.features))
        predict = testset.map(lambda lp: lp.label).zip(guessed)

        hitALL = predict.filter(lambda e: e[0] == e[1]).count()
        hitPositive = predict.filter(
            lambda e: e[0] == e[1] and (e[0] > 0.5)).count()
        positive = predict.filter(lambda e: e[1] > 0.5).count()

        recall = hitPositive / float(testset_positive)
        precision = hitPositive / float(positive)
        accuracy = hitALL / float(testset_count)
        # Harmonic mean of precision and recall.
        F_Value = 2 / (1 / precision + 1 / recall)

        result.append((precision, recall, accuracy, F_Value, hitPositive,
                       positive, testset_positive, testset_count))
def precreate_models(train_data):
    """Train a small grid of random-forest and GBT classifiers.

    Random forests come first (depth 9, 4 and 7 trees, entropy impurity,
    ``onethird`` feature subsets), followed by GBT classifiers (9
    iterations, learning rates 0.1 and 1.0, depth 9, least-squares loss).

    Args:
        train_data: Training data handed to every ``trainClassifier`` call.

    Returns:
        List of trained models in the order described above.
    """
    forests = [
        RandomForest.trainClassifier(
            train_data,
            numClasses=10,
            categoricalFeaturesInfo={},
            numTrees=num_trees,
            featureSubsetStrategy=feature,
            impurity=impurity,
            maxDepth=depth,
            maxBins=32)
        for depth in range(9, 10)
        for num_trees in range(4, 10, 3)
        for impurity in ['entropy']  # ['gini', 'entropy']
        # ['auto', 'all', 'sqrt', 'log2', 'onethird']
        for feature in ['onethird']
    ]
    boosted = [
        GradientBoostedTrees.trainClassifier(
            train_data,
            categoricalFeaturesInfo={},
            loss=loss,
            numIterations=iters,
            learningRate=rate,
            maxDepth=depth)
        for iters in range(9, 10)
        for rate in np.linspace(0.1, 1, 2)
        for depth in range(9, 10)
        # ['logLoss', 'leastSquaresError', 'leastAbsoluteError']
        for loss in ['leastSquaresError']
    ]
    return forests + boosted
# $example off$

if __name__ == "__main__":
    sc = SparkContext(
        appName="PythonGradientBoostedTreesClassificationExample")
    # $example on$
    # Read the sample dataset in LibSVM format.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Keep 70% for training; the remaining 30% is held out for testing.
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Fit the boosted ensemble.
    #  - categoricalFeaturesInfo={} treats every feature as continuous.
    #  - Three iterations keep the example fast; use more in practice.
    model = GradientBoostedTrees.trainClassifier(
        trainingData,
        categoricalFeaturesInfo={},
        numIterations=3)

    # Test error: share of held-out points predicted incorrectly.
    predictions = model.predict(testData.map(lambda point: point.features))
    labelsAndPredictions = testData.map(
        lambda point: point.label).zip(predictions)
    mismatches = labelsAndPredictions.filter(
        lambda pair: pair[0] != pair[1]).count()
    testErr = mismatches / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(model.toDebugString())

    # Round-trip the model through storage.
    model.save(sc, "target/tmp/myGradientBoostingClassificationModel")
    sameModel = GradientBoostedTreesModel.load(
        sc, "target/tmp/myGradientBoostingClassificationModel")
# data model is basically a dict which maps from column name to either {"min":x, "max":y } for numeric fields and [val1,val2, ...valN] for string fields datamodel = dmt.computeDataModel(df) DataModelTools.checkTargetForModelType(datamodel,target,model_type) # use DataModelTools to convert from DataFrame to an RDD of LabelledPoint for specified target/predictors lp = dmt.extractLabelledPoint(df,target,predictors).map(lambda x:x[1]).cache() # build the decision tree model from pyspark.mllib.tree import GradientBoostedTrees if model_type == "classification": model = GradientBoostedTrees.trainClassifier( lp, categoricalFeaturesInfo=dmt.getCategoricalFeatureInfo(df,predictors), loss=loss_param, numIterations=numIterations_param, learningRate=learningRate_param, maxDepth=maxDepth_param, maxBins=maxBins_param) else: # regression model = GradientBoostedTrees.trainRegressor( lp, categoricalFeaturesInfo=dmt.getCategoricalFeatureInfo(df,predictors), loss=loss_param, numIterations=numIterations_param, learningRate=learningRate_param, maxDepth=maxDepth_param, maxBins=maxBins_param) build_report = mbr.report(lp.count(),lp.getNumPartitions(),
path='/covtype/covtype.data' inputRDD=sc.textFile(path) Label=2.0 Data = inputRDD.map(lambda line: [float(x) for x in line.split(',')]).map(lambda V:LabeledPoint((V[-1]==Label), V[:-1])).cache() (trainingData,testData) = Data.randomSplit([0.7,0.3],seed=255) from time import time errors={} catInfo = {} for i in range(10,54): catInfo[i] = 2 for depth in [10]: start=time() model=GradientBoostedTrees.trainClassifier(trainingData,learningRate = 0.2, numIterations = 30, maxDepth = depth, categoricalFeaturesInfo=catInfo) errors[depth]={} dataSets={'train':trainingData,'test':testData} for name in dataSets.keys(): # Calculate errors on train and test sets data=dataSets[name] Predicted=model.predict(data.map(lambda x: x.features)) LabelsAndPredictions=data.map(lambda x: x.label).zip(Predicted) Err = LabelsAndPredictions.filter(lambda (v,p):v != p).count()/float(data.count()) errors[depth][name]=Err print errors
# In[8]: #Split the training set and test set (trainingData, testData) = data.randomSplit([0.7, 0.3]) # In[9]: #Training model RF_model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, numTrees=3, featureSubsetStrategy="auto", impurity='gini', maxDepth=5, maxBins=32) GB_model = GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo={}, numIterations=3) # In[10]: #Predication def cal_mllib_accuracy(list): for i, clf in enumerate(list): #prediction with the features predictions = clf.predict(testData.map(lambda x: x.features)) #append with lables first then features labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) accuracy = labelsAndPredictions.filter(lambda (v, p): v == p).count()/testData.count() #compare results
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel from pyspark.mllib.tree import RandomForest, RandomForestModel from pyspark.mllib.util import MLUtils # Read the file into an RDD # If doing this on a real cluster, you need the file to be available on all nodes, ideally in HDFS. path='/HIGGS/HIGGS.csv' inputRDD=sc.textFile(path) # Transform the text RDD into an RDD of LabeledPoints Data=inputRDD.map(lambda line: [float(strip(x)) for x in line.split(',')]) .map(lambda x: LabeledPoint(x[0], x[1:])) Data1=Data.sample(False,0.1, seed=255).cache() (trainingData,testData)=Data1.randomSplit([0.7,0.3],seed = 255) trainingData.cache() testData.cache() errors={} depth = 10 model=GradientBoostedTrees.trainClassifier(trainingData, {}, numIterations=30, learningRate=0.3, maxDepth=depth) errors[depth]={} dataSets={'train':trainingData,'test':testData} for name in dataSets.keys(): # Calculate errors on train and test sets data=dataSets[name] Predicted=model.predict(data.map(lambda x: x.features)) LabelsAndPredictions=data.map(lambda x: x.label).zip(Predicted) Err = LabelsAndPredictions.filter(lambda (v,p):v != p).count()/float(data.count()) errors[depth][name]=Err print depth,errors[depth]
for x in featurs_raw: feature = float(x.strip().strip("'").strip()) features.append(feature) label = float(fields[11]) #print ("label=" + str(label)) return LabeledPoint(label,features) data = sc.textFile("/Users/jiayangan/project/SearchAds/data/log/ctr_features_demo3/part*") (trainingData, testData) = data.randomSplit([0.7, 0.3]) parsedTrainData = trainingData.map(parsePoint) parsedTestData = testData.map(parsePoint) # Train a GradientBoostedTrees model. # Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous. # (b) Use more iterations in practice. model = GradientBoostedTrees.trainClassifier(parsedTrainData, categoricalFeaturesInfo={}, numIterations=100,maxDepth=3) # Evaluate model on test instances and compute test error predictions = model.predict(parsedTestData.map(lambda x: x.features)) labelsAndPredictions = parsedTestData.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(parsedTestData.count()) print('training Error = ' + str(testErr)) print('Learned classification GBT model:') print(model.toDebugString()) print("tree totalNumNodes" + str(model.totalNumNodes())) # Save and load model model.save(sc, "/Users/jiayangan/project/SearchAds/data/model/ctr_gbdt_model_demo_20")
raise ValueError("Unsupported type {0}".format(type(v))) scaled_labelPoint_train = scaled_training.rdd.map( lambda row: LabeledPoint(row.label, as_old(row.features_scaled))) print(scaled_labelPoint_train.take(2)) labelPoint_train = training_dense.rdd.map( lambda row: LabeledPoint(row.label, as_old(row.features))) print(labelPoint_train.take(2)) print('Learned classification GBT model:') import time train_start = time.time() GBTmodel = GradientBoostedTrees.trainClassifier(labelPoint_train, categoricalFeaturesInfo={}, numIterations=30) train_end = time.time() print(f'Time elapsed training model: {train_end - train_start} seconds') # Evaluate model on test instances and compute test error predictions = GBTmodel.predict(test_dense.rdd.map(lambda x: x.features.values)) labelsAndPredictions = test_dense.rdd.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter( lambda lp: lp[0] != lp[1]).count() / float(test_dense.rdd.count()) print('Test Error = ' + str(testErr)) from pyspark.mllib.tree import RandomForest, RandomForestModel print('Learned classification RF model:')
# # # * `GradientBoostedTreesModel` represents the output of the boosting process: a linear combination of classification trees. The methods supported by this class are: # * `save(sc, path)` : save the tree to a given filename, sc is the Spark Context. # * `load(sc,path)` : The counterpart to save - load classifier from file. # * `predict(X)` : predict on a single datapoint (the `.features` field of a `LabeledPont`) or an RDD of datapoints. # * `toDebugString()` : print the classifier in a human readable format. # In[32]: from time import time errors = {} for depth in [10]: start = time() model = GradientBoostedTrees.trainClassifier( trainingData, {}, maxDepth=depth, numIterations=30) ##FILLIN to generate 10 trees ##) #print model.toDebugString() errors[depth] = {} dataSets = {'train': trainingData, 'test': testData} for name in dataSets.keys(): # Calculate errors on train and test sets data = dataSets[name] Predicted = model.predict(data.map(lambda x: x.features)) LabelsAndPredictions = data.map(lambda lp: lp.label).zip( Predicted) ### FILLIN ### Err = LabelsAndPredictions.filter( lambda (v, p): v != p).count() / float(data.count()) errors[depth][name] = Err print depth, errors[depth] #,int(time()-start),'seconds' #print errors
return LabeledPoint(values[0], values[1:]) #train data load train_data_new = sc.textFile('/home/hduser/dataset.txt') parsedData = train_data_new.map(parsePoint) #test data load test_data_new = sc.textFile('/home/hduser/testfile.txt') test_final = test_data_new.map(parsePoint2) # Split train and test X_train, X_test = parsedData.randomSplit([0.8, 0.2]) #train the classifier model = GradientBoostedTrees.trainClassifier(X_train, categoricalFeaturesInfo={}, numIterations=10) #20% of training data predictions = model.predict(X_test.map(lambda x: x.features)) labelsAndPredictions1 = X_test.map(lambda p: p.label).zip(predictions) #test data predictions1 = model.predict(test_final.map(lambda x: x.features)) y_final = test_final.map(lambda p: p.label).zip(predictions1) er = labelsAndPredictions1.filter(lambda (v, p): v != p).count() / float( X_train.count()) acc = (1 - er) * 100 print('===============================================================') print(model.toDebugString()) print('===============================================================')
trainingData = trainingData.cache() testData = testData.cache() from time import time errors={} cfi = {} depth = 9 lr = 0.3 lossfunc = "logLoss" #stopER = 0.27498 stopER = 0.2745 testER = 1.0 attemption = 0 while testER>=stopER: start=time() model=GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo=cfi,loss="logLoss",maxDepth=depth,numIterations=10,learningRate=lr) errors[depth]={} dataSets={'train':trainingData,'test':testData} for name in ['test','train']: # Calculate errors on train and test sets data=dataSets[name] Predicted=model.predict(data.map(lambda x: x.features)) LabelsAndPredictions=data.map(lambda lp:lp.label).zip(Predicted) Err = LabelsAndPredictions.filter(lambda (v,p):v != p).count()/float(data.count()) errors[depth][name]=Err if name=='test': if Err>=stopER: attemption+=1 break else: testER = Err print depth,errors[depth]
# In[6]: Data1=Data.sample(False,0.1, seed=255).cache() (trainingData,testData)=Data1.randomSplit([0.7,0.3],seed=255) # ###Gradient Boosted Trees # In[7]: from time import time errors={} for depth in [10]: model=GradientBoostedTrees.trainClassifier(Data1, categoricalFeaturesInfo={}, numIterations=10, maxDepth=depth, learningRate=0.25, maxBins=35) #print model.toDebugString() errors[depth]={} dataSets={'train':trainingData,'test':testData} for name in dataSets.keys(): # Calculate errors on train and test sets data=dataSets[name] Predicted=model.predict(data.map(lambda x: x.features)) LabelsAndPredictions=data.map(lambda lp: lp.label).zip(Predicted) Err = LabelsAndPredictions.filter(lambda (v,p):v != p).count()/float(data.count()) errors[depth][name]=Err print depth,errors[depth] # In[ ]:
from pyspark.mllib.tree import RandomForest

# Random forest: metrics on the training data, then a peek at test predictions.
model = RandomForest.trainClassifier(train_data, numClasses=2,
                                     categoricalFeaturesInfo={}, numTrees=20)
print('RandomForest')
predictions_and_labels = getPredictionsLabels(model, train_data)
printMetrics(predictions_and_labels)
acc = ModelAccuracy(model, train_data)
print("准确率accuracy=" + str(acc))

predict = model.predict(test_data.map(lambda p: p.features))
predict_all = predict.zip(test_data.map(lambda p: p.features))
# One take(10) fetches only the first ten (prediction, features) pairs;
# calling collect() inside the loop pulled the whole RDD to the driver on
# every iteration. With fewer than ten rows, only those rows are printed.
for row in predict_all.take(10):
    print(row)

print('\n')
from pyspark.mllib.tree import GradientBoostedTrees

# Gradient-boosted trees: same report as above.
model = GradientBoostedTrees.trainClassifier(train_data,
                                             categoricalFeaturesInfo={},
                                             maxDepth=5)
print('GradientBoostedTrees')
predictions_and_labels = getPredictionsLabels(model, train_data)
printMetrics(predictions_and_labels)
acc = ModelAccuracy(model, train_data)
print("准确率accuracy=" + str(acc))

predict = model.predict(test_data.map(lambda p: p.features))
predict_all = predict.zip(test_data.map(lambda p: p.features))
for row in predict_all.take(10):
    print(row)
# ### Gradient Boosted Trees # In[ ]: from time import time errors = {} if not PYBOLT: depths = [1, 3, 6, 10] else: depths = [10] for depth in depths: # 15, 20 start = time() model = GradientBoostedTrees.trainClassifier(trainingData, {}, maxDepth=depth, numIterations=30, learningRate=0.3) # numIterations is the numTrees #print model.toDebugString() errors[depth] = {} dataSets = {'train': trainingData, 'test': testData} for name in dataSets.keys(): # Calculate errors on TRAIN and TEST sets data = dataSets[name] Predicted = model.predict(data.map(lambda x: x.features)) LabelsAndPredictions = data.map(lambda x: x.label).zip(Predicted) Err = LabelsAndPredictions.filter(lambda (v, p): v != p).count() / float( data.count()) # zip errors[depth][name] = Err if not PYBOLT:
inputRDD=sc.textFile(path) # In[3]: input_sampled = inputRDD.sample(False,0.1, seed=255) Data=input_sampled.map(lambda line: [float(x) for x in line.split(',')]).map(lambda V:LabeledPoint(V[0], V[1:])) # In[4]: Data.cache() (trainingData,testData)=Data.randomSplit([0.7,0.3], seed=255) # In[ ]: depth=10 model=GradientBoostedTrees.trainClassifier(trainingData,categoricalFeaturesInfo={}, numIterations=25,learningRate=0.2,maxDepth=depth) errors={} errors[depth]={} dataSets={'train':trainingData,'test':testData} for name in dataSets.keys(): # Calculate errors on train and test sets data=dataSets[name] Predicted=model.predict(data.map(lambda x: x.features)) LabelsAndPredictions=data.map(lambda x : x.label).zip(Predicted) Err = LabelsAndPredictions.filter(lambda (v,p):v != p).count()/float(data.count()) errors[depth][name]=Err print depth,errors[depth]
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

# API docs
# https://spark.apache.org/docs/1.4.0/api/python/pyspark.mllib.html


# Function to convert .csv files into 'LabeledPoint' format
def parsePoint(line):
    """Parse one CSV line: last value is the label, first 65 are features."""
    values = [float(x.strip()) for x in line.split(',')]
    return LabeledPoint(values[-1], values[:65])


# Load .csv data
train_csv = sc.textFile("train.csv")
test_csv = sc.textFile("test.csv")

# Convert the data to LabeledPoint format
train_parsed = train_csv.map(parsePoint)
test_parsed = test_csv.map(parsePoint)

# Build a GBM / TreeNet model
# (a stray `o` token between the arguments made this call a syntax error)
model = GradientBoostedTrees.trainClassifier(
    train_parsed,
    loss='leastSquaresError',
    categoricalFeaturesInfo={},
    numIterations=300,
    maxDepth=2,
    learningRate=0.1)

# Get predictions and see how it did
predictions = model.predict(test_parsed.map(lambda x: x.features))
labelsAndPredictions = test_parsed.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda t: t[0] != t[1]).count() / float(test_parsed.count())
print(testErr)
from pyspark.context import SparkContext
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint


def parsePoint(line):
    """Parse a CSV line: last field is the label, fields 1..9 are features."""
    values = [float(x.strip()) for x in line.split(',')]
    return LabeledPoint(values[-1], values[1:10])


# The file used to be loaded twice in a row; one load is enough.
data = sc.textFile("heart_disease.csv")
data = data.map(parsePoint)
(trainingData, testData) = data.randomSplit([0.7, 0.3])
model = GradientBoostedTrees.trainClassifier(
    trainingData, categoricalFeaturesInfo={}, numIterations=30, maxDepth=4)

# This works too!
train = sc.textFile("train.csv")


def parsePoint(line):
    """Parse a CSV line: last field is the label, first 65 fields are features.

    Deliberately replaces the earlier definition for the second data set.
    """
    values = [float(x.strip()) for x in line.split(',')]
    return LabeledPoint(values[-1], values[:65])


train = train.map(parsePoint)
model = GradientBoostedTrees.trainClassifier(
    train, categoricalFeaturesInfo={}, numIterations=300,
    maxDepth=2, learningRate=0.1)
test = sc.textFile("test.csv")
test = test.map(parsePoint)
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
# Index the (label, prediction) pair: tuple-parameter lambdas are a
# SyntaxError on Python 3.
testErr = labelsAndPredictions.filter(
    lambda vp: vp[0] != vp[1]).count() / float(test.count())
###############################################################################
# Gradient boosted trees
if 'GBT_mllib' in model_list:
    t0 = datetime.datetime.now()
    # Only change parameters here. Every entry is forwarded to
    # trainClassifier as a keyword argument, so keys must be valid
    # parameter names of that call.
    #GBT_mllib_par_dict['numClasses'] = 2
    GBT_mllib_par_dict = {
        'loss': 'logLoss',
        'maxDepth': 10,
        'numIterations': 40,
        'learningRate': 0.1,
    }
    model_mllib_GBT = GradientBoostedTrees.trainClassifier(
        trainingData_rdd, categoricalFeaturesInfo={}, **GBT_mllib_par_dict)
    mllib_model_accuracy('GBT', model_mllib_GBT, trainingData_rdd, testData_rdd)
    # Record the parameters used and the time taken by this section.
    modelparameter_dict['GBT_mllib'] = GBT_mllib_par_dict
    runtime_write('GBT_mllib', t0)
###############################################################################
# Logistic regression
if 'logreg_mllib' in model_list:
    t0 = datetime.datetime.now()
    #classmethod train(data, iterations=100, step=1.0, miniBatchFraction=1.0, initialWeights=None,
    #regParam=0.01, regType='l2', intercept=False, validateData=True)
def parsePoint2(line): values= [float(x) for x in line.split(',')] return LabeledPoint(values[0], values[1:]) #train data load train_data_new = sc.textFile('/home/hduser/dataset.txt') parsedData = train_data_new.map(parsePoint) #test data load test_data_new = sc.textFile('/home/hduser/testfile.txt') test_final = test_data_new.map(parsePoint2) # Split train and test X_train, X_test = parsedData.randomSplit([0.8,0.2]) #train the classifier model=GradientBoostedTrees.trainClassifier(X_train,categoricalFeaturesInfo={},numIterations=10) #20% of training data predictions=model.predict(X_test.map(lambda x: x.features)) labelsAndPredictions1 = X_test.map(lambda p: p.label).zip(predictions) #test data predictions1=model.predict(test_final.map(lambda x: x.features)) y_final = test_final.map(lambda p: p.label).zip(predictions1) er =labelsAndPredictions1.filter(lambda (v, p): v != p).count() / float(X_train.count()) acc = (1 - er)*100 print('===============================================================') print(model.toDebugString()) print('===============================================================') for i in y_final.collect():
# rf_train_predict_label = []
# rf_test_predict_label = []
# # Predict Labels
# for j in range(0, len(test_features), 1):
#     p_l = model.predict(test_features[j])
#     rf_test_predict_label.extend([p_l])
# final_predict_labels = rf_test_predict_label

# # Gradient Boosted Trees
# # C14 - C21
# # Build the Model

# Build the Model
model = GradientBoostedTrees.trainClassifier(train_data, {}, numIterations=10)
gbt_train_predict_label = []

# Predict Labels: one prediction per test feature row, in order.
gbt_test_predict_label = list(map(model.predict, test_features))
final_predict_labels = gbt_test_predict_label

# Calculate Precision, Recall and F1 Score
tp = 0.0
fp = 0.0
tn = 0.0
trainingData = trainingData.map(lambda x: LabeledPoint(x[0], x[1]))

# <=================================================================================================================>
# RandomForest Classifier
maxDepth_selection = [5, 10, 15, 20, 30]
maxBins_selection = [10, 20, 30, 40]
# categoricalFeaturesInfo is a required argument of trainClassifier; an
# empty dict treats every feature as continuous.
model_rf = RandomForest.trainClassifier(trainingData, numClasses=8,
                                        categoricalFeaturesInfo={},
                                        numTrees=800, featureSubsetStrategy="auto",
                                        impurity='gini', maxDepth=5, maxBins=30)
# testData still holds raw (label, features) pairs, hence the index access.
predictions_rf = model_rf.predict(testData.map(lambda x: x[1]))
labelsAndPredictions_rf = testData.map(lambda x: x[0]).zip(predictions_rf)
# Index the (label, prediction) pair: tuple-parameter lambdas are a
# SyntaxError on Python 3.
testErr_rf = labelsAndPredictions_rf.filter(
    lambda vp: vp[0] != vp[1]).count() / float(testData.count())
# NOTE(review): the value printed is the misclassification rate, not a
# precision; the message text is left untouched for whoever parses it.
# str() is required: concatenating a float to a str raises TypeError.
print("Precision is " + str(testErr_rf))

# <=================================================================================================================>
# Gradient Boost Decision Tree
# tunning order
learningRate_selection = [0.1, 0.2, 0.3]
maxDepth_selection = [5, 10, 15, 20, 30]
# GradientBoostedTrees.trainClassifier takes no numClasses parameter and
# requires categoricalFeaturesInfo.
# NOTE(review): MLlib's GBT classifier is binary (labels in {0, 1}); the
# forest above is trained with numClasses=8, so confirm the labels are
# suitable for this model.
model_xgbt = GradientBoostedTrees.trainClassifier(trainingData,
                                                  categoricalFeaturesInfo={},
                                                  loss='logLoss', numIterations=800,
                                                  learningRate=0.1, maxDepth=10)
predictions_xgbt = model_xgbt.predict(testData.map(lambda x: x[1]))
labelsAndPredictions_xgbt = testData.map(lambda x: x[0]).zip(predictions_xgbt)
testErr_xgbt = labelsAndPredictions_xgbt.filter(
    lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print("Precision is " + str(testErr_xgbt))
# reference http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#module-pyspark.mllib.tree
pyspark.StorageLevel(True, False, False, False, 1)
sc = SparkContext("local", "ad RD leave predict")

if __name__ == '__main__':
    prepareLeaverList('../var/all2015rdleaves')
    # Fetch six consecutive batches; the second argument advances by one
    # batch size per call (presumably a row offset - confirm).
    l = 500
    data = [row
            for i in range(6)
            for row in buildDataFromCouchbase(l, l * i)]
    cataDict = getCataDict()
    training_rdd = sc.parallelize(data)
    model = GradientBoostedTrees.trainClassifier(
        training_rdd, cataDict, numIterations=10, maxBins=500)
    print(model.numTrees())
    print(model.totalNumNodes())
    #
    # print('===== random predict===')
    # idxManager = ManagerCata.index('Alex Tseng (RD-TW)')
    # print('idx manager = '+str(idxManager))
    # print(model.predict([0.0,0.0,idxManager]))
    # print(model.predict([0.0,1.0,idxManager]))
    # print(model.predict([1.0,0.0,idxManager]))
    # print(model.predict([1.0,1.0,idxManager]))
    # rdd = sc.parallelize([[2.0], [0.0]])
    # model.predict(rdd).collect()
# * **maxBins** – maximum number of bins used for splitting features (default: 32) DecisionTree requires maxBins >= max categories
#
#
# * `GradientBoostedTreesModel` represents the output of the boosting process: a linear combination of classification trees. The methods supported by this class are:
#   * `save(sc, path)` : save the tree to a given filename, sc is the Spark Context.
#   * `load(sc,path)` : The counterpart to save - load classifier from file.
#   * `predict(X)` : predict on a single datapoint (the `.features` field of a `LabeledPoint`) or an RDD of datapoints.
#   * `toDebugString()` : print the classifier in a human readable format.

# In[32]:

from time import time

errors = {}
for depth in [10]:
    start = time()
    # numIterations sets the number of boosting rounds (30 here).
    model = GradientBoostedTrees.trainClassifier(
        trainingData, {}, maxDepth=depth, numIterations=30)
    #print model.toDebugString()
    errors[depth] = {}
    dataSets = {'train': trainingData, 'test': testData}
    for name in dataSets.keys():  # Calculate errors on train and test sets
        data = dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda lp: lp.label).zip(Predicted)
        # Index the (label, prediction) pair: tuple-parameter lambdas are a
        # SyntaxError on Python 3.
        Err = LabelsAndPredictions.filter(
            lambda vp: vp[0] != vp[1]).count() / float(data.count())
        errors[depth][name] = Err
    # Single pre-formatted argument so the output is identical on Python 2
    # and 3.
    print("%s %s" % (depth, errors[depth]))  #,int(time()-start),'seconds'
#print errors

# In[33]:
p_l = model.predict(test_features[j]) rf_test_predict_label.extend([p_l]) for j in range(0, len(train_features), 1): p_l = model.predict(train_features[j]) rf_train_predict_label.extend([p_l]) # Append Labels appendColumn(ensemble_test, rf_test_predict_label) appendColumn(ensemble_train, rf_train_predict_label) # Gradient Boosted Trees # C14 - C21 # Build the Model # Build the Model model = GradientBoostedTrees.trainClassifier(train_data, {}) gbt_train_predict_label = [] gbt_test_predict_label = [] # Predict Labels for j in range(0, len(test_features), 1): p_l = model.predict(test_features[j]) gbt_test_predict_label.extend([p_l]) for j in range(0, len(train_features), 1): p_l = model.predict(train_features[j]) gbt_train_predict_label.extend([p_l]) # Append Labels appendColumn(ensemble_test, gbt_test_predict_label)
# In[45]:

Data1 = Data.sample(False, 0.1, seed=255).cache()
(trainingData, testData) = Data1.randomSplit([0.7, 0.3], seed=255)
# print 'Sizes: Data1=%d, trainingData=%d, testData=%d'%(Data1.count(),trainingData.cache().count(),testData.cache().count())

# In[59]:

from time import time

errors = {}
for depth in [10]:
    # Fit on the training split only. Fitting on Data1 (which also contains
    # testData) lets the model see the rows it is later scored on, so the
    # reported test error would not measure generalisation.
    model = GradientBoostedTrees.trainClassifier(
        trainingData, categoricalFeaturesInfo={}, numIterations=10,
        maxDepth=depth, learningRate=0.25, maxBins=54)
    #print model.toDebugString()
    errors[depth] = {}
    dataSets = {'train': trainingData, 'test': testData}
    for name in dataSets.keys():  # Calculate errors on train and test sets
        data = dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda lp: lp.label).zip(Predicted)
        # Index the (label, prediction) pair: tuple-parameter lambdas are a
        # SyntaxError on Python 3.
        Err = LabelsAndPredictions.filter(
            lambda vp: vp[0] != vp[1]).count() / float(data.count())
        errors[depth][name] = Err
    # Single pre-formatted argument so the output is identical on Python 2
    # and 3.
    print("%s %s" % (depth, errors[depth]))

# In[ ]:
# Gradient boosted trees
if 'GBT_mllib' in model_list:
    t0 = datetime.datetime.now()
    # Only change parameters here. Every entry is forwarded to
    # trainClassifier as a keyword argument, so keys must be valid
    # parameter names of that call.
    #GBT_mllib_par_dict['numClasses'] = 2
    GBT_mllib_par_dict = {
        'loss': 'logLoss',
        'maxDepth': 10,
        'numIterations': 40,
        'learningRate': 0.1,
    }
    model_mllib_GBT = GradientBoostedTrees.trainClassifier(
        trainingData_rdd, categoricalFeaturesInfo={}, **GBT_mllib_par_dict)
    mllib_model_accuracy('GBT', model_mllib_GBT, trainingData_rdd,
                         testData_rdd)
    # Record the parameters used and the time taken by this section.
    modelparameter_dict['GBT_mllib'] = GBT_mllib_par_dict
    runtime_write('GBT_mllib', t0)
###############################################################################
# Logistic regression
if 'logreg_mllib' in model_list:
    t0 = datetime.datetime.now()
print("Number of test set rows: %d" % test_data.count())

# COMMAND ----------

# MAGIC %md ### Train Gradient Boosted trees and Random Forest model

# COMMAND ----------

from pyspark.mllib.tree import RandomForest
from time import *
from pyspark.mllib.tree import GradientBoostedTrees

# Train a model Gradient Boosted Trees and time the call.
start_time = time()
modelGBT = GradientBoostedTrees.trainClassifier(
    training_data, categoricalFeaturesInfo={})
end_time = time()
elapsed_time_GBT = end_time - start_time
print("Time to train GBT model: %.3f seconds" % elapsed_time_GBT)

# Train a model Random Forest and time the call; the hyper-parameters are
# collected in one place and forwarded as keyword arguments.
rf_params = dict(numClasses=2, categoricalFeaturesInfo={},
                 numTrees=3, featureSubsetStrategy="auto", impurity="gini",
                 maxDepth=4, maxBins=32, seed=SEED)
start_time = time()
model = RandomForest.trainClassifier(training_data, **rf_params)
end_time = time()
elapsed_time_RF = end_time - start_time
print("Time to train Random Forest model: %.3f seconds" % elapsed_time_RF)
def main():
    """Train Naive Bayes and logistic regression, then a GBDT on their outputs.

    Loads the data sets through ``load(sc)``, fits the two base classifiers
    on a 1% sample of the labelled TF-IDF training data, combines their
    integer predictions with ``stack_label`` and trains a gradient-boosted
    classifier on the result. Train/dev results are reported through
    ``valid`` and the test predictions are handed to ``dump``.
    """

    def _banner(text):
        # Print a progress marker and flush stdout.
        print(text)
        sys.stdout.flush()

    def _predict_int(model, rdd):
        # Predict on an RDD and cast every prediction to int.
        return model.predict(rdd).map(int)

    sc = SparkContext('local[15]', 'haha')
    # sc._conf.set("spark.python.profile", "true")
    print(sc.getConf().getAll())

    d = load(sc)
    data_train_lp = d['train_tfidf_lp']
    data_dev_p = d['dev_tfidf']
    label_dev_gt = d['dev_gt']
    test_p = d['test_tfidf']
    data_train_p = d['train_tfidf']
    label_train_gt = d['train_gt']
    data_train = d['train_raw']
    data_dev = d['dev_raw']
    data_test = d['test_raw']

    data_train_lp = data_train_lp.sample(False, 0.01)
    # print(sum(data_train_lp.first()[0]))
    # print(data_train_lp.zipWithIndex().collect())
    print(data_train_lp.take(2))

    # --- Naive Bayes -------------------------------------------------------
    _banner("___________train_bayes_____________")
    nb = NaiveBayes.train(data_train_lp)
    _banner("___________trained_bayes___________")
    # nb.save(sc, 'bayes.model')
    bayes_result_dev = _predict_int(nb, data_dev_p)
    bayes_result_dev.count()
    bayes_result_train = _predict_int(nb, data_train_p)
    bayes_result_train.count()
    bayes_result_test = _predict_int(nb, test_p)
    bayes_result_test.count()
    print("train info:")
    valid(bayes_result_train, label_train_gt)
    print("dev info:")
    valid(bayes_result_dev, label_dev_gt)

    # --- Logistic regression -----------------------------------------------
    _banner("___________train_logistic_____________")
    lg = LogisticRegressionWithSGD.train(data_train_lp, step=0.005)
    _banner("___________trained_logisitc___________")
    # lg.save(sc, 'logistic.model')
    logistic_result_dev = _predict_int(lg, data_dev_p)
    logistic_result_train = _predict_int(lg, data_train_p)
    logistic_result_test = _predict_int(lg, test_p)
    print("train info:")
    valid(logistic_result_train, label_train_gt)
    print("dev info:")
    valid(logistic_result_dev, label_dev_gt)

    # --- GBDT on the combined base-model predictions -------------------------
    fused_train_p = stack_label([bayes_result_train, logistic_result_train])
    fused_dev_p = stack_label([bayes_result_dev, logistic_result_dev])
    fused_test_p = stack_label([bayes_result_test, logistic_result_test])
    fused_train_lp = label(data_train, fused_train_p)
    _banner("___________train_GBDT___________")
    gbdt = GradientBoostedTrees.trainClassifier(fused_train_lp, {})
    _banner('___________trained_GBDT_________')
    fused_result_train = gbdt.predict(fused_train_p)
    fused_result_dev = gbdt.predict(fused_dev_p)
    fused_result_test = gbdt.predict(fused_test_p)
    print("train info:")
    valid(fused_result_train, label_train_gt)
    print("dev info:")
    valid(fused_result_dev, label_dev_gt)
    dump(fused_result_test.map(int).collect())
    sc.show_profiles()
def test_classification(self):
    """Train each MLlib classifier on a four-point data set and check it.

    Every model must predict a value <= 0 for points 0 and 2 and a value
    > 0 for points 1 and 3. The tree-based models are additionally saved
    to a temporary directory, reloaded, and compared through
    ``toDebugString()``.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_predictions(model):
        # Points 0 and 2 carry label 0.0; points 1 and 3 carry label 1.0.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    def check_save_load(model, model_cls, model_dir):
        # A reloaded model must describe the same trees as the saved one.
        model.save(self.sc, model_dir)
        same_model = model_cls.load(self.sc, model_dir)
        self.assertEqual(same_model.toDebugString(), model.toDebugString())

    temp_dir = tempfile.mkdtemp()
    # try/finally so the temporary directory is removed even when one of
    # the assertions fails.
    try:
        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        check_predictions(lr_model)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        check_predictions(svm_model)

        nb_model = NaiveBayes.train(rdd)
        check_predictions(nb_model)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        check_predictions(dt_model)
        check_save_load(dt_model, DecisionTreeModel, os.path.join(temp_dir, "dt"))

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10, maxBins=4, seed=1)
        check_predictions(rf_model)
        check_save_load(rf_model, RandomForestModel, os.path.join(temp_dir, "rf"))

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        check_predictions(gbt_model)
        check_save_load(gbt_model, GradientBoostedTreesModel, os.path.join(temp_dir, "gbt"))
    finally:
        # Best-effort cleanup: a failure to delete must not mask the result.
        try:
            rmtree(temp_dir)
        except OSError:
            pass
# full dataset for cluster
Data1 = Data.sample(False, 0.1).cache()
(trainingData, testData) = Data1.randomSplit([0.7, 0.3], seed=255)

# subset of dataset for local testing
#Data1=Data.sample(False,0.01).cache()
#(trainingData,testData)=Data1.randomSplit([0.7,0.3],seed=255)#.cache()

# gradient boosted tree model
errors = {}
for depth in [10]:
    for lr in [0.2]:
        for numiter in [20]:
            model = GradientBoostedTrees.trainClassifier(
                trainingData,
                categoricalFeaturesInfo={},
                numIterations=numiter,
                maxDepth=depth,
                learningRate=lr)
            # Results are keyed by depth only, so a later (lr, numiter)
            # combination for the same depth overwrites the earlier one.
            errors[depth] = {}
            dataSets = {'train': trainingData, 'test': testData}
            for name in dataSets.keys():  # Calculate errors on train and test sets
                data = dataSets[name]
                Predicted = model.predict(data.map(lambda x: x.features))
                LabelsAndPredictions = data.map(lambda LP: LP.label).zip(
                    Predicted)
                # Index the (label, prediction) pair: tuple-parameter
                # lambdas are a SyntaxError on Python 3.
                Err = LabelsAndPredictions.filter(
                    lambda vp: vp[0] != vp[1]).count() / float(data.count())
                errors[depth][name] = Err
            # Single pre-formatted argument so the output is identical on
            # Python 2 and 3.
            print("%s %s" % (depth, errors[depth]))  #,int(time()-start),'seconds'
# Parse every CSV row; the last column becomes a binary label
# (1.0 when it equals 2.0, otherwise 0.0) and the rest are the features.
Data = inputRDD.map(lambda line: [float(x) for x in line.split(',')]) \
    .map(lambda V: LabeledPoint(1.0, V[:-1]) if V[-1] == 2.0 else LabeledPoint(0.0, V[:-1])) \
    .cache()

# ### Reducing data size

# In[11]:

(trainingData, testData) = Data.randomSplit([0.7, 0.3], seed=255)
trainingData.cache()
testData.cache()

# ### Gradient Boosted Trees

# In[13]:

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

errors = {}
for depth in [14]:
    model = GradientBoostedTrees.trainClassifier(
        trainingData, {}, numIterations=15, maxDepth=depth)
    errors[depth] = {}
    dataSets = {'train': trainingData, 'test': testData}
    for name in dataSets.keys():  # Calculate errors on train and test sets
        data = dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda lp: lp.label).zip(Predicted)
        # Index the (label, prediction) pair: tuple-parameter lambdas are a
        # SyntaxError on Python 3.
        Err = LabelsAndPredictions.filter(
            lambda vp: vp[0] != vp[1]).count() / float(data.count())
        errors[depth][name] = Err
    # Single pre-formatted argument so the output is identical on Python 2
    # and 3.
    print("%s %s" % (depth, errors[depth]))
# * **maxDepth** – Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 3)
# * **maxBins** – maximum number of bins used for splitting features (default: 32) DecisionTree requires maxBins >= max categories
#
#
# * `GradientBoostedTreesModel` represents the output of the boosting process: a linear combination of classification trees. The methods supported by this class are:
#   * `save(sc, path)` : save the tree to a given filename, sc is the Spark Context.
#   * `load(sc,path)` : The counterpart to save - load classifier from file.
#   * `predict(X)` : predict on a single datapoint (the `.features` field of a `LabeledPoint`) or an RDD of datapoints.
#   * `toDebugString()` : print the classifier in a human readable format.

errors = {}
# Declare features 10..53 as categorical with 2 categories each.
catInfo = {}
for i in range(10, 54):
    catInfo[i] = 2
depth = 13
model = GradientBoostedTrees.trainClassifier(
    trainingData, categoricalFeaturesInfo=catInfo, maxDepth=depth,
    numIterations=13, learningRate=0.15)
#print model.toDebugString()
errors[depth] = {}
dataSets = {'train': trainingData, 'test': testData}
for name in dataSets.keys():
    data = dataSets[name]
    Predicted = model.predict(data.map(lambda x: x.features))
    LabelsAndPredictions = data.map(lambda x: x.label).zip(Predicted)
    # Index the (label, prediction) pair: tuple-parameter lambdas are a
    # SyntaxError on Python 3.
    Err = LabelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(data.count())
    errors[depth][name] = Err
# Single pre-formatted argument so the output is identical on Python 2 and 3.
print("%s %s" % (depth, errors[depth]))

# coding: utf-8
# Accumulate the per-split test error over every (train, test) index pair
# produced by `ss`, then report the average.
Err = 0.0
results = []
for train_index, test_index in ss:
    # Materialise the rows selected for this split.
    X_training = [X[i] for i in train_index]
    Y_training = [Y[i] for i in train_index]
    X_test = [X[i] for i in test_index]
    Y_test = [Y[i] for i in test_index]

    parsedData = [LabeledPoint(target, row)
                  for target, row in zip(Y_training, X_training)]
    model = GradientBoostedTrees.trainClassifier(
        sc.parallelize(parsedData), {}, numIterations=10)

    # Count the test rows whose prediction differs from the true label.
    testErr = sum(1 for expected, row in zip(Y_test, X_test)
                  if expected != model.predict(row))
    Err += float(testErr) / float(len(X_test))
print ("AVG test error: %.6f" % (Err/iter_number))
StringIndexer(inputCol="osv", outputCol="osvIndex", handleInvalid="keep"), OneHotEncoderEstimator(inputCols=["osIndex", "osvIndex"],outputCols=["osVec", "osvVec"], handleInvalid='keep')]) pipeline_model = pipeline.fit(trainingData) train = pipeline_model.transform(trainingData) test = pipeline_model.transform(testData) def turn_labelpoint(x): label = int(x.label) features_list = np.array(list(x['osVec']) + list(x['osvVec'])) features_vec = Vectors.sparse(len(features_list), list(np.where(features_list>0)[0]), list(features_list[features_list>0])) return LabeledPoint(label, features_vec) train_labelPoint = train.rdd.map(turn_labelpoint) test_labelPoint = test.rdd.map(turn_labelpoint) gbdt_model = GradientBoostedTrees.trainClassifier(data=train_labelPoint,categoricalFeaturesInfo={}, numIterations=30, maxDepth=3) tree_num = gbdt_model.numTrees() print('tree_num',tree_num) gbdt_model_trees = GbdtModelTrees(gbdt_model.toDebugString()) # for t in gbdt_model_trees.trees: # t.pre_order(t.root) # print('--------------------') #sys.exit(-1) """ categoricalFeaturesInfo:向量中为分类属性的索引表。任务没有出现在该列表中的特征将会以连续值处理。{n:k}表示第 n 个特征,是 0-k 的分类属性。 """ def getTreeLeafMap(): feature_map = dict()
def test_classification(self):
    """Train each MLlib classifier on a four-point data set and check it.

    Every model must predict a value <= 0 for points 0 and 2 and a value
    > 0 for points 1 and 3. The tree-based models are additionally saved
    to a temporary directory, reloaded, and compared through
    ``toDebugString()``.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest, \
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_predictions(model):
        # Points 0 and 2 carry label 0.0; points 1 and 3 carry label 1.0.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    def check_save_load(model, model_cls, model_dir):
        # A reloaded model must describe the same trees as the saved one.
        model.save(self.sc, model_dir)
        same_model = model_cls.load(self.sc, model_dir)
        self.assertEqual(same_model.toDebugString(), model.toDebugString())

    temp_dir = tempfile.mkdtemp()
    # try/finally so the temporary directory is removed even when one of
    # the assertions fails.
    try:
        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        check_predictions(lr_model)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        check_predictions(svm_model)

        nb_model = NaiveBayes.train(rdd)
        check_predictions(nb_model)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        check_predictions(dt_model)
        check_save_load(dt_model, DecisionTreeModel, os.path.join(temp_dir, "dt"))

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10, maxBins=4, seed=1)
        check_predictions(rf_model)
        check_save_load(rf_model, RandomForestModel, os.path.join(temp_dir, "rf"))

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        check_predictions(gbt_model)
        check_save_load(gbt_model, GradientBoostedTreesModel, os.path.join(temp_dir, "gbt"))
    finally:
        # Best-effort cleanup: a failure to delete must not mask the result.
        try:
            rmtree(temp_dir)
        except OSError:
            pass
#model=SVMWithSGD.train(train, 1.0)
### Change XCA
# TODO We are testing several MLs
# 1) LogisticsRegression
#model =LogisticRegressionwWithSGD.train(train) This is used for Logistic regression classification
# 2) SVM Classification
#model=SVMWithSGD.train(train) This used for SVM classification
# 3) RandomForest
#************Random forest model in pyspark is experimental so not sure whether works perfectly or not
#model=RandomForest.trainClassifier(train,2,{},300,seed=2) here 300 is best solution as per literature for this dataset
##### from doc
#model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
#                                     numTrees=3, featureSubsetStrategy="auto",
#                                     impurity='gini', maxDepth=4, maxBins=32)
# Gradient Boost
model = GradientBoostedTrees.trainClassifier(
    train, categoricalFeaturesInfo={}, numIterations=30, maxDepth=4)
print("retrieving predictions and evaluating")
# Tree-ensemble models cannot be called from inside an RDD transformation
# in PySpark, so predict on the whole feature RDD and zip the result with
# the labels instead of calling model.predict per row inside map().
predictions = model.predict(test.map(lambda p: p.features))
predictionAndLabel = predictions.zip(test.map(lambda p: p.label))
# Index the (prediction, label) pair: tuple-parameter lambdas are a
# SyntaxError on Python 3.
accuracy = 1.0 * predictionAndLabel.filter(
    lambda pl: pl[0] == pl[1]).count() / test.count()
print("accuracy for GradientBoostedTrees:" + str(accuracy))