def func2():
    """Binary classification with a Spark ML Pipeline on the 'train.tsv' data.

    Builds a StringIndexer -> OneHotEncoder -> VectorAssembler ->
    DecisionTreeClassifier pipeline, reports AUC on a held-out split, then
    tunes the tree with TrainValidationSplit over a parameter grid.

    NOTE(review): relies on module-level ``sqlContext``, ``Path`` and
    ``replace_question`` defined elsewhere in this file.
    :return: None (results are printed)
    """
    row_df = sqlContext.read.format("csv").option("header", True) \
        .option("delimiter", "\t").load(Path + "train.tsv")
    # Keep url/category as-is; cast columns 4.. to double after converting
    # placeholder values via replace_question.
    df = row_df.select(
        ["url", "alchemy_category"]  # columns needing no conversion
        + [replace_question(col(column)).cast("double").alias(column)
           for column in row_df.columns[4:]])  # columns needing conversion
    train_df, test_df = df.randomSplit([0.7, 0.3])

    # --- build the ML Pipeline ---
    # Index the category string, then one-hot encode the index.
    stringIndexer = StringIndexer(inputCol="alchemy_category",
                                  outputCol="alchemy_category_index")
    encoder = OneHotEncoder(dropLast=False,
                            inputCol="alchemy_category_index",
                            outputCol="alchemy_category_indexVec")
    # One-hot vector plus the numeric features (last column is the label).
    assemblerInputs = ["alchemy_category_indexVec"] + row_df.columns[4:-1]
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features",
                                impurity="gini", maxDepth=10, maxBins=14)
    pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, dt])
    print(pipeline.getStages())

    # --- fit the pipeline and inspect the model ---
    pipelineModel = pipeline.fit(train_df)  # train
    # stages[3] (the 4th stage) holds the fitted decision-tree model.
    print(pipelineModel.stages[3])
    print(pipelineModel.stages[3].toDebugString)

    # --- predict with the pipeline ---
    predicted = pipelineModel.transform(test_df)
    print(predicted.columns)
    predicted.select("url", "features", "rawprediction", "probability",
                     "label", "prediction").show(5)
    predicted.select("probability", "prediction").take(5)

    # --- evaluate model AUC ---
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                              labelCol="label",
                                              metricName="areaUnderROC")
    auc = evaluator.evaluate(predicted)
    print("auc:", auc)

    # Hyper-parameter grid for selecting the best combination.
    # BUGFIX: "entory" is not a valid impurity — DecisionTreeClassifier
    # accepts only "gini" or "entropy".
    paramGrid = ParamGridBuilder() \
        .addGrid(dt.impurity, ["gini", "entropy"]) \
        .addGrid(dt.maxDepth, [5, 10, 15]) \
        .addGrid(dt.maxBins, [10, 15, 20]).build()
    # trainRatio=0.8: data is split 8:2 into training and validation sets.
    tvs = TrainValidationSplit(estimator=dt, evaluator=evaluator,
                               estimatorParamMaps=paramGrid, trainRatio=0.8)
    tvs_pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, tvs])
    tvs_pipelineModel = tvs_pipeline.fit(train_df)
    bestModel = tvs_pipelineModel.stages[3].bestModel
    print("bestModel", bestModel)
    predictions = tvs_pipelineModel.transform(test_df)
    auc2 = evaluator.evaluate(predictions)
    print("auc2:", auc2)
# 可以看到本质上存储的是SparseVector类型 print(df3.select('features').take(1)) # 5, 使用DecionTreeClassifier二元分类 dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", impurity='gini', maxDepth=10, maxBins=14) dt_model = dt.fit(df3) print(dt_model) dt4 = dt_model.transform(df3) # 6, 建立pipeline pipeline = Pipeline(stages=[categoryIndexer, encoder, assembler, dt]) print(pipeline.getStages()) # 7, 使用pipeline进行数据处理与训练 # 因为训练数据执行pipeline的所有阶段,所以会花时间比较长,最后产生的结果是pipelineModel pipelineModel = pipeline.fit(train_df) print(pipelineModel.stages[3]) # 我们还可以进一步使用toDebugString查看决策树模型的规则 print(pipelineModel.stages[3].toDebugString) # 8, 使用pipelineModel进行预测 predicted = pipelineModel.transform(test_df) # 查看预测后的Schema,发现新增了3个字段 print(predicted.columns) predicted.select('url', 'features', 'rawprediction', 'probability', 'label', 'prediction').show(10)
cat_dist[idx].labels[i]) print("===========SplitData====================") train_df, test_df = df.randomSplit(env.split_prop) print("===========VectorAssembler====================") feature = df.columns[1:len(df.columns) - 1] assembler = VectorAssembler(inputCols=feature, outputCol="features") print("=============pipeline==================") model = LinearSVC(maxIter=5, regParam=0.01, labelCol=lable_name[0], featuresCol="features") pipeline = Pipeline(stages=[assembler, model]) pipeline.getStages() print("===========TaintingAndTesting====================") pipelineModel = pipeline.fit(train_df) predicted = pipelineModel.transform(test_df) print("===========PredictedAUC====================") evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol=lable_name[0], metricName="areaUnderROC") auc = evaluator.evaluate(predicted) print(auc) print("===========PredictedScore====================") Multi_evaluator = MulticlassClassificationEvaluator(labelCol=lable_name[0]) Accuracy = Multi_evaluator.evaluate(predicted,
def magic_loop3(pipelines, grid, train, test, cvfolds=3):
    """Cross-validate several pipelines sequentially; return the best CV model.

    Each pipeline is split at its ML predictor stage: everything before it is
    fitted once (the preprocessing is not affected by CV folds), and the
    predictor tail is tuned with CrossValidator over ``grid``. The candidate
    with the highest evaluator score on ``test`` wins.

    :param pipelines: list of pyspark.ml.Pipeline objects to try
    :param grid: list of ParamMaps (ParamGridBuilder output) for CrossValidator
    :param train: training DataFrame
    :param test: test DataFrame used to score each candidate
    :param cvfolds: number of cross-validation folds
    :return: the best fitted CrossValidatorModel, or None if all pipelines failed
    """
    best_score = 0.0  # assumes the evaluator metric is non-negative
    best_grid = None
    # Try every pipeline in the list, not just the first two.
    for pipe in pipelines:
        try:
            # Copy so the caller's pipeline object is not mutated.
            pipe = pipe.copy()
            stages = pipe.getStages()
            # Locate the ML engine (classifier or regressor) among the stages.
            predictor = [
                stage for stage in stages
                if "pyspark.ml.classification" in str(type(stage))
                or "pyspark.ml.regression" in str(type(stage))
            ][0]
            predictor_i = stages.index(predictor)
            stringer = [
                stage for stage in pipe.getStages()
                if "pyspark.ml.feature.StringIndexer" in str(type(stage))
            ][0]
            if DEBUG:
                print("pipeline:\n%s\n\n" % stages)
            if DEBUG:
                print("predictor=%s (index %s, type %s), stringer=%s (%s)\n" %
                      (predictor, stages.index(predictor), type(predictor),
                       stringer, type(stringer)))
            # Preprocessing stages are not sensitive to CV folds: fit them once.
            prepipe = Pipeline(stages=stages[0:(predictor_i)])
            if DEBUG:
                print("pre pipeline:\n%s\n\n" % prepipe.getStages())
            print("Starting fit on prepipe...")
            prepipem = prepipe.fit(train)
            print("Starting transform on prepipe...")
            train2 = prepipem.transform(train)
            test2 = prepipem.transform(test)
            # Only the ML engine itself is subject to cross-validation.
            postpipe = Pipeline(stages=stages[(predictor_i):len(stages)])
            if DEBUG:
                print("post pipeline:\n%s\n\n" % postpipe.getStages())
            gridcv = CrossValidator(
                estimator=postpipe,
                estimatorParamMaps=grid,
                evaluator=MulticlassClassificationEvaluator(
                    labelCol="DEPARTURE_DELAY"),
                numFolds=cvfolds)
            # Extract a readable predictor name, stripping the ugly
            # "<class '...'>" wrapper that type() produces.
            predictr = [
                str(type(stage)) for stage in pipe.getStages()
                if "pyspark.ml.classification" in str(type(stage))
                or "pyspark.ml.regression" in str(type(stage))
            ][0]
            predictr = re.sub(
                r"^<class 'pyspark\.ml\.(classification|regression)\.([a-zA-Z0-9]+)'>",
                "\\2", predictr)
            print("Starting fit on %s..." % predictr)
            gridcvm = gridcv.fit(train2)
            print("Starting transform on %s..." % predictr)
            preds = gridcvm.transform(test2)
            # Reuse the CV's evaluator to measure the error on the test set.
            ev = gridcvm.getEvaluator()
            metric = ev.getMetricName()
            print("Starting error calculation on %s..." % predictr)
            error = ev.evaluate(preds)
            print("Error %s: %f" % (metric, error))
            # BUGFIX: best_score was never updated, so every pipeline with a
            # positive score silently overwrote best_grid regardless of
            # whether it actually beat the previous best.
            if error > best_score:
                print("%s is the best model so far: %f (%s)" %
                      (predictr, error, metric))
                best_score = error
                best_grid = gridcvm
        except Exception as e:
            # Keep trying the remaining pipelines even if this one failed.
            print('Error during Magic Loop:', e)
            continue
    return best_grid
#parametros para el magic loop paramGrid = ParamGridBuilder() \ .addGrid(glr.family, ["Gaussian", "Poisson", "Tweedie"]) \ .addGrid(glr.maxIter, [1, 2, 3]) \ .addGrid(lr.maxIter, [1, 2, 3]) \ .addGrid(lr.elasticNetParam, [0.1,0.2,0.3]) \ .build() magic = magic_loop3(pipelines, paramGrid, train, test, 3) #obtengo el pipeline que devolvió el magic loop best_model = magic.getEstimator() #obtenemos el paso del clasificador best_estimator = best_model.getStages()[0] #guardo en una variable los parámetros más adecuados best_estimator_params = best_estimator.extractParamMap() #obtener el evaluador para medir errores ev0 = magic.getEvaluator() ################### #medición del error #impresión de los parámetros y el mejor modelo print( "%s (best model) Parameters: maxIter=%d, elasticNetParam=%f" % (re.sub( "^<class 'pyspark\.ml\.(classification|regression)\.([a-zA-Z0-9]+)'>", "\\2", (str)((type(best_estimator)))), best_estimator_params[best_estimator.maxIter], best_estimator_params[best_estimator.elasticNetParam])) #aquí probé como se veían los errores preds = magic.transform( Pipeline(stages=pipeline1.getStages()[0:6]).fit(test).transform(test)) print("Error %s: %f" % (ev0.getMetricName(), ev0.evaluate(preds)))
class pipemodeler:
    """Build a Spark ML pipeline from feature stages plus a classifier, train
    it (optionally with cross-validated hyper-parameter tuning), and compute
    binary-classification performance metrics.
    """

    def __init__(self, DF, featurestages, classifier, classifiergrid=None):
        """
        :param DF: input DataFrame with 'label' column
        :param featurestages: one stage or a list of feature-engineering stages
        :param classifier: the final classifier stage
        :param classifiergrid: optional param grid; when given, training uses
            5-fold cross validation instead of a single random split
        """
        self.DF = DF
        self.featurestages = featurestages
        self.classifier = classifier
        self.classifiergrid = classifiergrid

    def train(self):
        """Assemble the pipeline and fit it.

        Stores the fitted model in ``self.model`` and held-out predictions in
        ``self.predictions``.
        """
        print('Building stages...')
        stages = []
        # Accept a single stage as well as a list of stages.
        if not isinstance(self.featurestages, list):
            self.featurestages = [self.featurestages]
        stages += self.featurestages
        # Word2Vec can emit negative feature values, but NaiveBayes requires
        # non-negative features — rescale to [0.0, 1.0] in that case.
        if ('Word2Vec' in str(stages)) and ('NaiveBayes' in str(self.classifier)):
            print(
                'Word2Vec and NaiveBayes detected, scaling to nonnegative [0.0,1.0]'
            )
            stages[-1].setOutputCol('prefeatures')
            scaler = MinMaxScaler(inputCol='prefeatures', outputCol='features')
            stages = stages + [scaler]
        stages += [self.classifier]
        self.pipeline = Pipeline(stages=stages)
        print('Using the following stages: ' + str(self.pipeline.getStages()))
        print('Training model...')
        if self.classifiergrid is None:  # 'is None', not '== None'
            print('Training without a Parameter Grid...')
            dftrain, dftest = self.DF.randomSplit([0.80, 0.20])
            model = self.pipeline.fit(dftrain)
            self.predictions = model.transform(dftest)
            self.model = model
        else:
            print('Cross Validation Hyperparamter Tunning...')
            cv = CrossValidator(estimator=self.pipeline,
                                estimatorParamMaps=self.classifiergrid,
                                evaluator=BinaryClassificationEvaluator(),
                                parallelism=4,
                                numFolds=5)
            dftrain, dftest = self.DF.randomSplit([0.70, 0.30])
            model = cv.fit(dftrain)
            self.predictions = model.transform(dftest)
            self.model = model

    def performancerdd(self):
        """Compute metrics via the RDD-based MLlib evaluation classes."""
        self.calculator = 'RDDs'
        print('Calculating performance metrics using RDDs...')
        # BinaryClassificationMetrics expects (prediction, label) pairs.
        predictionRDD = self.predictions.select(
            ['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))
        binmetrics = BinaryClassificationMetrics(predictionRDD)
        metrics = MulticlassMetrics(predictionRDD)
        self.areaUnderROC = binmetrics.areaUnderROC
        self.areaUnderPR = binmetrics.areaUnderPR
        self.confusionMatrix = metrics.confusionMatrix().toArray()
        self.accuracy = metrics.accuracy
        # NOTE(review): label-free precision()/recall()/fMeasure() are
        # deprecated in newer PySpark; may need an explicit label argument.
        self.precision = metrics.precision()
        self.recall = metrics.recall()
        self.f1measure = metrics.fMeasure()
        self.falsePositive = metrics.falsePositiveRate(1.0)
        # falsePositiveRate(0.0) equals the false-negative rate of class 1.
        self.falseNegative = metrics.falsePositiveRate(0.0)

    def performance(self):
        """Compute metrics directly from DataFrame counts (plus one evaluator
        call for areaUnderROC), guarding every ratio against division by zero.
        """
        self.calculator = 'Nothing'
        print('Calculating performance metrics using nothing...')
        evaluator = BinaryClassificationEvaluator(
            rawPredictionCol="rawPrediction")
        self.areaUnderROC = evaluator.evaluate(self.predictions)
        preds = self.predictions
        # For 0/1 labels: label < prediction means predicted 1 on a true 0.
        fp = preds.filter(preds.label < preds.prediction).count()
        fn = preds.filter(preds.label > preds.prediction).count()
        tp = preds.filter(preds.label == 1.0).filter(
            preds.prediction == 1.0).count()
        tn = preds.filter(preds.label == 0.0).filter(
            preds.prediction == 0.0).count()
        total = fp + fn + tp + tn
        # Row/column order is [[tn, fn], [fp, tp]] (actual x predicted).
        self.confusionMatrix = [[tn, fn], [fp, tp]]
        self.accuracy = (tp + tn) / total
        if tp + fp:
            self.precision = tp / (tp + fp)
        else:
            self.precision = 0
        if tp + fn:
            self.recall = tp / (tp + fn)
        else:
            self.recall = 0
        if self.precision + self.recall:
            self.f1measure = 2 * self.precision * self.recall / (
                self.precision + self.recall)
        else:
            self.f1measure = 0
        if fp + tn:
            self.falsePositive = fp / (fp + tn)
        else:
            self.falsePositive = 0
        if fn + tp:
            self.falseNegative = fn / (fn + tp)
        else:
            self.falseNegative = 0

    def printperformance(self):
        """Print the metrics computed by performance()/performancerdd()."""
        print('Stages: ' + str(self.pipeline.getStages()))
        print('Performance calculated using ' + self.calculator)
        print('areaUnderROC = ' + str(self.areaUnderROC))
        # areaUnderPR is only set by performancerdd(), so it stays disabled:
        # print('areaUnderPR = ' + str(self.areaUnderPR))
        print('confusionMatrix:')
        print(self.confusionMatrix)
        print('accuracy = ' + str(self.accuracy))
        print('precision = ' + str(self.precision))
        print('recall = ' + str(self.recall))
        print('f1measure = ' + str(self.f1measure))
        print('falsePositive = ' + str(self.falsePositive))
        print('falseNegative = ' + str(self.falseNegative))
def func1():
    """Regression on the bike-sharing 'hour.csv' data with a Spark ML Pipeline.

    Builds a VectorAssembler -> VectorIndexer -> DecisionTreeRegressor
    pipeline, evaluates RMSE on a held-out split, then tunes the tree with
    TrainValidationSplit over a parameter grid.

    NOTE(review): relies on module-level ``sqlContext`` and ``Path``.
    :return: None (results are printed)
    """
    hour_df = sqlContext.read.format("csv").option(
        "header", "true").load(Path + "hour.csv")
    print("count", hour_df.count())
    print("columns:", hour_df.columns)
    # Drop columns that are not needed for training.
    hour_df = hour_df.drop("instant").drop("dteday").drop("yr").drop(
        "casual").drop("registered")
    print("schema:", hour_df.printSchema())
    # Cast every column to double.
    hour_df = hour_df.select([
        col(column).cast("double").alias(column) for column in hour_df.columns
    ])
    print("after cast, hour_df.printSchema():", hour_df.printSchema())
    print("first 3 rows:", hour_df.show(3))
    # Split into train_df and test_df with a 0.7:0.3 ratio.
    train_df, test_df = hour_df.randomSplit([0.7, 0.3])
    train_df.cache()
    test_df.cache()
    # Feature columns: everything except the last column ("cnt", the label).
    featureCols = hour_df.columns[:-1]
    print("featureCols:", featureCols)
    # Build the pipeline.
    vectorAssembler = VectorAssembler(inputCols=featureCols,
                                      outputCol="aFeatures")
    vectorIndexer = VectorIndexer(inputCol="aFeatures",
                                  outputCol="features",
                                  maxCategories=24)
    dt = DecisionTreeRegressor(labelCol="cnt", featuresCol="features")
    dt_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, dt])
    print("pipeline stages:", dt_pipeline.getStages())
    # Train.
    dt_pipelineModel = dt_pipeline.fit(dataset=train_df)
    print("trained model:", dt_pipelineModel.stages[2].toDebugString[:500])
    # Predict with transform.
    predicted = dt_pipelineModel.transform(test_df)
    print("new columns:", predicted.columns)
    print("prediction result:", predicted.show(2))
    # Evaluate the model.
    evaluator = RegressionEvaluator(labelCol="cnt",
                                    predictionCol="prediction",
                                    metricName="rmse")
    predicted_df = dt_pipelineModel.transform(test_df)
    rmse = evaluator.evaluate(predicted_df)
    print("rmse:", rmse)
    # Find the best model with TrainValidationSplit.
    # BUGFIX: the original grid tuned dt.impurity with ["gini", "entory"],
    # but DecisionTreeRegressor only supports impurity="variance", so that
    # grid would fail at fit time — tune only maxDepth and maxBins.
    paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 10, 15]).addGrid(
        dt.maxBins, [10, 15, 20]).build()
    # trainRatio=0.8: data is split 8:2 into training and validation sets.
    tvs = TrainValidationSplit(estimator=dt,
                               evaluator=evaluator,
                               estimatorParamMaps=paramGrid,
                               trainRatio=0.8)
    tvs_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, tvs])
    tvs_pipelineModel = tvs_pipeline.fit(dataset=train_df)
    bestmodel = tvs_pipelineModel.stages[2].bestModel
    print("bestModel:", bestmodel.toDebugString[:500])
    # Predict with the best model.
    # BUGFIX: must transform with the fitted PipelineModel — the unfitted
    # Pipeline (an Estimator) has no transform() method.
    predictions = tvs_pipelineModel.transform(test_df)
    rmse2 = evaluator.evaluate(predictions)
    print(rmse2)
en_coeffs_df.query('weight == 0.0').shape[0]/en_coeffs_df.shape[0] # In[49]: en_coeffs_df.query('weight == 0.0').head(15) # In[50]: from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit # In[51]: en_lr_estimator.getStages() # In[52]: grid = ParamGridBuilder(). addGrid(en_lr.regParam, [0., 0.01, 0.02]). addGrid(en_lr.elasticNetParam, [0., 0.2, 0.4]). build() # In[53]: grid # In[54]: all_models = []
def func1():
    """Multi-class classification on the covtype data with a Spark ML Pipeline.

    Loads the raw CSV text into a DataFrame, trains a VectorAssembler ->
    DecisionTreeClassifier pipeline, evaluates accuracy, then tunes the tree
    with TrainValidationSplit over a parameter grid.

    NOTE(review): relies on module-level ``sc``, ``sqlContext`` and ``Path``.
    :return: None (results are printed)
    """
    # Local import keeps the fix self-contained; the file already uses
    # pyspark.sql.types (StructField, StringType).
    from pyspark.sql.types import StructType

    rawData = sc.textFile(Path + "covtype.data", minPartitions=40)
    lines = rawData.map(lambda x: x.split(","))
    print("lines.count()::", lines.count())
    fieldNum = len(lines.first())
    print("field count:", fieldNum)
    # BUGFIX: dataType must be an instance (StringType()), not the class.
    fields = [
        StructField(name="f" + str(i), dataType=StringType(), nullable=True)
        for i in range(fieldNum)
    ]
    # BUGFIX: a schema is a StructType of fields, not StringType(fields).
    schema = StructType(fields)
    covtype_df = sqlContext.createDataFrame(data=lines, schema=schema)
    print("covtype_df.columns::", covtype_df.columns)
    print("covtype_df.printSchema()::", covtype_df.printSchema())
    # Cast every column to double.
    covtype_df = covtype_df.select([
        col(column).cast("double").alias(column)
        for column in covtype_df.columns
    ])
    print("after cast, covtype_df.printSchema():", covtype_df.printSchema())
    # Feature columns are the first 54 fields.
    featureCols = covtype_df.columns[:54]
    print("featureCols:", featureCols)
    # Field 54 is the label with values 1-7; training needs labels starting
    # at 0, so subtract 1 to shift the range to 0-6.
    covtype_df = covtype_df.withColumn(colName="label",
                                       col=covtype_df["f54"] - 1).drop("f54")
    print("first row:", covtype_df.show(1))
    # Split into train_df and test_df with a 0.7:0.3 ratio.
    train_df, test_df = covtype_df.randomSplit([0.7, 0.3])
    train_df.cache()
    test_df.cache()
    # Build the pipeline.
    vectorAssembler = VectorAssembler(inputCols=featureCols,
                                      outputCol="features")
    dt = DecisionTreeClassifier(labelCol="label",
                                featuresCol="features",
                                maxDepth=5,
                                maxBins=20)
    dt_pipeline = Pipeline(stages=[vectorAssembler, dt])
    print("pipeline stages:", dt_pipeline.getStages())
    # Train.
    pipelineModel = dt_pipeline.fit(dataset=train_df)
    print("trained model:", pipelineModel.stages[1].toDebugString[:500])
    # Predict with transform.
    predicted = pipelineModel.transform(test_df)
    print("new columns:", predicted.columns)
    print("prediction result:", predicted.show(2))
    # Evaluate the model.
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predicted)
    print("accuracy::", accuracy)
    # Find the best model with TrainValidationSplit.
    # BUGFIX: "entory" is not a valid impurity; only "gini" and "entropy" are.
    paramGrid = ParamGridBuilder().addGrid(
        dt.impurity, ["gini", "entropy"]).addGrid(dt.maxDepth, [5, 10, 15]) \
        .addGrid(dt.maxBins, [10, 15, 20]).build()
    # trainRatio=0.8: data is split 8:2 into training and validation sets.
    tvs = TrainValidationSplit(estimator=dt,
                               evaluator=evaluator,
                               estimatorParamMaps=paramGrid,
                               trainRatio=0.8)
    tvs_pipeline = Pipeline(stages=[vectorAssembler, tvs])
    pipelineModel = tvs_pipeline.fit(dataset=train_df)
    bestmodel = pipelineModel.stages[1].bestModel
    print("bestModel:", bestmodel.toDebugString[:500])
    # Predict with the best model.
    # BUGFIX: must transform with the fitted PipelineModel — the unfitted
    # Pipeline (an Estimator) has no transform() method.
    predictions = pipelineModel.transform(test_df)
    result = predictions.withColumnRenamed("f0", "海拔") \
        .withColumnRenamed("f1", "方位").withColumnRenamed("f2", "斜率") \
        .withColumnRenamed("f3", "垂直距离").withColumnRenamed("f4", "水平距离") \
        .withColumnRenamed("f5", "阴影")
    result.select("海拔", "方位", "斜率", "垂直距离", "水平距离", "阴影", "label",
                  "prediction").show(10)
    accuracy2 = evaluator.evaluate(predictions)
    print("accuracy2:", accuracy2)
preprocessed_data.cache() # split data to train/test 80/20 train_preprocessed_data = preprocessed_data.randomSplit([.8,.2])[0] train_preprocessed_data.cache() #model gbmodel = GBTRegressor(featuresCol="features",labelCol=target) # model tuning process evaluator =RegressionEvaluator(labelCol=target) paramGrid = (ParamGridBuilder() .addGrid(gbmodel.maxDepth, [2, 4, 6]) .addGrid(gbmodel.maxBins, [20, 60]) .addGrid(gbmodel.maxIter, [10, 20]) .addGrid(gbmodel.minInfoGain, [0.0, 0.05]) .build()) cv = CrossValidator(estimator=gbmodel, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5) pipeline_cv = cv.fit(train_preprocessed_data) # final pipeline to deploy is the preprocessing steps + best model (best hyperparameters) final_pipeline = Pipeline(stages=[*preprocessing_pipeline.getStages(), pipeline_cv.bestModel]) #train on all data and save model to disk final_model = final_pipeline.fit(scorecard_data_cleaned) final_model.write().overwrite().save(web_app_model_path) sc.stop()