def cv_metrics(model: CrossValidatorModel, output_dir: Path = None) -> dict:
    num_folds = model.getNumFolds()
    evaluator_metric = model.getEvaluator().getMetricName()
    param_maps = model.getEstimatorParamMaps()
    cv_avg_metrics = model.avgMetrics
    grid = []
    for p, m in zip(param_maps, cv_avg_metrics):
        grid_item = {
            "params": {str(param): value for param, value in p.items()},
            evaluator_metric: m,
        }
        grid.append(grid_item)
    metrics = {
        "cross_validation_metrics": {
            "num_folds": num_folds,
            "evaluator_metric": evaluator_metric,
            "grid_search": grid,
        }
    }
    if output_dir is not None:
        save_metrics(metrics, output_dir / "cv_metrics.json")
    return metrics

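A minimal usage sketch for cv_metrics, assuming a fitted CrossValidator; the estimator, grid, and output directory below are illustrative, and save_metrics is assumed to be a small JSON-writing helper defined elsewhere, not part of any library API.

from pathlib import Path
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

def cv_metrics_example(train_df):
    # Hypothetical illustration: fit a small grid, then summarise the averaged fold metrics.
    lr = LogisticRegression(featuresCol="features", labelCol="label")
    grid = ParamGridBuilder().addGrid(lr.regParam, [0.0, 0.1]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                        evaluator=BinaryClassificationEvaluator(), numFolds=3)
    cvModel = cv.fit(train_df)
    # Each grid_search entry looks like {"params": {...}, "areaUnderROC": <avg metric>}.
    return cv_metrics(cvModel, output_dir=Path("artifacts"))
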
def _run_test_save_load_trained_model(self, LogisticRegressionCls, LogisticRegressionModelCls):
    # This tests saving and loading the trained model only.
    # Save/load for CrossValidator will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [
            (Vectors.dense([0.0]), 0.0),
            (Vectors.dense([0.4]), 1.0),
            (Vectors.dense([0.5]), 0.0),
            (Vectors.dense([0.6]), 1.0),
            (Vectors.dense([1.0]), 1.0),
        ]
        * 10,
        ["features", "label"],
    )
    lr = LogisticRegressionCls()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(
        estimator=lr,
        estimatorParamMaps=grid,
        evaluator=evaluator,
        collectSubModels=True,
        numFolds=4,
        seed=42,
    )
    cvModel = cv.fit(dataset)
    lrModel = cvModel.bestModel

    lrModelPath = temp_path + "/lrModel"
    lrModel.save(lrModelPath)
    loadedLrModel = LogisticRegressionModelCls.load(lrModelPath)
    self.assertEqual(loadedLrModel.uid, lrModel.uid)
    self.assertEqual(loadedLrModel.intercept, lrModel.intercept)

    # SPARK-32092: Saving and then loading CrossValidatorModel should not change the params
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedCvModel = CrossValidatorModel.load(cvModelPath)
    for param in [
        lambda x: x.getNumFolds(),
        lambda x: x.getFoldCol(),
        lambda x: x.getSeed(),
        lambda x: len(x.subModels),
    ]:
        self.assertEqual(param(cvModel), param(loadedCvModel))

    self.assertTrue(all(loadedCvModel.isSet(param) for param in loadedCvModel.params))

    # mimic old version CrossValidatorModel (without stdMetrics attribute)
    # test loading model backwards compatibility
    cvModel2 = cvModel.copy()
    cvModel2.stdMetrics = []
    cvModelPath2 = temp_path + "/cvModel2"
    cvModel2.save(cvModelPath2)
    loadedCvModel2 = CrossValidatorModel.load(cvModelPath2)
    assert loadedCvModel2.stdMetrics == []

def fit(self, dataset):
    java_estimator, java_epms, java_evaluator = self._to_java_impl()
    self._java_obj.setEstimator(java_estimator)
    self._java_obj.setEvaluator(java_evaluator)
    self._java_obj.setEstimatorParamMaps(java_epms)
    cv_java_model = self._java_obj.fit(dataset._jdf)
    cv_py_model = CrossValidatorModel._from_java(cv_java_model)
    xgbModel = self.getEstimator()._create_model(cv_java_model.bestModel())
    # return CrossValidatorModel wrapping the Python best model
    return CrossValidatorModel(xgbModel, cv_py_model.avgMetrics, cv_py_model.subModels)

def test_expose_sub_models(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [
            (Vectors.dense([0.0]), 0.0),
            (Vectors.dense([0.4]), 1.0),
            (Vectors.dense([0.5]), 0.0),
            (Vectors.dense([0.6]), 1.0),
            (Vectors.dense([1.0]), 1.0),
        ]
        * 10,
        ["features", "label"],
    )
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    numFolds = 3
    cv = CrossValidator(
        estimator=lr,
        estimatorParamMaps=grid,
        evaluator=evaluator,
        numFolds=numFolds,
        collectSubModels=True,
    )

    def checkSubModels(subModels):
        self.assertEqual(len(subModels), numFolds)
        for i in range(numFolds):
            self.assertEqual(len(subModels[i]), len(grid))

    cvModel = cv.fit(dataset)
    checkSubModels(cvModel.subModels)

    # Test the default value for option "persistSubModel" to be "true"
    testSubPath = temp_path + "/testCrossValidatorSubModels"
    savingPathWithSubModels = testSubPath + "cvModel3"
    cvModel.save(savingPathWithSubModels)
    cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
    checkSubModels(cvModel3.subModels)
    cvModel4 = cvModel3.copy()
    checkSubModels(cvModel4.subModels)

    savingPathWithoutSubModels = testSubPath + "cvModel2"
    cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
    cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
    self.assertEqual(cvModel2.subModels, None)

    for i in range(numFolds):
        for j in range(len(grid)):
            self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid)

def DecisionTree(data):
    path = 'modelo_DecisionTree/modelDecisionTree'
    DecisionTree = CrossValidatorModel.load(path)
    predictions = DecisionTree.transform(data)
    print("DECISION TREE")
    predictions.select('Email', 'Identificador', 'Burnout_Antes', 'prediction', 'probability').show(truncate=False)

def _fit(self, dataset):
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    nFolds = self.getOrDefault(self.numFolds)
    seed = self.getOrDefault(self.seed)
    metrics = [0.0] * numModels

    stratified_data = self.stratify_data(dataset)

    for i in range(nFolds):
        # Hold out fold i for validation and train on the union of the remaining folds
        train_arr = [x for j, x in enumerate(stratified_data) if j != i]
        train = reduce((lambda x, y: x.unionAll(y)), train_arr)
        validation = stratified_data[i]

        models = est.fit(train, epm)
        for j in range(numModels):
            model = models[j]
            metric = eva.evaluate(model.transform(validation, epm[j]))
            metrics[j] += metric / nFolds

    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(CrossValidatorModel(bestModel, metrics))

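The stratify_data helper referenced above is not shown. A plausible sketch of one variant, purely an assumption that matches the list-of-folds usage in this _fit, is to rank rows randomly within each label value, assign fold ids round-robin, and return one DataFrame per fold:

from pyspark.sql import DataFrame, functions as F
from pyspark.sql.window import Window

def stratify_data_sketch(dataset: DataFrame, nFolds: int, labelCol: str = "label", seed: int = 42):
    # Hypothetical helper: each fold keeps roughly the same label distribution.
    w = Window.partitionBy(labelCol).orderBy(F.rand(seed))
    with_fold = dataset.withColumn("_fold", (F.row_number().over(w) - 1) % nFolds)
    return [with_fold.filter(F.col("_fold") == i).drop("_fold") for i in range(nFolds)]
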
def regression(df, column, name):
    try:
        model = CrossValidatorModel.load("data/{}.model".format(name))
    except Exception:
        LR = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
        if name[3] == 'P':
            LR.setThreshold(0.2)
        else:
            LR.setThreshold(0.25)
        eval = BinaryClassificationEvaluator()
        paramGrid = ParamGridBuilder().addGrid(LR.regParam, [1.0]).build()
        crossval = CrossValidator(estimator=LR,
                                  evaluator=eval,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=5)
        # train, test = df.select('features', func.col(column).alias("label")).randomSplit([0.5, 0.5])
        print("Training '{}' classifier... Please wait".format(name))
        model = crossval.fit(df.select("*", func.col(column).alias("label")))
        model.save("data/{}.model".format(name))
        # df_test = model.transform(df)
        # df_test.filter(df_test.prediction == 1).show()
    return model

def test_save_load_simple_estimator(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()

    # test save/load of CrossValidator
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
    self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

    # test save/load of CrossValidatorModel
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)

def load_model(model_path):
    """Load a pretrained model."""
    model = CrossValidatorModel.load(model_path)
    return model

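A short, hypothetical call site for the loader above; the model path and the selected column names are assumptions, not taken from the original code.

def score_with_saved_model(input_df, model_path="models/cv_model"):
    # Hypothetical usage: load a persisted CrossValidatorModel and score a DataFrame
    # that carries the same feature columns the model was trained on.
    model = load_model(model_path)
    return model.transform(input_df).select("prediction", "probability")
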
def _fit(self, dataset):
    est = self.estimator
    epm = self.estimatorParamMaps
    numModels = len(epm)
    eva = self.evaluator
    metricName = eva.getMetricName()
    nFolds = self.numFolds
    metrics = [0.0] * numModels

    stratified_data = self.stratify_data(dataset)

    for i in range(nFolds):
        print(f"Initiating Training for fold {i + 1}")
        train = stratified_data.filter(stratified_data["bucket_fold"] != i)
        validation = stratified_data.filter(stratified_data["bucket_fold"] == i)

        models = est.fit(train, epm)
        for j in range(numModels):
            model = models[j]
            metric = eva.evaluate(model.transform(validation, epm[j]))
            metrics[j] += metric / nFolds

    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(CrossValidatorModel(bestModel, metrics))

def test_save_load_simple_estimator(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()

    # test save/load of CrossValidator
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
    self.assert_param_maps_equal(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

    # test save/load of CrossValidatorModel
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)

def _fit(self, dataset):
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    metricName = eva.getMetricName()
    nFolds = self.getOrDefault(self.numFolds)
    seed = self.getOrDefault(self.seed)
    h = 1.0 / nFolds
    randCol = self.uid + "_rand"
    df = dataset.select("*", rand(seed).alias(randCol))
    metrics = [0.0] * numModels

    for i in range(nFolds):
        foldNum = i + 1
        print("Comparing models on fold %d" % foldNum)
        validateLB = i * h
        validateUB = (i + 1) * h
        condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
        validation = df.filter(condition)
        train = df.filter(~condition)

        for j in range(numModels):
            paramMap = epm[j]
            model = est.fit(train, paramMap)
            # TODO: duplicate evaluator to take extra params from input
            metric = eva.evaluate(model.transform(validation, paramMap))
            metrics[j] += metric
            avgSoFar = metrics[j] / foldNum
            print("params: %s\t%s: %f\tavg: %f" %
                  ({param.name: val for (param, val) in paramMap.items()},
                   metricName, metric, avgSoFar))
            # a, b, c are assumed to be module-level lists that accumulate
            # (param name, value, metric) triples across folds; they are not defined here.
            for (param, val) in paramMap.items():
                a.append(param.name)
                b.append(val)
                c.append(metric)

    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    bestParams = epm[bestIndex]
    bestModel = est.fit(dataset, bestParams)
    avgMetrics = [m / nFolds for m in metrics]
    bestAvg = avgMetrics[bestIndex]
    print("Best model:\nparams: %s\t%s: %f" %
          ({param.name: val for (param, val) in bestParams.items()},
           metricName, bestAvg))
    return self._copyValues(CrossValidatorModel(bestModel, avgMetrics))

def process(spark, input_path, output_file):
    ads_test = spark.read.parquet(input_path)
    # Load the model, score the data, and write the result
    output = CrossValidatorModel.load('spark_ml_model').transform(ads_test)
    cols = ['ad_id', 'prediction']
    output = output.select(*cols)
    output.write.option('header', 'true').csv(str(output_file))

def LinearEvaluation(data):
    path = 'modelo_LogisticRegression/modelLogisticRegression'
    lrModel = CrossValidatorModel.load(path)
    # print(lrModel.coefficientMatrix)
    predictions = lrModel.transform(data)
    # VERDADERO (true) = 0, FALSO (false) = 1
    print("LINEAR EVALUATION")
    predictions.select('Email', 'Identificador', 'Burnout_Antes', 'prediction', 'probability').show(truncate=False)

def main(context):
    """Main function takes a Spark SQL context."""
    comments_df = context.read.parquet("comments.parquet")
    submissions_df = context.read.parquet("submissions.parquet")
    labeled_data_df = context.read.parquet("labeled_data.parquet")

    if path.exists("df_label.parquet"):
        labeled_df = context.read.parquet("df_label.parquet")
        comments_df = cleanedCommentDF(comments_df)
    else:
        labeled_df = createLabeledDF(comments_df, labeled_data_df)
        comments_df = cleanedCommentsDF(comments_df)

    if path.exists("cvModel"):
        cvModel = CountVectorizerModel.load("cvModel")
        posModel = CrossValidatorModel.load("pos.model")
        negModel = CrossValidatorModel.load("neg.model")
    else:
        cvModel, posModel, negModel = train(labeled_df)

    # the "final join" without actually joining!
    output = cvModel.transform(comments_df)
    output = output.drop('score')
    output = output.drop('ngrams_combined')
    output = output.drop('link_id_cleaned')

    posResult = posModel.transform(output)
    posResult = posResult.drop('rawPrediction')
    posResult = posResult.drop('prediction')
    posResult = posResult.withColumnRenamed('probability', 'pos_prob')

    fullResult = negModel.transform(posResult)
    fullResult = fullResult.withColumnRenamed('probability', 'neg_prob')
    fullResult = fullResult.drop('rawPrediction')
    fullResult = fullResult.drop('prediction')
    fullResult = fullResult.withColumn(
        'neg', when(get_probability_udf(fullResult.neg_prob) > 0.25, 1).otherwise(0))
    fullResult = fullResult.withColumn(
        'pos', when(get_probability_udf(fullResult.pos_prob) > 0.2, 1).otherwise(0))

    fullResult.write.parquet("resulting_df.parquet")
    # fullResult_df = context.read.parquet("resulting_df.parquet")
    print(fullResult.count())

def task9(context, entire_file, model):
    posModel = CrossValidatorModel.load("project2/pos.model")
    negModel = CrossValidatorModel.load("project2/neg.model")

    data = task45(context, entire_file)
    rr_data = model.transform(data)
    rr_data.createOrReplaceTempView("Data")
    new_data = context.sql(
        "SELECT * FROM Data WHERE Data.body NOT LIKE '%>%' OR Data.body NOT LIKE '%/s%'"
    )

    posResult = posModel.transform(new_data)
    posResult.createOrReplaceTempView("Pos")
    new_data_with_pos = context.sql(
        "SELECT author_flair_text, created_utc, submissionsScore, commentsScore, title, id, body, grams, count_vectors, probability as prob_pos FROM Pos"
    )

    result = negModel.transform(new_data_with_pos)
    result.createOrReplaceTempView("Result")
    formatted_result = context.sql(
        "SELECT author_flair_text, created_utc, title, submissionsScore, commentsScore, id, body, grams, count_vectors, prob_pos, probability as prob_neg FROM Result"
    )

    # Used this link to get the first element in the probability column
    # https://stackoverflow.com/questions/44425159/access-element-of-a-vector-in-a-spark-dataframe-logistic-regression-probability?noredirect=1&lq=1
    firstelement = udf(lambda v: float(v[1]), FloatType())
    pos_udf = udf(getPosProbs, IntegerType())
    neg_udf = udf(getNegProbs, IntegerType())

    res = formatted_result.select(firstelement('prob_neg'), firstelement('prob_pos'),
                                  'author_flair_text', 'created_utc', 'title', 'id',
                                  'body', 'count_vectors', 'commentsScore',
                                  'submissionsScore')
    nResult = res.withColumn("pos", pos_udf(res["<lambda>(prob_pos)"]))
    new_result = nResult.withColumn("neg", neg_udf(nResult["<lambda>(prob_neg)"]))
    new_result.createOrReplaceTempView("NewResult")
    actual_result = context.sql(
        "SELECT author_flair_text, created_utc, title, id, body, count_vectors, commentsScore, submissionsScore, pos, neg FROM NewResult"
    )
    # new_data.show(n=10)
    return actual_result

def test_expose_sub_models(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    numFolds = 3
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                        numFolds=numFolds, collectSubModels=True)

    def checkSubModels(subModels):
        self.assertEqual(len(subModels), numFolds)
        for i in range(numFolds):
            self.assertEqual(len(subModels[i]), len(grid))

    cvModel = cv.fit(dataset)
    checkSubModels(cvModel.subModels)

    # Test the default value for option "persistSubModel" to be "true"
    testSubPath = temp_path + "/testCrossValidatorSubModels"
    savingPathWithSubModels = testSubPath + "cvModel3"
    cvModel.save(savingPathWithSubModels)
    cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
    checkSubModels(cvModel3.subModels)
    cvModel4 = cvModel3.copy()
    checkSubModels(cvModel4.subModels)

    savingPathWithoutSubModels = testSubPath + "cvModel2"
    cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
    cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
    self.assertEqual(cvModel2.subModels, None)

    for i in range(numFolds):
        for j in range(len(grid)):
            self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid)

def main(sqlContext):
    # Check if the parquet file has already been written
    if not os.path.exists("superbowl_comments.parquet"):
        write_parquet(sqlContext)

    # Read the parquets
    comments = sqlContext.read.parquet("superbowl_comments.parquet")
    comments.registerTempTable("commentsTable")

    # Read the labels csv
    labels = sqlContext.read.format('csv').options(header='true', inferSchema='true').load("labels.csv")
    labels.registerTempTable("labelsTable")

    # Create Dataframe to Train the Model
    modelDataframe = sqlContext.sql("SELECT commentsTable.comments_id AS id, commentsTable.comments_body AS body, commentsTable.comments_author AS author, commentsTable.comments_created_utc AS created_utc, commentsTable.comments_subreddit_id AS subreddit_id, commentsTable.comments_link_id AS link_id, commentsTable.comments_parent_id AS parent_id, commentsTable.comments_score AS score, commentsTable.comments_controversiality AS controversiality, commentsTable.comments_gilded AS gilded FROM commentsTable INNER JOIN labelsTable ON commentsTable.comments_id = labelsTable.label_id")
    modelDataframe = create_dataframe(sqlContext, modelDataframe)

    # Fit the CountVectorizer model
    if not os.path.exists("models/cvModel"):
        train_cv_model(modelDataframe)

    # Use model to transform the data
    modelDataframe = transform_model(sqlContext, modelDataframe)
    modelDataframe.registerTempTable("modelDataframeTable")
    modelDataframe = sqlContext.sql("SELECT modelDataframeTable.*, IF(labelsTable.label=1, 1, 0) AS pos_label, IF(labelsTable.label=-1, 1, 0) AS neg_label FROM modelDataframeTable INNER JOIN labelsTable ON modelDataframeTable.id = labelsTable.label_id")

    if not os.path.exists("models/negModel") or not os.path.exists("models/posModel"):
        create_models(sqlContext, modelDataframe)

    # Load the positive and negative models back in
    posModel = CrossValidatorModel.load("models/posModel")
    negModel = CrossValidatorModel.load("models/negModel")

    if not os.path.exists("fullDataframe.parquet"):
        create_fullDataframe(sqlContext, comments)

    # Load the full dataframe back in
    fullDataframe = sqlContext.read.parquet("fullDataframe.parquet")
    fullDataframe.registerTempTable("fullDataframeTable")

    # Get rid of comments that are sarcastic or removed
    fullDataframe = sqlContext.sql("SELECT * FROM fullDataframeTable WHERE fullDataframeTable.body NOT LIKE '%/s%' AND fullDataframeTable.body NOT LIKE '>%' AND fullDataframeTable.body NOT LIKE '%[removed]%'")

def RandomForest(data):
    path = 'modelo_RandomForest/modelRandomForest'
    randomModel = CrossValidatorModel.load(path)
    predictions = randomModel.transform(data)
    prediccion = predictions.select('prediction', 'probability').rdd.flatMap(lambda x: x).collect()
    if prediccion[0] == 1.0:
        prediccionLabel = 'FALSO'
    else:
        prediccionLabel = 'VERDADERO'
    return prediccionLabel, prediccion[1][0] * 100

def _fit(self, dataset):
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    nFolds = self.getOrDefault(self.numFolds)
    seed = self.getOrDefault(self.seed)
    h = 1.0 / nFolds
    randCol = self.uid + "_rand"
    df = dataset.select("*", F.rand(seed).alias(randCol))
    metrics = np.zeros((numModels, nFolds))

    pool = ThreadPool(processes=min(self.getParallelism(), numModels))
    subModels = None
    collectSubModelsParam = self.getCollectSubModels()
    if collectSubModelsParam:
        subModels = [[None for j in range(numModels)] for i in range(nFolds)]

    for i in range(nFolds):
        if self.sequentialIndex:
            # TODO: pass a column name to base the split on; make sure the split
            # conforms to sklearn norms, e.g.
            #   idx = [1, 2, 3, 4]
            #   training.where(~col("id").isin(idx)).show()
            pass
        else:
            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
            validation = df.filter(condition).cache()
            train = df.filter(~condition).cache()

        tasks = self._parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam)
        for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
            metrics[j, i] = metric
            if collectSubModelsParam:
                subModels[i][j] = subModel

        validation.unpersist()
        train.unpersist()

    avgMetrics = np.mean(metrics, axis=1)
    if eva.isLargerBetter():
        bestIndex = np.argmax(avgMetrics)
    else:
        bestIndex = np.argmin(avgMetrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    # Note: unlike the stock CrossValidator, this returns (model, per-fold metric matrix).
    return self._copyValues(CrossValidatorModel(bestModel, avgMetrics.tolist(), subModels)), metrics

def LinearEvaluation(data):
    path = 'modelo_LogisticRegression/modelLogisticRegression'
    lrModel = CrossValidatorModel.load(path)
    predictions = lrModel.transform(data)
    # VERDADERO (true) = 0, FALSO (false) = 1
    prediccion = predictions.select('prediction', 'probability').rdd.flatMap(lambda x: x).collect()
    if prediccion[0] == 1.0:
        prediccionLabel = 'FALSO'
    else:
        prediccionLabel = 'VERDADERO'
    return prediccionLabel, prediccion[1][0] * 100

def predict(self, mysqldetails, method, churn_data_semifinal1):
    X = churn_data_semifinal1
    if method.lower() == 'logistic_regression':
        params_res = CrossValidatorModel.load('./' + mysqldetails + '/model_logistic')
    elif method.lower() == 'naive_bayes':
        with open('./' + mysqldetails + '/model_naivebayes.pkl', 'rb') as f:
            params_res = pickle.load(f)
    # elif method.lower() == 'xgboost':
    #     params_res = CrossValidatorModel.load('./model_xgboost')
    elif method.lower() == 'svm':
        params_res = CrossValidatorModel.load('./' + mysqldetails + '/model_svm')
    elif method.lower() == 'lightgbm':
        params_res = CrossValidatorModel.load('./' + mysqldetails + '/model_lightgbm')
    predict_test = params_res.transform(X)
    return predict_test.select('mobile', 'prediction')

def DecisionTree(data):
    path = 'modelo_DecisionTree/modelDecisionTree'
    DecisionTree = CrossValidatorModel.load(path)
    predictions = DecisionTree.transform(data)
    prediccion = predictions.select('prediction', 'probability').rdd.flatMap(lambda x: x).collect()
    print(prediccion[0])
    if prediccion[0] == 1.0:
        prediccionLabel = 'FALSO'
    else:
        prediccionLabel = 'VERDADERO'
    return prediccionLabel, prediccion[1][0] * 100

def task9(task6model):
    querytask9_0 = spark.sql(
        "SELECT id,comment_timestamp,title,state,comment_body FROM task8_table WHERE comment_body NOT LIKE '>%' AND comment_body NOT LIKE '%/s%'"
    )
    querytask9_0.write.saveAsTable("task9_table1")
    querytask9_1 = spark.sql(
        "SELECT id, connect_all_string(sanitize(comment_body)) AS n_grams, comment_timestamp,title,state,comment_body FROM task9_table1"
    )
    querytask9_2 = querytask9_1.select(
        split(col("n_grams"), r",\s*").alias("n_grams"),
        col("id"), col("comment_timestamp"), col("title"), col("state"), col("comment_body"))

    task9df = task6model.transform(querytask9_2)
    task9df.printSchema()
    task9df = task9df.write.saveAsTable("task9_table2")
    querytask9_3 = spark.sql(
        "SELECT id, n_grams, comment_timestamp,title,state,comment_body, features, features AS features_backup FROM task9_table2"
    )

    model_pos = CrossValidatorModel.load("www/pos.model")
    model_neg = CrossValidatorModel.load("www/neg.model")

    pos_ans = model_pos.transform(querytask9_3).write.saveAsTable("pos_table")
    task9df_withPos = spark.sql(
        "SELECT id,comment_timestamp,title,state,comment_body,prediction AS pos, features_backup AS features, probability AS pos_probability FROM pos_table"
    )
    task9df_withPos.show()

    neg_ans = model_neg.transform(task9df_withPos).write.saveAsTable("neg_table")
    task9result = spark.sql(
        "SELECT id,comment_timestamp,title,state,comment_body, pos , prediction AS neg FROM neg_table"
    )

    task9result.write.parquet("task9result_parquet")  # store parquet
    final_task9result = spark.read.parquet("task9result_parquet")
    final_task9result.write.saveAsTable("task9_table")
    spark.sql("SELECT * FROM task9_table").show()

def _fit(self, dataset):
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    nFolds = self.getOrDefault(self.numFolds)
    seed = self.getOrDefault(self.seed)
    metrics = [0.0] * numModels

    # Use a rolling window instead of random folds
    rowNumCol = self.uid + "_rownum"
    w = Window().orderBy(lit('A'))  # Dummy window to create row number
    df = dataset.select("*", row_number().over(w).alias(rowNumCol))
    h = df.count() / (nFolds + 1)

    pool = ThreadPool(processes=self.getParallelism())
    subModels = None
    collectSubModelsParam = self.getCollectSubModels()
    if collectSubModelsParam:
        subModels = [[None for j in range(numModels)] for i in range(nFolds)]

    for i in range(nFolds):
        # Get rolling (increasing) window
        validateLB = (i + 1) * h
        validateUB = (i + 2) * h
        validation = df.filter((df[rowNumCol] >= validateLB) & (df[rowNumCol] < validateUB)).cache()
        train = df.filter(df[rowNumCol] < validateLB).cache()

        tasks = _parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam)
        for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
            metrics[j] += (metric / nFolds)
            if collectSubModelsParam:
                subModels[i][j] = subModel

        validation.unpersist()
        train.unpersist()

    # The evaluator's metric is assumed to be larger-is-better here;
    # the isLargerBetter() branch is intentionally bypassed.
    bestIndex = np.argmax(metrics)
    # if eva.isLargerBetter():
    #     bestIndex = np.argmax(metrics)
    # else:
    #     bestIndex = np.argmin(metrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(CrossValidatorModel(bestModel, metrics, subModels))

def _fit(self, dataset):
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    folds = dataset.select('fold').distinct().collect()
    nFolds = len(folds)
    metrics = [0.0] * numModels

    pool = ThreadPool(processes=min(self.getParallelism(), numModels))
    subModels = None
    allMetrics = [[None for i in range(nFolds)] for j in range(numModels)]
    collectSubModelsParam = self.getCollectSubModels()
    if collectSubModelsParam:
        subModels = [[None for j in range(numModels)] for i in range(nFolds)]

    for i in range(nFolds):
        validation = dataset.filter((dataset['fold'] == i) & (dataset['test'] == 1)).cache()
        train = dataset.filter((dataset['fold'] == i) & (dataset['test'] == 0)).cache()

        tasks = _parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam)
        for j, metric, subModel in pool.imap(lambda f: f(), tasks):
            allMetrics[j][i] = metric
            metrics[j] += (metric / nFolds)
            if collectSubModelsParam:
                subModels[i][j] = subModel

        validation.unpersist()
        train.unpersist()

    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    return (self._copyValues(CrossValidatorModel(bestModel, metrics, subModels)), allMetrics)

def loadModel(conf, path):
    """
    input : conf [dictionary], path [string]
    output: model [CrossValidatorModel, TrainValidationSplitModel, GeneralizedLinearRegressionModel]
    """
    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            loading_model = CrossValidatorModel.load(path)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            loading_model = TrainValidationSplitModel.load(path)
    elif conf["tuning"] is None:
        loading_model = GeneralizedLinearRegressionModel.load(path)
    return loading_model

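For reference, a sketch of the conf shapes this loader reacts to; the dictionary values and the model path below are illustrative assumptions, only conf["tuning"] and its "method" key are actually inspected.

# Hypothetical configurations for loadModel
conf_crossval = {"tuning": {"method": "crossval"}}        # -> CrossValidatorModel.load(path)
conf_trainval = {"tuning": {"method": "trainvalsplit"}}   # -> TrainValidationSplitModel.load(path)
conf_plain = {"tuning": None}                             # -> GeneralizedLinearRegressionModel.load(path)

model = loadModel(conf_crossval, "models/glr_crossval")   # assumed path
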
def loadaftRegression(conf, path):
    '''Loading model from path.
       Input  : - Path
       Output : - Loaded model
    '''
    # Load model when cross-validation tuning was used
    if conf["tuning"].get("method") == "crossval":
        loaded_model = CrossValidatorModel.load(path)
    # Load model when train-validation-split tuning was used
    elif conf["tuning"].get("method") == "trainval":
        loaded_model = TrainValidationSplitModel.load(path)
    # Load model when no tuning was used
    elif conf["tuning"].get("method") == None:
        loaded_model = AFTSurvivalRegressionModel.load(path)
    return loaded_model

def _fit(self, dataset):
    current_estimator = self.getEstimator()
    # Not a Pipeline, use standard CrossValidator
    if not isinstance(current_estimator, Pipeline):
        return super(DagCrossValidator, self)._fit(dataset)
    # Delegate parallelism to DagPipeline
    elif not isinstance(current_estimator, DagPipeline):
        dag_pipeline = DagPipeline(stages=current_estimator.getStages(),
                                   parallelism=self.getParallelism())
    # Already a DagPipeline
    else:
        dag_pipeline = current_estimator

    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    nFolds = self.getOrDefault(self.numFolds)
    seed = self.getOrDefault(self.seed)
    h = 1.0 / nFolds
    randCol = self.uid + "_rand"
    df = dataset.select("*", rand(seed).alias(randCol))
    metrics = [0.0] * numModels

    for i in range(nFolds):
        validateLB = i * h
        validateUB = (i + 1) * h
        condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
        validation = df.filter(condition).cache()
        train = df.filter(~condition).cache()

        fold_metrics = dag_pipeline.evaluate(epm, train, validation, eva)
        for j in range(len(metrics)):
            metrics[j] += fold_metrics[j] / nFolds

        validation.unpersist()
        train.unpersist()

    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    bestModel = current_estimator.fit(dataset, epm[bestIndex])
    return self._copyValues(CrossValidatorModel(bestModel, metrics))

def make_and_save_predictions(input_df, config):
    print("*" * 10)
    dt = input_df.select("refresh_date").first()[0]
    print("*" * 10)

    model = CrossValidatorModel.load(config["model_path"])
    final_output_df = model.transform(input_df) \
        .withColumnRenamed("id", "JobID") \
        .withColumnRenamed("prediction", "JobDurationML")
    final_output_df.printSchema()

    prediction_path = (config["prediction_path"] + "/year=" + str(dt.year) +
                       "/month=" + str(dt.month) + "/day=" + str(dt.day))
    final_output_df.select(
        config["categorical_cols"] + config["numerical_cols"] + ["JobID", "JobDurationML"]
    ).coalesce(1).write.mode('overwrite').option("header", 'true').csv(prediction_path)

def _fit(self, dataset):
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    nFolds = self.getOrDefault(self.numFolds)
    seed = self.getOrDefault(self.seed)
    h = 1.0 / nFolds
    randCol = self.uid + "_rand"
    df = dataset.select("*", rand(seed).alias(randCol))
    metrics = [0.0] * numModels

    # Initial pandas-based split (values rounded to 3 decimals); the per-fold loop
    # below re-derives train/validation from the random column and overrides these frames.
    dfp = df.toPandas()
    dfp = np.array_split(dfp, nFolds)
    train = self.spark.createDataFrame(data=dfp[0].round(3))
    for i in range(1, len(dfp) - 1):
        p = self.spark.createDataFrame(data=dfp[i].round(3))
        train = train.union(p)
    validation = self.spark.createDataFrame(data=dfp[-1].round(3))
    validation = validation.sort(validation.id.asc())
    train = train.sort(train.id.asc())

    for i in range(nFolds):
        validateLB = i * h
        validateUB = (i + 1) * h
        condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
        validation = df.filter(condition)
        validation = validation.sort(validation.id.asc())
        validation.show()
        train = df.filter(~condition)
        train = train.sort(train.id.asc())
        train.show()

        models = est.fit(train, epm)
        for j in range(numModels):
            model = models[j]
            # TODO: duplicate evaluator to take extra params from input
            metric = eva.evaluate(model.transform(validation, epm[j]))
            metrics[j] += metric / nFolds

    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(CrossValidatorModel(bestModel, metrics))

def _run_test_save_load_nested_estimator(self, LogisticRegressionCls):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [
            (Vectors.dense([0.0]), 0.0),
            (Vectors.dense([0.4]), 1.0),
            (Vectors.dense([0.5]), 0.0),
            (Vectors.dense([0.6]), 1.0),
            (Vectors.dense([1.0]), 1.0),
        ]
        * 10,
        ["features", "label"],
    )
    ova = OneVsRest(classifier=LogisticRegressionCls())
    lr1 = LogisticRegressionCls().setMaxIter(100)
    lr2 = LogisticRegressionCls().setMaxIter(150)
    grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
    evaluator = MulticlassClassificationEvaluator()

    # test save/load of CrossValidator
    cv = CrossValidator(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assert_param_maps_equal(loadedCV.getEstimatorParamMaps(), grid)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)

    originalParamMap = cv.getEstimatorParamMaps()
    loadedParamMap = loadedCV.getEstimatorParamMaps()
    for i, param in enumerate(loadedParamMap):
        for p in param:
            if p.name == "classifier":
                self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
            else:
                self.assertEqual(param[p], originalParamMap[i][p])

    # test save/load of CrossValidatorModel
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)

def test_save_load_nested_estimator(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    ova = OneVsRest(classifier=LogisticRegression())
    lr1 = LogisticRegression().setMaxIter(100)
    lr2 = LogisticRegression().setMaxIter(150)
    grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
    evaluator = MulticlassClassificationEvaluator()

    # test save/load of CrossValidator
    cv = CrossValidator(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)

    originalParamMap = cv.getEstimatorParamMaps()
    loadedParamMap = loadedCV.getEstimatorParamMaps()
    for i, param in enumerate(loadedParamMap):
        for p in param:
            if p.name == "classifier":
                self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
            else:
                self.assertEqual(param[p], originalParamMap[i][p])

    # test save/load of CrossValidatorModel
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)