def test_logistic_regression_summary(self):
    from pyspark.mllib.linalg import Vectors
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.roc, DataFrame))
    self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
    self.assertTrue(isinstance(s.pr, DataFrame))
    self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
    self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
    self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def test_default_read_write(self):
    temp_path = tempfile.mkdtemp()

    lr = LogisticRegression()
    lr.setMaxIter(50)
    lr.setThreshold(.75)
    writer = DefaultParamsWriter(lr)

    savePath = temp_path + "/lr"
    writer.save(savePath)
    reader = DefaultParamsReadable.read()
    lr2 = reader.load(savePath)

    self.assertEqual(lr.uid, lr2.uid)
    self.assertEqual(lr.extractParamMap(), lr2.extractParamMap())

    # test overwrite
    lr.setThreshold(.8)
    writer.overwrite().save(savePath)

    reader = DefaultParamsReadable.read()
    lr3 = reader.load(savePath)

    self.assertEqual(lr.uid, lr3.uid)
    self.assertEqual(lr.extractParamMap(), lr3.extractParamMap())
def test_binomial_logistic_regression_with_bound(self):
    df = self.spark.createDataFrame(
        [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
         (0.0, 2.0, Vectors.dense(1.0, 2.0)),
         (1.0, 3.0, Vectors.dense(2.0, 1.0)),
         (0.0, 4.0, Vectors.dense(3.0, 3.0))], ["label", "weight", "features"])

    lor = LogisticRegression(regParam=0.01, weightCol="weight",
                             lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
                             upperBoundsOnIntercepts=Vectors.dense(0.0))
    model = lor.fit(df)
    self.assertTrue(
        np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
    self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
def test_multinomial_logistic_regression_with_bound(self):
    data_path = "data/mllib/sample_multiclass_classification_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)

    lor = LogisticRegression(regParam=0.01,
                             lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
                             upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
    model = lor.fit(df)
    expected = [[4.593, 4.5516, 9.0099, 12.2904],
                [1.0, 8.1093, 7.0, 10.0],
                [3.041, 5.0, 8.0, 11.0]]
    for i in range(0, len(expected)):
        self.assertTrue(
            np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4))
    self.assertTrue(
        np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4))
def test_logistic_regression(self):
    lr = LogisticRegression(maxIter=1)
    path = tempfile.mkdtemp()
    lr_path = path + "/logreg"
    lr.save(lr_path)
    lr2 = LogisticRegression.load(lr_path)
    self.assertEqual(lr2.uid, lr2.maxIter.parent,
                     "Loaded LogisticRegression instance uid (%s) "
                     "did not match Param's uid (%s)"
                     % (lr2.uid, lr2.maxIter.parent))
    self.assertEqual(lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter],
                     "Loaded LogisticRegression instance default params did not match " +
                     "original defaults")
    try:
        rmtree(path)
    except OSError:
        pass
def test_int_to_float(self):
    from pyspark.mllib.linalg import Vectors
    df = self.sc.parallelize([
        Row(label=1.0, weight=2.0, features=Vectors.dense(1.0))]).toDF()
    lr = LogisticRegression(elasticNetParam=0)
    lr.fit(df)
    lr.setElasticNetParam(0)
    lr.fit(df)
def train(self, rdd):
    """
    :return: Trained model to be passed to test.
    """
    options = self.options
    if options.reg_type == "elastic-net":  # use spark.ml
        lr = MLLogisticRegression(maxIter=options.num_iterations,
                                  regParam=options.reg_param,
                                  elasticNetParam=options.elastic_net_param)
        # TODO: Do not include time for conversion to DataFrame (but this currently
        # matches the Scala tests)
        df = rdd.toDF()
        lrModel = lr.fit(df)
        numFeatures = len(lrModel.weights)
        numClasses = 2
        return LogisticRegressionModel(lrModel.weights, lrModel.intercept,
                                       numFeatures, numClasses)
    else:
        if options.loss == "logistic":
            if options.optimizer == "sgd":
                return LogisticRegressionWithSGD.train(data=rdd,
                                                       iterations=options.num_iterations,
                                                       step=options.step_size,
                                                       miniBatchFraction=1.0,
                                                       regParam=options.reg_param,
                                                       regType=options.reg_type)
            elif options.optimizer == "l-bfgs":
                return LogisticRegressionWithLBFGS.train(data=rdd,
                                                         iterations=options.num_iterations,
                                                         regParam=options.reg_param,
                                                         regType=options.reg_type,
                                                         tolerance=0.0)
            else:
                raise Exception("GLMClassificationTest cannot run with loss = %s,"
                                " optimizer = %s" % (options.loss, options.optimizer))
        elif options.loss == "hinge":
            if options.optimizer == "sgd":
                return SVMWithSGD.train(data=rdd, iterations=options.num_iterations,
                                        step=options.step_size,
                                        regParam=options.reg_param,
                                        miniBatchFraction=1.0,
                                        regType=options.reg_type)
        else:
            raise Exception("GLMClassificationTest does not recognize loss: %s"
                            % options.loss)
def build_lrmodel(path):
    df = load_data(path)

    # -------------------- preparing the dataset --------------------------------------
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    print("count =", df.count())
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    # ------------------ Build a model ------------------------------------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)

    prediction = model.transform(df)
    prediction.show(truncate=False)

    evaluator = BinaryClassificationEvaluator()
    print("classification evaluation :", evaluator.evaluate(prediction))

    # -------------- selecting models with cross validation ---------------------------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1, 10, 50, 150, 200, 500, 1000]) \
        .addGrid(lr.regParam, [0.01, 0.05, 0.1]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print("classification evaluation :", evaluator.evaluate(prediction))

    return cvModel, avg_age
def anom_with_lr():
    try:
        prepared_data = split_data()
        train = prepared_data['train']
        test = prepared_data['test']
        for_finding_more = prepared_data['for_finding_more']

        lr = LogisticRegression(maxIter=10, regParam=0.0, elasticNetParam=0.0)
        # We set regParam = 0 to make it comparable with LogisticRegressionWithSGD that
        # we used before, which does not do any regularization by default. With
        # regParam = 0, the value of elasticNetParam should not matter.
        # elasticNetParam = 0 is Ridge regression (L2), keeps all features.
        # elasticNetParam = 1 is LASSO (L1), performs feature selection.
        # With regParam = 0, test accuracy is 0.9454, fpr is 0.0713, fnr is 0.0375,
        # on a sample of 50K test data points.
        t0 = time()
        model = lr.fit(train)
        tt = time() - t0
        print("Classifier trained in {0} seconds".format(round(tt, 3)))

        t0 = time()
        # Feed the test DataFrame as-is, no need to feed the features only
        predictions = model.transform(test)
        tt = time() - t0
        print("Prediction made in {0} seconds".format(round(tt, 3)))

        # Adding probability to test data set for calibration
        labelsAndPreds = predictions.rdd.map(
            lambda p: (p.label, p.prediction, round(p.probability[1], 5)))
        labelsAndPreds.toDF(["label", "predicted_label", "predicted_prob"]) \
            .write.format('com.databricks.spark.csv') \
            .save(home_folder + '/healthcare/data/cloudera_challenge/labelsAndPreds/logistic_regression')

        test_accuracy = labelsAndPreds.filter(
            lambda t: t[0] == t[1]).count() / float(test_data_size)
        fpr = labelsAndPreds.filter(lambda t: t[0] == 0 and t[1] == 1).count() / \
            float(labelsAndPreds.filter(lambda t: t[0] == 0).count())
        fnr = labelsAndPreds.filter(lambda t: t[0] == 1 and t[1] == 0).count() / \
            float(labelsAndPreds.filter(lambda t: t[0] == 1).count())
        print("Test accuracy is {0}, fpr is {1}, fnr is {2}".format(
            round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)))

        # toDF() in next line did not work without round(): some issue with float
        for_finding_more = model.transform(for_finding_more).rdd.map(
            lambda p: (p.label, round(p.probability[1], 5)))
        for_finding_more = for_finding_more.toDF(["label", "predicted_prob"])
        for_finding_more = for_finding_more.orderBy(for_finding_more.predicted_prob.desc())
        for_finding_more.select('predicted_prob').limit(10000) \
            .write.format('com.databricks.spark.csv') \
            .save(home_folder + '/healthcare/data/cloudera_challenge/additional_10000_from_spark')
        # Top one has probability of 0.9999, last one has probability 0.05159,
        # 75 of them above 0.99
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return
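# A minimal sketch (hypothetical hyperparameter values) of the two penalties the
# comment above describes: with a nonzero regParam, elasticNetParam picks between them.
ridge_lr = LogisticRegression(maxIter=10, regParam=0.1, elasticNetParam=0.0)  # L2 / Ridge: shrinks coefficients, keeps all features
lasso_lr = LogisticRegression(maxIter=10, regParam=0.1, elasticNetParam=1.0)  # L1 / LASSO: can zero out features entirely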
def test_multiclass_logistic_regression_summary(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], [])),
                                     (2.0, 2.0, Vectors.dense(2.0)),
                                     (2.0, 2.0, Vectors.dense(1.9))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.labels, list))
    self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.precisionByLabel, list))
    self.assertTrue(isinstance(s.recallByLabel, list))
    self.assertTrue(isinstance(s.fMeasureByLabel(), list))
    self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
    self.assertAlmostEqual(s.accuracy, 0.75, 2)
    self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
    self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
    self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
    self.assertAlmostEqual(s.weightedPrecision, 0.583, 2)
    self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2)
    self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2)
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
print('\tMin:', np.min(predictions_ar[:, 1]))
print('\tMax:', np.max(predictions_ar[:, 1]))
print('\tMean:', np.mean(predictions_ar[:, 1]))

# In[146]:

# ----- LOGISTIC REGRESSION
print()
print()
print('LOGISTIC REGRESSION')

log_reg = LogisticRegression(featuresCol='features',
                             labelCol='label',
                             weightCol='attrib_weights',
                             maxIter=10,
                             regParam=0.00,
                             elasticNetParam=0.0,
                             standardization=True)
logModel = log_reg.fit(train_data)

# make predictions
predicted = logModel.transform(validate_data)

evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
print('\tROC AUC score = ', evaluator.evaluate(predicted))

# In[ ]:

# ----- RANDOM FOREST
#
#print()
onehot_encoder = OneHotEncoderEstimator(
    inputCols=[
        'nb_pred_0', 'nb_pred_1', 'nb_pred_2', 'svm_pred_0', 'svm_pred_1',
        'svm_pred_2', 'joint_pred_0', 'joint_pred_1', 'joint_pred_2'
    ],
    outputCols=['vec{}'.format(i) for i in range(9)])
vector_assembler = VectorAssembler(
    inputCols=['vec{}'.format(i) for i in range(9)],
    outputCol='meta_features')
gen_meta_feature_pipeline = Pipeline(stages=[onehot_encoder, vector_assembler])
gen_meta_feature_pipeline_model = gen_meta_feature_pipeline.fit(meta_features)
meta_features = gen_meta_feature_pipeline_model.transform(meta_features)

# train the meta classifier
lr_model = LogisticRegression(featuresCol='meta_features',
                              labelCol='label',
                              predictionCol='final_prediction',
                              maxIter=20,
                              regParam=1.,
                              elasticNetParam=0)
meta_classifier = lr_model.fit(meta_features)

# task 1.3
pred_test = test_prediction(test_data, base_features_pipeline_model,
                            gen_base_pred_pipeline_model,
                            gen_meta_feature_pipeline_model, meta_classifier)

# Evaluation
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName='f1')
print(
    evaluator.evaluate(pred_test,
                       {evaluator.predictionCol: 'final_prediction'}))
# Option
USE_SVM = True
USE_LR = False
USE_DT = False

# Read Data
sqlContext = SQLContext(sc)
trainData = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', inferschema='true', nullValue='NA').load('flight/*.csv')
testData = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', inferschema='true', nullValue='NA').load('test/*.csv')

# Preprocess Data
trainData = preprocess(trainData)
testData = preprocess(testData)

# Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(trainData)
lrprediction = lrModel.transform(testData)
lrselected = lrprediction.select("probability").first().probability[0]
result = "Logistic Regression Accuracy:" + str(lrselected) + '\n'

# Decision Tree Classifier
dataset = trainData.unionAll(testData)
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(dataset)
featureIndexer = \
    VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                  maxCategories=4).fit(dataset)

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model. This also runs the indexers.
def test_string(self):
    lr = LogisticRegression()
    for col in ['features', u'features', np.str_('features')]:
        lr.setFeaturesCol(col)
        self.assertEqual(lr.getFeaturesCol(), 'features')
    self.assertRaises(TypeError, lambda: LogisticRegression(featuresCol=2.3))
print("Usage: logistic_regression", file=sys.stderr) exit(-1) sc = SparkContext(appName="PythonLogisticRegressionExample") sqlContext = SQLContext(sc) # Load the data stored in LIBSVM format as a DataFrame. df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Map labels into an indexed column of labels in [0, numLabels) stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel") si_model = stringIndexer.fit(df) td = si_model.transform(df) [training, test] = td.randomSplit([0.7, 0.3]) lr = LogisticRegression(maxIter=100, regParam=0.3).setLabelCol("indexedLabel") lr.setElasticNetParam(0.8) # Fit the model lrModel = lr.fit(training) predictionAndLabels = lrModel.transform(test).select("prediction", "indexedLabel") \ .map(lambda x: (x.prediction, x.indexedLabel)) metrics = MulticlassMetrics(predictionAndLabels) print("weighted f-measure %.3f" % metrics.weightedFMeasure()) print("precision %s" % metrics.precision()) print("recall %s" % metrics.recall()) sc.stop()
schema = StructType([StructField('label', DoubleType(), True),
                     StructField('Vectors', VectorUDT(), True)])

features = dfTrainTok.map(partial(vectorize, dico=dict_broad.value)).toDF(schema)
print("Features created")

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)
print("labels indexed")

lr = LogisticRegression(featuresCol='Vectors', labelCol=string_indexer.getOutputCol())

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

lr_model = lr.fit(featIndexed)

dfTestTok = tokenizer.transform(dfTest)
featuresTest = dfTestTok.map(partial(vectorize, dico=dict_broad.value)).toDF(schema)
testIndexed = string_indexer_model.transform(featuresTest)

df_test_pred = lr_model.transform(testIndexed)
res = evaluator.evaluate(df_test_pred)
print(res)
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("MulticlassLogisticRegressionWithElasticNet") \
        .getOrCreate()

    # $example on$
    # Load training data
    training = spark \
        .read \
        .format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Fit the model
    lrModel = lr.fit(training)

    # Print the coefficients and intercept for multinomial logistic regression
    print("Coefficients: \n" + str(lrModel.coefficientMatrix))
    print("Intercept: " + str(lrModel.interceptVector))

    trainingSummary = lrModel.summary

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)
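    # A possible continuation (a sketch, assuming Spark 2.3+, where the multiclass
    # training summary exposes per-label metrics): inspect the metrics the summary
    # provides for each label, plus the overall accuracy.
    print("False positive rate by label:")
    for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
        print("label %d: %s" % (i, rate))
    print("Accuracy: %s" % trainingSummary.accuracy)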
def main(context):
    """Main function takes a Spark SQL context."""
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED

    # TASK 1
    # Load the data into PySpark.

    # For the comments:
    if not os.path.exists("./comments.parquet"):
        comments = context.read.json("comments-minimal.json.bz2")
        comments.write.parquet("comments.parquet")

    # For the submissions:
    if not os.path.exists("./submissions.parquet"):
        submissions = context.read.json("submissions.json.bz2")
        submissions.write.parquet("submissions.parquet")
        #submissions.printSchema()

    # For labelled data:
    if not os.path.exists("./labels.parquet"):
        labels = context.read.format('csv').options(
            header='true', inferSchema='true').load("labeled_data.csv")
        labels.write.parquet("labels.parquet")

    # TASK 2
    # For task 2, we will join the labels and comments
    commentsParquet = context.read.parquet("comments.parquet")
    commentsParquet.createOrReplaceTempView("comments")
    labelsParquet = context.read.parquet("labels.parquet")
    labelsParquet.createOrReplaceTempView("labels")

    # Now, compute the join:
    if not os.path.exists("./joinedComments.parquet"):
        joinedComments = context.sql(
            "SELECT labels.Input_id, labels.labeldem, labels.labelgop, "
            "labels.labeldjt, body FROM comments JOIN labels on id=Input_id")
        joinedComments.write.parquet("joinedComments.parquet")
    joinedComments = context.read.parquet("joinedComments.parquet")
    joinedComments.createOrReplaceTempView("joinedComments")
    #joinedComments.printSchema()

    # TASK 3
    # NOT NEEDED

    # TASK 4
    # Register the user defined function
    context.registerFunction("sanitize", clean_wrapper, ArrayType(StringType()))

    # TASK 5
    # (existence check fixed to test the same path that is written below)
    if not os.path.exists("./sanitized.parquet"):
        sanitizedText = context.sql(
            "SELECT Input_id, labeldem, labelgop, labeldjt, sanitize(body) as body "
            "FROM joinedComments")
        sanitizedText.write.parquet("sanitized.parquet")

    # TASK 6A
    sanitizedText = context.read.parquet("sanitized.parquet")
    sanitizedText.createOrReplaceTempView("sanitizedText")
    cv = CountVectorizer(inputCol="body", outputCol="features", minDF=10.0, binary=True)
    fitted = cv.fit(sanitizedText)
    vector = fitted.transform(sanitizedText)

    # TASK 6B
    vector.createOrReplaceTempView("vector")
    pos = context.sql("SELECT *, if(labeldjt=1, 1, 0) AS label FROM vector")
    neg = context.sql("SELECT *, if(labeldjt=-1, 1, 0) AS label FROM vector")

    # TASK 7
    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
    neglr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])

    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once we train the models, we don't want to do it again.
    # We can save the models and load them again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")

    # TASK 8 and TASK 9
    # Create the submissions and comments tables from the parquets:
    if not os.path.exists("sanitizedJoinedData.parquet"):
        submissions = context.read.parquet("submissions.parquet")
        submissions.createOrReplaceTempView("submissions")
        comments = context.read.parquet("comments.parquet")
        comments.createOrReplaceTempView("comments")
        comments = comments.sample(False, 0.2, None)
        joinedData = context.sql(
            "SELECT comments.link_id AS id, comments.body, comments.created_utc, "
            "submissions.title, comments.author_flair_text, "
            "submissions.score AS submission_score, comments.score as comments_score "
            "FROM comments JOIN submissions "
            "ON REPLACE(comments.link_id, 't3_', '')=submissions.id "
            "AND comments.body NOT LIKE '%/s%' AND comments.body NOT LIKE '>%'")
        #joinedData.show(joinedData.count(), False)
        #print(str(joinedData.count()))

        # Repeating earlier tasks: Tasks 4 and 5
        joinedData.createOrReplaceTempView("joinedData")
        # Re-register temporary function since we are forced to:
        context.registerFunction("sanitize", clean_wrapper, ArrayType(StringType()))
        print("writing sanitized parquet now")
        sanitizedJoinedData = context.sql(
            "SELECT id, created_utc, title, author_flair_text, submission_score, "
            "comments_score, sanitize(body) AS body FROM joinedData")
        sanitizedJoinedData.write.parquet("sanitizedJoinedData.parquet")

    sanitizedJoinedData = context.read.parquet("sanitizedJoinedData.parquet")
    sanitizedJoinedData = sanitizedJoinedData.sample(False, 0.2, None)
    cv = CountVectorizer(inputCol="body", outputCol="features", minDF=10.0, binary=True)
    newVector = fitted.transform(sanitizedJoinedData)

    seenPosModel = CrossValidatorModel.load("project2/pos.model")
    seenNegModel = CrossValidatorModel.load("project2/neg.model")

    posResult = seenPosModel.transform(newVector)
    posResult = posResult.selectExpr("id", "created_utc", "title", "author_flair_text",
                                     "submission_score", "comments_score", "body",
                                     "features", "probability as positive_probability")
    cumResult = seenNegModel.transform(posResult)
    cumResult = cumResult.selectExpr("id", "created_utc", "title", "author_flair_text",
                                     "submission_score", "comments_score", "body",
                                     "features", "positive_probability",
                                     "probability as negative_probability")
    cumResult.createOrReplaceTempView("cumResult")
    context.registerFunction("positiveFunc", positiveUDF, IntegerType())
    context.registerFunction("negativeFunc", negativeUDF, IntegerType())
    cumResult = context.sql(
        "SELECT id, created_utc, title, author_flair_text, submission_score, "
        "comments_score, body, features, "
        "positiveFunc(positive_probability) AS positive_probability, "
        "negativeFunc(negative_probability) AS negative_probability FROM cumResult")
    cumResult.write.parquet("cumResult.parquet")

    # TASK 10
    cumResult = context.read.parquet("cumResult.parquet")
    cumResult.createOrReplaceTempView("cumResult")

    # Actual 10.2 task
    task10_6 = context.sql(
        "SELECT DATE(FROM_UNIXTIME(created_utc)) AS date_created, "
        "SUM(positive_probability)/COUNT(positive_probability) AS pos, "
        "SUM(negative_probability)/COUNT(negative_probability) AS neg "
        "FROM cumResult GROUP BY date_created ORDER BY date_created")
    task10_6.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_6.csv")

    # Top 10 posts:
    if not os.path.exists("./task10_top_pos.csv"):
        task10_top_pos = cumResult.groupBy('title')\
            .agg(
                (F.sum('positive_probability') / F.count(F.lit(1))).alias('pct_pos'),
                F.count(F.lit(1)).alias('count')
            )\
            .orderBy(F.desc('pct_pos'), F.desc('count')).limit(10)\
            .select('title', 'pct_pos')
        task10_top_pos.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("task10_top_pos.csv")

    if not os.path.exists("./task10_top_neg.csv"):
        task10_top_neg = cumResult.groupBy('title')\
            .agg(
                (F.sum('negative_probability') / F.count(F.lit(1))).alias('pct_neg'),
                F.count(F.lit(1)).alias('count')
            )\
            .orderBy(F.desc('pct_neg'), F.desc('count')).limit(10)\
            .select('title', 'pct_neg')
        task10_top_neg.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("task10_top_neg.csv")

    # 10.1
    # Get the number of records
    totalRows = cumResult.count()
    # Calculate percentages
    task10_1 = context.sql(
        "SELECT SUM(positive_probability)/ {0} AS pos, "
        "SUM(negative_probability)/{1} AS neg FROM cumResult"
        .format(totalRows, totalRows))

    # 10.2
    task10_2 = context.sql(
        "SELECT DAYOFWEEK(FROM_UNIXTIME(created_utc)) AS date_created, "
        "SUM(positive_probability)/COUNT(positive_probability) AS pos, "
        "SUM(negative_probability)/COUNT(negative_probability) AS neg "
        "FROM cumResult GROUP BY date_created")

    # 10.3
    context.registerFunction("checkStateWrapper", checkState, BooleanType())
    task10_3 = context.sql(
        "SELECT author_flair_text AS state, "
        "SUM(positive_probability)/COUNT(positive_probability) AS pos, "
        "SUM(negative_probability)/COUNT(negative_probability) AS neg "
        "FROM cumResult WHERE(checkStateWrapper(author_flair_text)) "
        "GROUP BY author_flair_text")

    # 10.4
    task10_4 = context.sql(
        "SELECT comments_score, "
        "SUM(positive_probability)/COUNT(positive_probability) AS pos, "
        "SUM(negative_probability)/ COUNT(negative_probability) AS neg "
        "FROM cumResult GROUP BY comments_score")
    task10_5 = context.sql(
        "SELECT submission_score, "
        "SUM(positive_probability)/COUNT(positive_probability) AS pos, "
        "SUM(negative_probability)/ COUNT(negative_probability) AS neg "
        "FROM cumResult GROUP BY submission_score")

    # cumResult.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("cumResults.csv")
    task10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_1.csv")
    task10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_2.csv")
    task10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_3.csv")
    task10_4.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_4.csv")
    task10_5.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_5.csv")
predictions.select("prediction", "rawPrediction", "probability", "indexedLabel").show(5) evaluator = BinaryClassificationEvaluator(labelCol="indexedLabel", rawPredictionCol="rawPrediction", metricName="areaUnderROC") auc = evaluator.evaluate(predictions) predictions_rf = predictions logger.info("RandomForestClassifier AUC:" + str(auc)) from pyspark.ml.classification import LogisticRegression lr = LogisticRegression(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=5, regParam=0.03) pipeline = Pipeline(stages=[labelIndexer, hasher, lr]) lrModel = pipeline.fit(trainingData) predictions = lrModel.transform(testData).cache() evaluator = BinaryClassificationEvaluator(labelCol="indexedLabel", rawPredictionCol="rawPrediction", metricName="areaUnderROC") auc = evaluator.evaluate(predictions) predictions_lr = predictions logger.info("LogisticRegression AUC:" + str(auc)) from pyspark.ml.classification import GBTClassifier gbt = GBTClassifier(labelCol="indexedLabel",
# split back train/test data
train = lf.where(lf.mark == 'train')
test = lf.where(lf.mark == 'test')

# random split further to get train/validate
train, validate = train.randomSplit([0.7, 0.3], seed=121)
print('Train Data Number of Row: ' + str(train.count()))
print('Validate Data Number of Row: ' + str(validate.count()))
print('Test Data Number of Row: ' + str(test.count()))

# Apply Logistic Regression
from pyspark.ml.classification import LogisticRegression

# regParam: regularization parameter
lr = LogisticRegression(maxIter=100, regParam=0.05, labelCol='index').fit(train)

# Evaluate model based on AUC ROC (default for binary classification)
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def testModel(model, validate=validate):
    pred = model.transform(validate)
    evaluator = BinaryClassificationEvaluator(labelCol='index')
    return evaluator.evaluate(pred)

print('****************************************************AUC ROC is' + str(testModel(lr)))

from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
def titanic_classifier(filename='titanic.csv'):
    """
    Implements a logistic regression to classify the Titanic dataset.

    Parameters:
        filename (str): path to the dataset

    Returns:
        lr_metrics (list): a list of metrics gauging the performance of the model
            ('f1', 'accuracy', 'weightedPrecision', 'weightedRecall')
    """
    # start the SparkSession
    spark = SparkSession.builder\
                        .appName('Titanic Classifier')\
                        .getOrCreate()

    # load the data (use the filename argument rather than a hardcoded path)
    schema = ('survived INT, pclass INT, name STRING, sex STRING, '
              'age FLOAT, sibsp INT, parch INT, fare FLOAT')
    titanic = spark.read.csv(filename, schema=schema)

    # convert 'sex' column to numbers
    indexer = [StringIndexer(inputCol='sex', outputCol='sex_index').fit(titanic)]
    titanic = Pipeline(stages=indexer).fit(titanic).transform(titanic)

    # drop 'name' and 'sex' columns (no longer needed)
    titanic = titanic.drop('name', 'sex')

    # vectorize the features
    feature = VectorAssembler(inputCols=titanic.columns[1:], outputCol='features')
    feature_vector = feature.transform(titanic)

    # split test and train data
    train, test = feature_vector.randomSplit([0.8, 0.2], seed=42)

    # initialize logistic regression object
    lr = LogisticRegression(labelCol='survived', featuresCol='features')

    # train the model
    lr_model = lr.fit(train)

    # make predictions
    lr_preds = lr_model.transform(test)

    # obtain performance metrics
    metrics = ['f1', 'accuracy', 'weightedPrecision', 'weightedRecall']
    lr_eval = MCE(labelCol='survived', predictionCol='prediction')
    lr_metrics = [lr_eval.evaluate(lr_preds, {lr_eval.metricName: metric})
                  for metric in metrics]

    # stop the SparkSession
    spark.stop()

    return lr_metrics
# Create training dataset by joining labels and features
train = featureDF.join(labels_df, featureDF.origin == labels_df.filePath).select(
    "features", "label", featureDF.origin)

# Validate number of images used for training
train.count()

# COMMAND ----------

# DBTITLE 1,Train our Logistic Regression Model
from pyspark.ml.classification import LogisticRegression

# Fit LogisticRegression Model
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
lrModel = lr.fit(train)

# COMMAND ----------

# DBTITLE 1,Generate Predictions on Test data
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel

# Load Test Data
featuresTestDF = spark.read.parquet(imgFeaturesTestPath)

# Generate predictions on test data
result = lrModel.transform(featuresTestDF)
result.createOrReplaceTempView("result")
vecAssembler = VectorAssembler(inputCols=[
    "sepsis_antibiotic", "antibiotic", "immunosupp_class3", "RACE_NUM",
    "ETH_NUM", "SEXNUM", "icd_ind", "icd_rank", "sepsis_glucocorticoid",
    "treatment_limit", "icd9_477_x", "icd9_493_x", "age_at_enc",
    "icd9_691_x", "temp", "biologicals", "icd9_995_3", "bmi", "pain_scale",
    "dnr", "dnr_treatment_limit", "staph", "immunosupp_medname",
    "dncpr_dni", "icd9_558_3", "albuterol", "avpu", "avpu_old", "dnr_dni",
    "immunosupp_class31"
], outputCol="features")

# Split data
(trainingData, testData) = sepsis.randomSplit([0.7, 0.3])

# Logistic Regression classifier
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
pipeline = Pipeline(stages=[labelIndexer, vecAssembler, lr])

# Fit the data
model = pipeline.fit(trainingData)

# Predict
predictions = model.transform(testData)
predictions.printSchema()

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="weightedPrecision")
predictions.first()

weightedPrecision = evaluator.evaluate(predictions)
print("Model Weighted Precision: ", weightedPrecision)
# Build Evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
predictions.show(5, False)
print("Test set accuracy = " + str(accuracy))

# ###### Logistic Regression Classifier

# In[348]:

from pyspark.ml.classification import LogisticRegression

# Note: pyspark.ml's LogisticRegression has no `regType` parameter (that belongs
# to the old pyspark.mllib API); the penalty is chosen via `elasticNetParam`.
# LASSO (L1) regression
blor = LogisticRegression(maxIter=5, regParam=0.01, elasticNetParam=1.0,
                          featuresCol='scaledFeatures')
# Ridge (L2) regression
rlor = LogisticRegression(maxIter=5, regParam=0.01, elasticNetParam=0.0,
                          featuresCol='scaledFeatures')

model = blor.fit(train)
help(blor)

# In[349]:

predictions_blor = model.transform(test)
predictions_blor.show(20)

# In[350]:

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
def buildModel(df):
    lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
    lr_model = lr.fit(df)
    return lr_model
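# A hypothetical usage sketch: `train_df` is assumed to be a DataFrame that already
# has the default "label" and "features" columns LogisticRegression expects.
model = buildModel(train_df)
model.transform(train_df).select("label", "prediction", "probability").show(5)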
def test_bool(self):
    self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept=1))
    self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept="false"))
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
dataset.show(5)

# COMMAND ----------

# splitting data in testing and training
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

# COMMAND ----------

# applying logistic regression using the "Text" to predict "Sentiment"
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Text", "Sentiment", "probability", "label", "prediction") \
    .orderBy("probability", ascending=False) \
    .show(n=10, truncate=30)

# COMMAND ----------

# finding the accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)
def test_float(self):
    lr = LogisticRegression(tol=1)
    self.assertEqual(lr.getTol(), 1.0)
    self.assertTrue(type(lr.getTol()) == float)
    self.assertRaises(TypeError, lambda: LogisticRegression(tol="notAFloat"))
from pyspark.sql import SQLContext
from pyspark import SparkContext

sc = SparkContext(appName="ML Example")
sc.setLogLevel("FATAL")
sqlContext = SQLContext(sc)

# Prepare training data from a list of (label, features) tuples.
training = sqlContext.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())

# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
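# A possible continuation (a sketch following the Spark ML Pipelines guide): a param
# map passed to fit() overrides the values stored in the estimator itself.
paramMap[lr.regParam] = 0.1  # hypothetical extra override added to the dict above
model2 = lr.fit(training, paramMap)
print("Model 2 was fit using parameters: ")
print(model2.extractParamMap())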
# Split data into train and test.
train, test = data.randomSplit([0.75, 0.25], seed=123)
print("********* TRAINING DATA ***********")
print(train.limit(10).toPandas())

reg = 0.1
# Load Regularization Rate from argument
if len(sys.argv) > 1:
    reg = float(sys.argv[1])
print("Regularization Rate is {}.".format(reg))
run_logger.log("Regularization Rate", reg)

# create a new Logistic Regression model.
lr = LogisticRegression(regParam=reg)

# string-index and one-hot encode the education column
si1 = StringIndexer(inputCol=' education', outputCol='ed')
ohe1 = OneHotEncoder(inputCol='ed', outputCol='ed-encoded')

# string-index and one-hot encode the marital-status column
si2 = StringIndexer(inputCol=' marital-status', outputCol='ms')
ohe2 = OneHotEncoder(inputCol='ms', outputCol='ms-encoded')

# string-index the label column into a column named "label"
si3 = StringIndexer(inputCol=' income', outputCol='label')

# assemble the encoded feature columns in to a column named "features"
assembler = VectorAssembler(
    inputCols=['ed-encoded', 'ms-encoded', ' hours-per-week'],
def test_int(self):
    lr = LogisticRegression(maxIter=5.0)
    self.assertEqual(lr.getMaxIter(), 5)
    self.assertTrue(type(lr.getMaxIter()) == int)
    self.assertRaises(TypeError, lambda: LogisticRegression(maxIter="notAnInt"))
    self.assertRaises(TypeError, lambda: LogisticRegression(maxIter=5.1))
train_tf_vec = tf_vector.transform(train_df)
test_tf_vec = tf_vector.transform(test_df)

tfidf_vector = IDF(inputCol='tf_vector', outputCol='tfidf_vector')
# Fit the IDF model on the training data only and reuse it for the test data,
# so the test set gets the training-set weights instead of being refit.
tfidf_model = tfidf_vector.fit(train_tf_vec)
train_tfidf_vec = tfidf_model.transform(train_tf_vec)
test_tfidf_vec = tfidf_model.transform(test_tf_vec)

assembler = VectorAssembler(inputCols=['tfidf_vector', 'token_count'], outputCol='X')
train_tfidf_vec = assembler.transform(train_tfidf_vec)
test_tfidf_vec = assembler.transform(test_tfidf_vec)

train_data, dev_data = train_tfidf_vec.randomSplit([0.95, 0.05])

model = LogisticRegression(featuresCol='X', labelCol='label').fit(train_data)
result_dev = model.evaluate(dev_data).predictions
result_test = model.evaluate(test_tfidf_vec).predictions
result_test = result_test.withColumn('final', result_test.prediction.cast('int'))
result_test.select("final").write.csv(
    path="file:///home/root/emailclass/sub_1.csv", header="false")

auc_dev = BinaryClassificationEvaluator(labelCol='label').evaluate(result_dev)
print(auc_dev)
# </font>

# In[18]:

df2_train = df1_train.select(
    [c for c in output.columns if c in {'features', 'label'}])
df2_test = df1_test.select(
    [c for c in output.columns if c in {'features', 'label'}])

# <font size=4,font style=arial>
# Let's fit our model on the training set
# </font>

# In[19]:

final_model = LogisticRegression()
fit_final_model = final_model.fit(df2_train)

# <font size=4,font style=arial>
# The model's beta coefficients are shown below
# </font>

# In[20]:

print("Coefficients: " + str(fit_final_model.coefficients))
print("Intercept: " + str(fit_final_model.intercept))

# <font size=4,font style=arial>
# ROC area and curve
# </font>
finalSchema = StructType(fields=newDF)
# Note: `schema` is not a csv reader option; pass it through .schema() so it is
# actually applied instead of being silently ignored.
dataset = sqlContext.read.format('csv').options(
    header='true', delimiter='|').schema(finalSchema).load('/FileStore/tables/dataset.csv')
#types = [f.dataType for f in dataset.schema.fields]
#print(types)

dataset = dataset.withColumn("label", dataset["label"].cast(DoubleType()))
dataset = dataset.withColumn("id", dataset["id"].cast(IntegerType()))
training, test = dataset.randomSplit([0.8, 0.2], seed=12345)
#types = [f.dataType for f in training.schema.fields]
#print(types)
#exit()

tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=2, regParam=0.001)
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, nb])

# Fit the pipeline to training documents.
model = pipeline.fit(training)
result = model.transform(test)\
    .select("features", "label", "prediction")
correct = result.where(result["label"] == result["prediction"])
accuracy = correct.count() / test.count()
print("Accuracy of model = " + str(accuracy))
test_error = 1 - accuracy
print("Test error = " + str(test_error))

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="f1")
# Predictions are done by evaluating each binary classifier and the index of
# the most confident classifier is output as label.
spark = SparkSession.builder.appName("OneVsRest").getOrCreate()

# Load data file.
inputData = spark.read \
    .format("libsvm") \
    .load("sample_multiclass_classification_data.txt")

# Generate the train/test split.
train, test = inputData.randomSplit([0.8, 0.2])

# Instantiate the base classifier.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

# Instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# Train the multiclass model.
ovrModel = ovr.fit(train)

# Score the model on test data.
predictions = ovrModel.transform(test)

# Obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# Compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
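# A natural last step (a sketch mirroring the Spark OneVsRest example): report the
# classification error derived from the accuracy computed above.
print("Test Error = %g" % (1.0 - accuracy))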
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql import SparkSession
from sklearn.datasets import load_iris
import mlflow

spark = SparkSession.builder.getOrCreate()
mlflow.pyspark.ml.autolog()

df = load_iris(as_frame=True).frame.rename(columns={"target": "label"})
df = spark.createDataFrame(df)
train, test = df.randomSplit([0.8, 0.2])

assembler = VectorAssembler(inputCols=df.columns[:-1], outputCol="features")
scaler = StandardScaler(inputCol=assembler.getOutputCol(), outputCol="scaledFeatures")
lor = LogisticRegression(maxIter=5, featuresCol=scaler.getOutputCol())

# Non-nested pipeline
pipeline = Pipeline(stages=[assembler, scaler, lor])
with mlflow.start_run():
    pipeline_model = pipeline.fit(train)

columns = ["features", "prediction"]
pipeline_model.transform(test).select(columns).show()

# Nested pipeline
nested_pipeline = Pipeline(stages=[Pipeline(stages=[assembler, scaler]), lor])
with mlflow.start_run():
    nested_pipeline_model = nested_pipeline.fit(train)

nested_pipeline_model.transform(test).select(columns).show()
elastic_net_param = 0.1
"""
for reg_param in RP:
    lr = LogisticRegression(maxIter=max_iter, regParam=reg_param,
                            elasticNetParam=elastic_net_param, standardization=stand)
    lr = lr.fit(trainDF)
    validateDF_prob = add_probability(validateDF, lr, sc)
    print("======================")
    print("averaged log_loss: ")
    temp = log_loss(validateDF_prob)
    print(temp)
    if temp < Opt:
        Opt = temp
        reg_param_opt = reg_param
        elastic_net_param_opt = elastic_net_param
"""
elastic_net_param_opt = 5e-3
reg_param_opt = 1e-6
lr = LogisticRegression(maxIter=max_iter, regParam=reg_param_opt,
                        elasticNetParam=elastic_net_param_opt, standardization=stand)
lr = lr.fit(trainDF)
predictions = add_probability(testDF, lr, sc).select("activity_id", "outcome")
predictions = predictions.join(leakageTest, "activity_id", "left_outer") \
    .withColumnRenamed("outcome", "p")
# Use the leaked value when it exists, otherwise fall back to the model probability.
predictions = predictions.withColumn(
    "outcome",
    when(predictions.leak.isNull(), predictions.p).otherwise(predictions.leak))
predictions.show(5)
predictions = predictions.select("activity_id", "outcome")
predictions.toPandas().to_csv(datapath + "lr.csv", index=False)
#predictions = predictions.select(predictions.probability.values)
#predictions.show(3)
#predictions = predictions.select("activity_id", predictions.outcome.getItem(1).alias("outcome"))
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df_remove)
model = pipelineModel.transform(df_remove)

input_data = model.rdd.map(lambda x: (x['newlabel'], DenseVector(x['features'])))
df_train = sqlContext.createDataFrame(input_data, ['label', 'features'])
train_data, test_data = df_train.randomSplit([.8, .2], seed=42)

lr = LogisticRegression(labelCol='label', featuresCol='features',
                        maxIter=10, regParam=0.3)
linearModel = lr.fit(train_data)
predictions = linearModel.transform(test_data)
selected = predictions.select('label', 'prediction', 'probability')

# evaluate the model
cm = predictions.select('label', 'prediction')
cm.filter(cm['label'] == cm['prediction']).count() / cm.count()  # 0.~~~~
def test_default_read_write_default_params(self):
    lr = LogisticRegression()
    self.assertFalse(lr.isSet(lr.getParam("threshold")))

    lr.setMaxIter(50)
    lr.setThreshold(.75)

    # `threshold` is set by user, default param `predictionCol` is not set by user.
    self.assertTrue(lr.isSet(lr.getParam("threshold")))
    self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
    self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

    writer = DefaultParamsWriter(lr)
    metadata = json.loads(writer._get_metadata_to_save(lr, self.sc))
    self.assertTrue("defaultParamMap" in metadata)

    reader = DefaultParamsReadable.read()
    metadataStr = json.dumps(metadata, separators=[',', ':'])
    loadedMetadata = reader._parseMetaData(metadataStr)
    reader.getAndSetParams(lr, loadedMetadata)

    self.assertTrue(lr.isSet(lr.getParam("threshold")))
    self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
    self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

    # manually create metadata without `defaultParamMap` section.
    del metadata['defaultParamMap']
    metadataStr = json.dumps(metadata, separators=[',', ':'])
    loadedMetadata = reader._parseMetaData(metadataStr)
    with self.assertRaisesRegexp(AssertionError, "`defaultParamMap` section not found"):
        reader.getAndSetParams(lr, loadedMetadata)

    # Prior to 2.4.0, metadata doesn't have `defaultParamMap`.
    metadata['sparkVersion'] = '2.3.0'
    metadataStr = json.dumps(metadata, separators=[',', ':'])
    loadedMetadata = reader._parseMetaData(metadataStr)
    reader.getAndSetParams(lr, loadedMetadata)
pipeline = Pipeline(stages=[
    regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx
])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

# Split data into training and test datasets
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)

# Time taken to preprocess the data
preprocess = datetime.now()
preprocess_time = preprocess - starttime

# Build the models
lr = LogisticRegression(maxIter=100, regParam=0.3, elasticNetParam=0)
nb = NaiveBayes(smoothing=1)

# Train models with Training Data
lrModel = lr.fit(trainingData)
nbModel = nb.fit(trainingData)

# Time taken to train the data
training = datetime.now()
training_time = training - preprocess

# Testing data
predictions = lrModel.transform(testData)
nbpreds = nbModel.transform(testData)

# Time taken to test data
from logreg import collect_one

with SparkController() as sc:
    data_path, npar = './data/a9a', 5
    dataset = MLUtils.loadLibSVMFile(sc, data_path, minPartitions=npar).cache()
    local_data = Worker.from_rows(dataset.collect(), dense=False)
    n, d = local_data.n_samples, local_data.n_features
    print('#samples: {n}; #features: {d}'.format(n=n, d=d))

    print('Baseline: training in single node mode...')
    prob = Executor(local_data, n, d, collect_one, logreg_local,
                    cached=True, l2_reg=0.01)
    descend(prob, verbose=1, max_iter=30, l1_reg=0.005, precision='f')

    print('Spark ({} partitions): training using peregrine...'.format(npar))
    prob = logistic_regression(dataset, dense=False, l2_reg=0.01)
    descend(prob, verbose=1, max_iter=30, l1_reg=0.005, precision='f')

    print('Spark ({} partitions): training using mllib...'.format(npar))
    sqlContext = SQLContext(sc)
    lr = LogisticRegression(maxIter=300, regParam=0.02, elasticNetParam=0.5,
                            fitIntercept=False)
    lr.fit(dataset.toDF().replace(-1, 0, 'label').cache())

    print('Spark/Tensorflow ({} partitions): training using peregrine...'.format(npar))
    prob = logistic_regression(dataset, l2_reg=0.01, tensorflow=True)
    descend(prob, verbose=1, max_iter=30, l1_reg=0.005, precision='f')
# getWeight = udf(getweight, returnType=DoubleType())
# trainData = trainData.withColumn("weight", getWeight(trainData['label']))

#%%
inputFeat = [
    'age_range', 'gender', 'lognum', 'click', 'shoppingcart', 'purchase',
    'favorite'
]
df_assembler = VectorAssembler(inputCols=inputFeat, outputCol='features')
featureIndexer = VectorIndexer(maxCategories=8).setInputCol('features'). \
    setOutputCol('indexedFeatures')

# Logistic regression
lr = LogisticRegression(labelCol='label', featuresCol='indexedFeatures',
                        maxIter=100, regParam=0.1)

# Random forest
rf = RandomForestClassifier(labelCol="label",
                            featuresCol='indexedFeatures',
                            subsamplingRate=0.382)

# Use a lowercase variable name so the Pipeline class itself is not shadowed.
pipeline = Pipeline().setStages([df_assembler, featureIndexer, lr])
pipelineModel = pipeline.fit(trainData)
predictions = pipelineModel.transform(testData)

# %%
def test_invalid_to_float(self):
    from pyspark.mllib.linalg import Vectors
    self.assertRaises(Exception, lambda: LogisticRegression(elasticNetParam="happy"))
    lr = LogisticRegression(elasticNetParam=0)
    self.assertRaises(Exception, lambda: lr.setElasticNetParam("panda"))
# Classification Evaluator
print(bcolors.OKBLUE + bcolors.BOLD + 'Creating Evaluator' + bcolors.ENDC)
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol='FinalPriority')
pprint(evaluator.extractParamMap())

# Logistic Regression - cross validation
print(bcolors.OKBLUE + bcolors.BOLD +
      'Starting Logistic Regression CV with 3x3x3 parameters' + bcolors.ENDC)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

start = time.time()
lr = LogisticRegression(featuresCol='features', labelCol='FinalPriority')

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3)

# Run cross validations
cv.setParallelism(7)
cvModel = cv.fit(train)
print('Time for training : {} sec'.format(time.time() - start))

# predict and evaluate
predictions = cvModel.transform(test)
output = assembler.transform(data)
print(output.columns)

# Select the required fields and define the train/test split percentages
final_data = output.select('features', 'churn')
train, test = final_data.randomSplit([0.7, 0.3])

# COMMAND ----------

# Build the logistic regression model
lr = LogisticRegression(labelCol='churn')
lr_model = lr.fit(train)
train_summary = lr_model.summary
train_summary.predictions.describe().show()

roc = train_summary.roc.toPandas()
plt.plot(roc['FPR'], roc['TPR'])
# Label the axes to match what is plotted: FPR on x, TPR on y.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

# COMMAND ----------

# Evaluate the model with the test data
indexes = [StringIndexer(inputCol=column, outputCol=column + '_index').fit(titanic_df)
           for column in ['Sex', 'Embarked', 'Initial']]
pipeline = Pipeline(stages=indexes)
titanic_df = pipeline.fit(titanic_df).transform(titanic_df)
titanic_df.show(3)

titanic_df = titanic_df.drop('PassengerId', 'Name', 'Ticket', 'Cabin',
                             'Embarked', 'Sex', 'Initial')
titanic_df.show(5)

feature = VectorAssembler(inputCols=titanic_df.columns[1:], outputCol="features")
feature_vector = feature.transform(titanic_df)
feature_vector.show(5)

# Run a simple logistic regression:
# split the training and test set:
(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2], seed=11)

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol='Survived', featuresCol='features')
# Use the split names defined above (trainingData/testData).
lr_model = lr.fit(trainingData)
lr_prediction = lr_model.transform(testData)
lr_prediction.select('prediction', 'Survived', 'features').show()

# Performing the ml tuning:
evaluator = MulticlassClassificationEvaluator(labelCol='Survived')

# Performing feature engineering by apache spark:
from pyspark.sql.functions import avg

bureau = spark.read.csv('bureau.csv', header='True', inferSchema='True')
#display(bureau.where('SK_ID_CURR = 100001'))
# logistic model with binary dependent variable
from pyspark.ml.classification import LogisticRegression

# In[4]:

# Load training data
training = spark.read.format("libsvm").load("sample_libsvm_data.txt")

# In[5]:

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# In[6]:

# Fit the model
lrModel = lr.fit(training)

# In[7]:

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))
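# A possible next cell (a sketch, assuming Spark 2.1+): the same binary problem can
# also be fit with the multinomial family, which exposes a coefficient matrix and an
# intercept vector instead of a single coefficient vector.
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8,
                         family="multinomial")
mlrModel = mlr.fit(training)
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))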
# tokenizer = Tokenizer(inputCol="text", outputCol="words")
# hashtf = HashingTF(numFeatures=2**10, inputCol="words", outputCol='tf')
# idf = IDF(inputCol='tf', outputCol="features") #minDocFreq: remove sparse terms
# label_stringIdx = StringIndexer(inputCol = "tar", outputCol = "label")
# pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(df)
train_df = pipelineFit.transform(df)
(train_set, test_set, final_testset) = train_df.randomSplit([0.8, 0.1, 0.1], seed=1235)

# Logistic Regression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

lr = LogisticRegression(maxIter=250)
lrModel = lr.fit(train_set)

# predictions on training
predictions = lrModel.transform(train_set)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
train_logistic = evaluator.evaluate(predictions)

# predictions on testing
predictions = lrModel.transform(test_set)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
test_logistic = evaluator.evaluate(predictions)

# Naive Bayes
from pyspark.ml.classification import NaiveBayes
.appName("SimpleParamsExample") \ .getOrCreate() # prepare training data. # We create an RDD of LabeledPoints and convert them into a DataFrame. # A LabeledPoint is an Object with two fields named label and features # and Spark SQL identifies these fields and creates the schema appropriately. training = spark.createDataFrame([ Row(label=1.0, features=DenseVector([0.0, 1.1, 0.1])), Row(label=0.0, features=DenseVector([2.0, 1.0, -1.0])), Row(label=0.0, features=DenseVector([2.0, 1.3, 1.0])), Row(label=1.0, features=DenseVector([0.0, 1.2, -0.5]))]) # Create a LogisticRegression instance with maxIter = 10. # This instance is an Estimator. lr = LogisticRegression(maxIter=10) # Print out the parameters, documentation, and any default values. print("LogisticRegression parameters:\n" + lr.explainParams() + "\n") # We may also set parameters using setter methods. lr.setRegParam(0.01) # Learn a LogisticRegression model. This uses the parameters stored in lr. model1 = lr.fit(training) # Since model1 is a Model (i.e., a Transformer produced by an Estimator), # we can view the parameters it used during fit(). # This prints the parameter (name: value) pairs, where names are unique IDs for this # LogisticRegression instance. print("Model 1 was fit using parameters:\n") pprint.pprint(model1.extractParamMap())
# Create an assembler object
assembler = VectorAssembler(inputCols=["mon", "dom", "dow", "carrier_idx",
                                       "org_idx", "km", "depart", "duration"],
                            outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights)

# Check the resulting column
flights = flights_assembled.select('features', 'xdelay')

# Split into training and testing sets in a 80:20 ratio
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=23)

# Create a classifier object and fit to the training data
# (named `logistic` rather than `tree`, since this is a LogisticRegression)
logistic = LogisticRegression(labelCol="xdelay")
logistic_model = logistic.fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
prediction = logistic_model.transform(flights_test)
predictions = prediction.select('xdelay', 'prediction', 'probability')
print(predictions.toPandas().sample(12))
print()

# Create a confusion matrix
confusion_matrix = prediction.groupBy("xdelay", 'prediction').count()
confusion_matrix.show()

'''
# Calculate the elements of the confusion matrix
TrueNeg = prediction.filter('prediction = 0 AND xdelay = prediction').count()
tt = time() - t0
print("Done in {} second".format(round(tt, 3)))

# In[18]:

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

print("Fitting the classifier on selected features")
t0 = time()

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='selectedFeatures', labelCol='target_indexed',
                        maxIter=30, regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print("Done in {} second".format(round(tt, 3)))

# In[19]:

print("Testing precision of the model")
t0 = time()
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

test = spark.createDataFrame([
    (169.4, 75.3, 42),
    (185.1, 85.0, 37),
    (161.6, 61.2, 28)]).toDF("height", "weight", "age")

training.show(truncate=False)

assembler = VectorAssembler(inputCols=["height", "weight", "age"], outputCol="features")

# Add a features column to the training data
assembled_training = assembler.transform(training)
assembled_training.show(truncate=False)

# Model-building algorithm (a logistic regression estimator)
lr = LogisticRegression(maxIter=10, regParam=0.01, labelCol="gender")

# Fit the model
model = lr.fit(assembled_training)

# Generate predictions
model.transform(assembled_training).show()

# Pipeline
pipeline = Pipeline(stages=[assembler, lr])

# Fit the pipeline model
pipelineModel = pipeline.fit(training)

# Generate predictions with the pipeline model
pipelineModel.transform(training).show()
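# A short follow-on sketch (not in the original): the unlabeled test DataFrame
# defined above can be scored with the fitted pipeline, since prediction does
# not require the gender label column.
pipelineModel.transform(test).select("features", "probability", "prediction").show()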
from pyspark.ml.feature import PCA
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

# Input: drop the header line (`titile` in the original was a typo for the
# header variable), then split each CSV row into fields
rdd = sc.textFile("/user/demo/train.csv").filter(lambda x: x != title).\
    map(lambda x: x.split(","))
D = 2 ** 24

def helper1(r):
    # Hash each raw field into a numeric feature (the hashing trick, modulo D)
    features = []
    try:
        fe = r[1:-1]
        for i in range(len(fe)):
            features.append(float(abs(hash("VAR_" + '{0:04}'.format(i) + fe[i]))) % D)
        target = float(r[-1])
        ID = float(r[0])  # parsed but not used downstream
        return target, Vectors.dense(features)
    except (ValueError, IndexError):
        # Fall back to an all-zero row if a field cannot be parsed
        return 0.0, Vectors.dense([0.0] * 1932)

# Keep only rows with the expected field count (ID + 1932 features + target)
new_rdd = rdd.filter(lambda i: len(i) == 1934)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans, ["label", "features"])

# Reduce the 1932 hashed features to 1000 principal components
pca = PCA(k=1000, inputCol="features", outputCol="pca_features")
model_pca = pca.fit(df)

rdd_pca = model_pca.transform(df).select(["label", "pca_features"])
rdd_pca1 = rdd_pca.withColumnRenamed('pca_features', 'features')
(trainingData, testData) = rdd_pca1.randomSplit([0.7, 0.3])

lr = LogisticRegression(maxIter=100, regParam=0.01)
model = lr.fit(trainingData)
result = model.transform(testData).rdd.map(
    lambda r: str(r.label) + ',' + str(r.probability[0]))
result.saveAsTextFile("/user/demo/lr_pca_1000_001")
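# A minimal evaluation sketch (not in the original): score the held-out split
# with a binary classification evaluator before writing the results out; by
# default it reads the rawPrediction and label columns produced by transform().
from pyspark.ml.evaluation import BinaryClassificationEvaluator
auc = BinaryClassificationEvaluator().evaluate(model.transform(testData))
print("Test areaUnderROC: {}".format(auc))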
# $example on$ from pyspark.ml.classification import LogisticRegression # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession \ .builder \ .appName("LogisticRegressionSummary") \ .getOrCreate() # Load training data training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8) # Fit the model lrModel = lr.fit(training) # $example on$ # Extract the summary from the returned LogisticRegressionModel instance trained # in the earlier example trainingSummary = lrModel.summary # Obtain the objective per iteration objectiveHistory = trainingSummary.objectiveHistory print("objectiveHistory:") for objective in objectiveHistory: print(objective)
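# The example file continues past this point. A short sketch of the binary
# summary metrics that follow; roc and areaUnderROC are accessors on the
# binary LogisticRegressionTrainingSummary.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))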
# COMMAND ----------

fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()

# COMMAND ----------

train, test = preparedDF.randomSplit([0.7, 0.3])

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="label", featuresCol="features")

# COMMAND ----------

print(lr.explainParams())

# COMMAND ----------

fittedLR = lr.fit(train)

# COMMAND ----------

train, test = df.randomSplit([0.7, 0.3])
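# COMMAND ----------

# The re-split above starts again from the raw df, which suggests the next step
# wraps feature preparation and the classifier into a single Pipeline. A minimal
# sketch under that assumption (the stage list is illustrative, not from the
# original cells):
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[supervised, lr])
fittedPipeline = pipeline.fit(train)
fittedPipeline.transform(test).show()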
if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("EstimatorTransformerParamExample")\ .getOrCreate() # $example on$ # Prepare training data from a list of (label, features) tuples. training = spark.createDataFrame([ (1.0, Vectors.dense([0.0, 1.1, 0.1])), (0.0, Vectors.dense([2.0, 1.0, -1.0])), (0.0, Vectors.dense([2.0, 1.3, 1.0])), (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"]) # Create a LogisticRegression instance. This instance is an Estimator. lr = LogisticRegression(maxIter=10, regParam=0.01) # Print out the parameters, documentation, and any default values. print("LogisticRegression parameters:\n" + lr.explainParams() + "\n") # Learn a LogisticRegression model. This uses the parameters stored in lr. model1 = lr.fit(training) # Since model1 is a Model (i.e., a transformer produced by an Estimator), # we can view the parameters it used during fit(). # This prints the parameter (name: value) pairs, where names are unique IDs for this # LogisticRegression instance. print("Model 1 was fit using parameters: ") print(model1.extractParamMap()) # We may alternatively specify parameters using a Python dictionary as a paramMap paramMap = {lr.maxIter: 20}
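# The snippet stops after defining paramMap. A minimal sketch of how such a
# dictionary is then passed to fit(), overriding the parameters stored in lr
# (the value 30 below is illustrative):
paramMap[lr.maxIter] = 30  # specify one param, overwriting the earlier maxIter entry
model2 = lr.fit(training, paramMap)
print("Model 2 was fit using parameters: ")
print(model2.extractParamMap())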
tt = time() - t0
print("Done in {} seconds".format(round(tt, 3)))

# In[18]:

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

print("Fitting the classifier on bigram features")
t0 = time()

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='bigramVectors', labelCol='target_indexed',
                        maxIter=30, regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

string_indexer_model = string_indexer.fit(dfBigram)
dfTrainIndexed = string_indexer_model.transform(dfBigram).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print("Done in {} seconds".format(round(tt, 3)))

# In[19]:

print("Testing precision of the model")
# MAGIC %md
# MAGIC ####Logistic Regression
# MAGIC
# MAGIC You can read more about logistic regression in the Programming Guide [here](http://spark.apache.org/docs/latest/mllib-linear-methods.html#logistic-regression). With the new Pipelines API, we can now perform elastic net regularization with logistic regression, as well as with other linear methods.
# MAGIC
# MAGIC Note: As of Spark 1.5.0, the Python API does not yet support multiclass classification for logistic regression; it is expected in a future release.

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params

# Create the initial LogisticRegression model
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)

# Train the model with the training data
lrModel = lr.fit(trainingData)

# COMMAND ----------

# Make predictions on the test data using the Transformer.transform() method.
# LogisticRegression.transform() will only use the 'features' column.
predictions = lrModel.transform(testData)

# COMMAND ----------

predictions.printSchema()

# COMMAND ----------
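# COMMAND ----------

# A minimal evaluation sketch (not part of the original cells): score the
# predictions with a binary classification evaluator, which by default reads
# the rawPrediction and label columns produced by transform().
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()
print("Test areaUnderROC: {}".format(evaluator.evaluate(predictions)))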
nb = NaiveBayes()

# Fit the model
naive_model = nb.fit(train_clean_data)

# Evaluate the model
test_results = naive_model.transform(test_clean_data)
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting spam was: {}".format(acc))
print("--- Execution time: %s seconds ---" % (time.time() - start_time))

######## Logistic Regression ########
start_time = time.time()

# Set up the model
log_reg = LogisticRegression()
log_model = log_reg.fit(train_clean_data)

# Evaluate the model
test_results = log_model.transform(test_clean_data)
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting spam was: {}".format(acc))
print("--- Execution time: %s seconds ---" % (time.time() - start_time))

# Random Forest
start_time = time.time()
rfc = RandomForestClassifier()
# Train model. This also runs the indexers.
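# The snippet cuts off after constructing the classifier. A sketch of the
# matching fit/evaluate step, mirroring the Naive Bayes and logistic
# regression blocks above:
rfc_model = rfc.fit(train_clean_data)
test_results = rfc_model.transform(test_clean_data)
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting spam was: {}".format(acc))
print("--- Execution time: %s seconds ---" % (time.time() - start_time))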