def test_int_to_float(self):
    """Fit with an integer ``elasticNetParam`` given via constructor and via setter."""
    from pyspark.mllib.linalg import Vectors

    rows = [Row(label=1.0, weight=2.0, features=Vectors.dense(1.0))]
    df = self.sc.parallelize(rows).toDF()

    # Integer literal supplied through the constructor.
    estimator = LogisticRegression(elasticNetParam=0)
    estimator.fit(df)

    # Same integer literal supplied through the setter.
    estimator.setElasticNetParam(0)
    estimator.fit(df)
def test_logistic_regression_summary(self):
    """Check that the training summary of a fitted model is callable and typed as expected."""
    from pyspark.mllib.linalg import Vectors

    sqlContext = SQLContext(self.sc)
    rows = [
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], [])),
    ]
    df = sqlContext.createDataFrame(rows, ["label", "weight", "features"])
    estimator = LogisticRegression(
        maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = estimator.fit(df)
    self.assertTrue(model.hasSummary)
    summary = model.summary

    # The API must be callable and return the expected types.
    self.assertIsInstance(summary.predictions, DataFrame)
    self.assertEqual(summary.probabilityCol, "probability")
    self.assertEqual(summary.labelCol, "label")
    self.assertEqual(summary.featuresCol, "features")
    history = summary.objectiveHistory
    self.assertIsInstance(history, list)
    self.assertIsInstance(history[0], float)
    self.assertGreater(summary.totalIterations, 0)
    self.assertIsInstance(summary.roc, DataFrame)
    self.assertAlmostEqual(summary.areaUnderROC, 1.0, 2)
    self.assertIsInstance(summary.pr, DataFrame)
    self.assertIsInstance(summary.fMeasureByThreshold, DataFrame)
    self.assertIsInstance(summary.precisionByThreshold, DataFrame)
    self.assertIsInstance(summary.recallByThreshold, DataFrame)

    # Evaluating on the training dataset must yield a summary with the same
    # values; one check is enough here, the Scala version runs the full test.
    same_summary = model.evaluate(df)
    self.assertAlmostEqual(same_summary.areaUnderROC, summary.areaUnderROC)
def test_binomial_logistic_regression_with_bound(self):
    """Fit a weighted binomial model with bound constraints and check the fitted values."""
    rows = [
        (1.0, 1.0, Vectors.dense(0.0, 5.0)),
        (0.0, 2.0, Vectors.dense(1.0, 2.0)),
        (1.0, 3.0, Vectors.dense(2.0, 1.0)),
        (0.0, 4.0, Vectors.dense(3.0, 3.0)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "weight", "features"])

    coefficient_lower_bounds = Matrices.dense(1, 2, [-1.0, -1.0])
    intercept_upper_bounds = Vectors.dense(0.0)
    lor = LogisticRegression(
        regParam=0.01,
        weightCol="weight",
        lowerBoundsOnCoefficients=coefficient_lower_bounds,
        upperBoundsOnIntercepts=intercept_upper_bounds)
    model = lor.fit(df)

    coefficients_match = np.allclose(
        model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4)
    self.assertTrue(coefficients_match)
    intercept_matches = np.isclose(model.intercept, 0.0, atol=1E-4)
    self.assertTrue(intercept_matches)
def test_multinomial_logistic_regression_with_bound(self):
    """Fit a multinomial model with bound constraints and compare against reference values."""
    data_path = "data/mllib/sample_multiclass_classification_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)

    coefficient_lower_bounds = Matrices.dense(3, 4, range(12))
    intercept_upper_bounds = Vectors.dense(0.0, 0.0, 0.0)
    lor = LogisticRegression(
        regParam=0.01,
        lowerBoundsOnCoefficients=coefficient_lower_bounds,
        upperBoundsOnIntercepts=intercept_upper_bounds)
    model = lor.fit(df)

    expected = [[4.593, 4.5516, 9.0099, 12.2904],
                [1.0, 8.1093, 7.0, 10.0],
                [3.041, 5.0, 8.0, 11.0]]
    # Compare the coefficient matrix row by row.
    for row_index, expected_row in enumerate(expected):
        actual_row = model.coefficientMatrix.toArray()[row_index]
        self.assertTrue(np.allclose(actual_row, expected_row, atol=1E-4))

    intercepts = model.interceptVector.toArray()
    self.assertTrue(
        np.allclose(intercepts, [-0.9057, -1.1392, -0.0033], atol=1E-4))
def test_binary_logistic_regression_summary(self):
    """Check types and values exposed by the summary of a two-class model."""
    rows = [
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], [])),
    ]
    df = self.spark.createDataFrame(rows, ["label", "weight", "features"])
    estimator = LogisticRegression(
        maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = estimator.fit(df)
    self.assertTrue(model.hasSummary)
    summary = model.summary

    # The API must be callable and return the expected types.
    self.assertIsInstance(summary.predictions, DataFrame)
    self.assertEqual(summary.probabilityCol, "probability")
    self.assertEqual(summary.labelCol, "label")
    self.assertEqual(summary.featuresCol, "features")
    self.assertEqual(summary.predictionCol, "prediction")
    history = summary.objectiveHistory
    self.assertIsInstance(history, list)
    self.assertIsInstance(history[0], float)
    self.assertGreater(summary.totalIterations, 0)
    self.assertIsInstance(summary.labels, list)
    self.assertIsInstance(summary.truePositiveRateByLabel, list)
    self.assertIsInstance(summary.falsePositiveRateByLabel, list)
    self.assertIsInstance(summary.precisionByLabel, list)
    self.assertIsInstance(summary.recallByLabel, list)
    self.assertIsInstance(summary.fMeasureByLabel(), list)
    self.assertIsInstance(summary.fMeasureByLabel(1.0), list)
    self.assertIsInstance(summary.roc, DataFrame)
    self.assertAlmostEqual(summary.areaUnderROC, 1.0, 2)
    self.assertIsInstance(summary.pr, DataFrame)
    self.assertIsInstance(summary.fMeasureByThreshold, DataFrame)
    self.assertIsInstance(summary.precisionByThreshold, DataFrame)
    self.assertIsInstance(summary.recallByThreshold, DataFrame)
    self.assertAlmostEqual(summary.accuracy, 1.0, 2)
    self.assertAlmostEqual(summary.weightedTruePositiveRate, 1.0, 2)
    self.assertAlmostEqual(summary.weightedFalsePositiveRate, 0.0, 2)
    self.assertAlmostEqual(summary.weightedRecall, 1.0, 2)
    self.assertAlmostEqual(summary.weightedPrecision, 1.0, 2)
    self.assertAlmostEqual(summary.weightedFMeasure(), 1.0, 2)
    self.assertAlmostEqual(summary.weightedFMeasure(1.0), 1.0, 2)

    # Evaluating on the training dataset must yield a summary with the same
    # values; one check is enough here, the Scala version runs the full test.
    same_summary = model.evaluate(df)
    self.assertIsInstance(same_summary, BinaryLogisticRegressionSummary)
    self.assertAlmostEqual(same_summary.areaUnderROC, summary.areaUnderROC)
def main():
    """Train a logistic-regression weather classifier and write its results.

    Reads image records (JSON) and weather records (CSV) from the
    module-level ``katkam_in_directory`` / ``weather_in_directory``, joins
    them on ``'Date/Time'``, fits a ``LogisticRegression`` on a 60/40 split,
    then writes the predictions (JSON) and the accuracy (text) under
    ``out_directory``.

    The CSV column names are read, one per line, from a file named
    ``schema`` in the current working directory.
    """
    df = spark.read.json(katkam_in_directory)

    # Context manager guarantees the schema file is closed even if reading fails.
    with open('schema') as schema_file:
        schema_lines = [line.strip() for line in schema_file]
    schema = types.StructType(
        [types.StructField(name, types.StringType(), False) for name in schema_lines])

    weather = spark.read.csv(weather_in_directory, schema=schema)
    df = df.join(weather, 'Date/Time')

    # https://stackoverflow.com/questions/39025707/how-to-convert-arraytype-to-densevector-in-pyspark-dataframe
    to_vec = functions.UserDefinedFunction(lambda vs: Vectors.dense(vs), VectorUDT())
    get_rid_of_rain = functions.UserDefinedFunction(lambda vs: rain_gone(vs), types.LongType())
    df = df.select(get_rid_of_rain(df['Weather']).alias('label'),
                   to_vec(df['image']).alias('features'))

    # Do machine learning: fixed seed so the split is reproducible.
    splits = df.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # Logistic Regression Model
    lr = LogisticRegression()
    model = lr.fit(train)
    predictions = model.transform(test)

    # Compute accuracy on the test set
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    # Write the final predictions dataframe to a JSON directory
    predictions.write.json(out_directory, mode='overwrite')

    # Write the final accuracy score to a text file; tide analysis will write
    # to the same file. The ``with`` block closes the file on exit.
    with open(out_directory + '/final-results.txt', 'w+') as fp:
        fp.write('Test set accuracy for weather analysis: ' + str(accuracy))
def downstream_ml_func(features_df, results_dict, layer_index):
    """
    Sample implementation of the downstream ML function.

    Fits a logistic regression on ``features_df``, scores the same DataFrame
    and records the resulting accuracy under ``layer_index``.

    :param features_df: Merged (struct+cnn) feature DataFrame
    :param results_dict: Dictionary object which is used to store downstream ML model performance
        details such as accuracy.
    :param layer_index: Layer index of the CNN of which the current features_df correspond to.
        The layer index is negative pointing the index from the top of the CNN layers
    :return: ``results_dict`` with the accuracy stored at ``layer_index``
    """
    estimator = LogisticRegression(
        labelCol="label", featuresCol="features", maxIter=10, regParam=0.5)
    fitted = estimator.fit(features_df)
    scored = fitted.transform(features_df)

    accuracy_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = accuracy_evaluator.evaluate(scored)

    results_dict[layer_index] = accuracy
    return results_dict
def logistic_classifier(df, conf):
    """Fit a logistic-regression model on ``df`` as configured by ``conf``.

    :param df: training DataFrame passed to ``fit``.
    :param conf: mapping with a ``"params"`` mapping (``maxIter``,
        ``regParam``, ``weightCol`` are read) and a ``"tuning"`` mapping
        (``crossval`` selects cross-validation).
    :return: the fitted model; a ``CrossValidator`` fit result when
        ``conf["tuning"]["crossval"]`` is truthy, otherwise a plain
        ``LogisticRegression`` fit result.

    NOTE: the ``elasticNetParam`` and ``family`` entries of
    ``conf["params"]`` are not applied to the estimator by this function.
    """
    params = conf["params"]
    max_iter = params.get("maxIter")
    reg_param = params.get("regParam")
    weight = params.get("weightCol")

    lr = LogisticRegression(maxIter=max_iter, regParam=reg_param, weightCol=weight)

    if conf["tuning"].get("crossval"):
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        # Fit on the DataFrame that was passed in.
        model = cv.fit(df)
    else:
        # The non-tuned path deliberately leaves maxIter at its default.
        mlor = LogisticRegression(regParam=reg_param, weightCol=weight)
        model = mlor.fit(df)
    return model
def Logistic():
    """Train, persist and evaluate a logistic-regression model.

    Uses the module-level ``train`` and ``test`` datasets. The fitted model
    is saved to ``save/bert_logistic`` and the test accuracy is printed.
    """
    # The estimator; its parameters are printed together with their docs and defaults.
    estimator = LogisticRegression(maxIter=10, regParam=0.01)
    print("LogisticRegression parameters:\n" + estimator.explainParams() + "\n")

    # Learn the model with the parameters stored on the estimator, then persist it.
    fitted_model = estimator.fit(train)
    model_writer = fitted_model.write().overwrite()
    model_writer.save("save/bert_logistic")

    # Score the test data with the fitted transformer.
    predictions = fitted_model.transform(test)

    accuracy_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = accuracy_evaluator.evaluate(predictions)
    print("Test Accuracy = %g " % accuracy)
def test_model_logistic_regression_binary_class(self):
    """Convert a fitted binary LogisticRegression model to ONNX and dump data for comparison."""
    import inspect
    import os

    # Locate the sample data relative to this script.
    this_file = inspect.getfile(inspect.currentframe())
    this_script_dir = os.path.dirname(os.path.abspath(this_file))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)

    # Truncate the features: keep entries 125..129 as a 5-element sparse vector.
    self.spark.udf.register(
        "truncateFeatures",
        lambda x: SparseVector(5, range(0, 5), x.toArray()[125:130]),
        VectorUDT())
    data = original_data.selectExpr(
        "label", "truncateFeatures(features) as features")

    estimator = LogisticRegression(maxIter=100, tol=0.0001)
    model = estimator.fit(data)

    # The name of the input for Logistic Regression is 'features'.
    input_types = [('features', FloatTensorType([1, model.numFeatures]))]
    model_onnx = convert_sparkml(model, 'sparkml logistic regression', input_types)
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # Run the model.
    import pandas

    def _to_series(vector):
        return pandas.Series(vector.toArray())

    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(_to_series).values.astype(numpy.float32)
    expected_labels = predicted.toPandas().prediction.values.astype(numpy.float32)
    expected_probabilities = predicted.toPandas().probability.apply(
        _to_series).values.astype(numpy.float32)
    expected = [expected_labels, expected_probabilities]
    dump_data_and_sparkml_model(
        data_np, expected, model, model_onnx,
        basename="SparkmlLogisticRegression")
def run(data, headers_feature):
    """Train a multinomial logistic regression on ``data`` and print its metrics.

    The dataset is split by ``prepare_dataset``; coefficients, intercepts,
    the two evaluator metrics and the elapsed wall-clock time are printed.
    """
    print('-*-*-*- Iniciando la regresion logistica -*-*-*-')
    start_time = datetime.datetime.now()
    print('Tiempo inicial', start_time)

    # Obtain the training and test data.
    train_data, test_data = prepare_dataset(data, headers_feature)

    # Configure the multinomial logistic regression.
    estimator = LogisticRegression(
        maxIter=200, regParam=0.3, elasticNetParam=0.8,
        labelCol='genre', family='multinomial')

    # Fit the classification model.
    lr_model = estimator.fit(train_data)
    print("Coeficientes: " + str(lr_model.coefficientMatrix))
    print("Intercepto: " + str(lr_model.interceptVector))

    data_to_validate = lr_model.transform(test_data)

    # Evaluate and report each metric in turn.
    for tag, metric in (("ROC", 'areaUnderROC'), ("PR", 'areaUnderPR')):
        evaluator = BinaryClassificationEvaluator(
            labelCol='genre', metricName=metric,
            rawPredictionCol='rawPrediction')
        score = evaluator.evaluate(data_to_validate)
        print("{}:{}".format(tag, score))

    end_time = datetime.datetime.now()
    print('Tiempo final', end_time)
    print('Tiempo transcurrido para regresion logistica', end_time - start_time)
def do_machine_learning(enterteinment_num, politics_num):
    """Train on ``result_articles.txt`` and print the prediction for one sample.

    Each line of ``result_articles.txt`` (in the current working directory)
    is split on whitespace into the columns ``cls``, ``entertainment`` and
    ``politics``. The columns are string-indexed, assembled into a feature
    vector and used to fit a logistic regression. The model is then applied
    to the single point ``(enterteinment_num, politics_num)``.

    :param enterteinment_num: first feature of the sample to classify.
    :param politics_num: second feature of the sample to classify.
    """
    _fp = os.path.join(os.getcwd(), 'result_articles.txt')

    # Load the training data obtained by crawling; ``with`` closes the file
    # even if a line fails to parse.
    frame = []
    with open(_fp, 'r') as _f:
        for line in _f:
            temp = line.split()
            frame.append(temp)
            print(temp)

    df = spark.createDataFrame(frame, ['cls', 'entertainment', 'politics'])
    clsIndexer = StringIndexer(inputCol="cls", outputCol="label")
    i1Indexer = StringIndexer(inputCol="entertainment", outputCol="i1")
    i2Indexer = StringIndexer(inputCol="politics", outputCol="i2")
    va = VectorAssembler(inputCols=["i1", "i2"], outputCol="features")
    pipeline = Pipeline(stages=[clsIndexer, i1Indexer, i2Indexer, va])
    model = pipeline.fit(df)
    df2 = model.transform(df)
    df2.printSchema()
    df2.show()

    # Build the training data and fit the classifier.
    trainDf = df2.select('label', 'features')
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    lrModel = lr.fit(trainDf)

    # Feed in the test sample.
    test0 = spark.sparkContext.parallelize([
        Row(features=Vectors.dense([enterteinment_num, politics_num]))
    ]).toDF()
    result = lrModel.transform(test0).head()
    # Formatted so the printed text is the same under Python 2 and Python 3.
    print("Irregularity ? :  {}".format(result.prediction))
def exec_logistic_regression(self, featuresCol1="features", labelCol1="label",
                             predictionCol1="prediction", maxIter1=30, regParam1=0.3,
                             elasticNetParam1=0, numClass1=2):
    '''
    Build, fit and evaluate a Logistic Regression model.

    Input: featuresCol1: feature column name,
           labelCol1: label column name,
           predictionCol1: prediction column name,
           maxIter1 / regParam1 / elasticNetParam1: model parameters
               (max iterations, regularization parameter, elastic net parameter),
           numClass1: number of class labels
    Output: None
    '''
    # Hyper-parameters are shared by the estimator and by the evaluation report.
    hyper_params = {
        'maxIter': maxIter1,
        'regParam': regParam1,
        'elasticNetParam': elasticNetParam1
    }

    estimator = LogisticRegression(featuresCol=featuresCol1,
                                   labelCol=labelCol1,
                                   predictionCol=predictionCol1,
                                   **hyper_params)

    # Fit on the training data, then score the test data.
    fitted = estimator.fit(self.trainingData)
    predictions = fitted.transform(self.testData)

    # Hand the predictions to the shared evaluator.
    self.model_evaluator(predictions,
                         modelType="Logistic Regression Model",
                         modelParams=str(hyper_params),
                         numClass=numClass1)
def train(self, rdd):
    """
    Train a GLM classifier on ``rdd`` as selected by ``self.options``.

    ``reg_type == "elastic-net"`` uses the spark.ml logistic regression;
    otherwise ``loss`` ("logistic" or "hinge") and ``optimizer`` ("sgd" or
    "l-bfgs") pick the MLlib trainer.

    :param rdd: training data handed to the selected trainer.
    :return:  Trained model to be passed to test.
    :raises Exception: if the loss is not recognised, or if the
        loss/optimizer combination is not supported (including hinge loss
        with any optimizer other than "sgd").
    """
    options = self.options
    if options.reg_type == "elastic-net":  # use spark.ml
        lr = MLLogisticRegression(maxIter=options.num_iterations, regParam=options.reg_param,
                                  elasticNetParam=options.elastic_net_param)
        # TODO: Do not include time for conversion to DataFrame (but this currently matches
        #       the Scala tests)
        df = rdd.toDF()
        lrModel = lr.fit(df)
        numFeatures = len(lrModel.weights)
        numClasses = 2
        return LogisticRegressionModel(lrModel.weights, lrModel.intercept,
                                       numFeatures, numClasses)

    if options.loss == "logistic":
        if options.optimizer == "sgd":
            return LogisticRegressionWithSGD.train(data=rdd,
                                                   iterations=options.num_iterations,
                                                   step=options.step_size,
                                                   miniBatchFraction=1.0,
                                                   regParam=options.reg_param,
                                                   regType=options.reg_type)
        if options.optimizer == "l-bfgs":
            return LogisticRegressionWithLBFGS.train(data=rdd,
                                                     iterations=options.num_iterations,
                                                     regParam=options.reg_param,
                                                     regType=options.reg_type,
                                                     tolerance=0.0)
    elif options.loss == "hinge":
        if options.optimizer == "sgd":
            return SVMWithSGD.train(data=rdd, iterations=options.num_iterations,
                                    step=options.step_size, regParam=options.reg_param,
                                    miniBatchFraction=1.0, regType=options.reg_type)
    else:
        raise Exception("GLMClassificationTest does not recognize loss: %s" % options.loss)

    # A known loss with an unsupported optimizer: fail loudly rather than
    # returning None to the caller.
    raise Exception("GLMClassificationTest cannot run with loss = %s,"
                    " optimizer = %s" % (options.loss, options.optimizer))
def modelTraining(trainSetWoeDF, weightBalance, fn):
    """Fit a weighted logistic regression and persist the fitted pipeline pieces.

    Uses the module-level ``vecAseembler``, ``stringIndexer``, ``savePath``
    and ``curDate``.

    :param trainSetWoeDF: training DataFrame to transform and fit on.
    :param weightBalance: multiplier applied to ``target`` when building the
        ``weight`` column (``target * weightBalance + 1``).
    :param fn: name used as the last directory component when saving.
    :return: tuple of (training set with model output columns, indices of
        coefficients that are strictly greater than zero).
    """
    # Pre-transform the data into the input format the ML estimator requires.
    assembled = vecAseembler.transform(trainSetWoeDF)
    strInd = stringIndexer.fit(assembled)
    indexed = strInd.transform(assembled)
    weight_column = indexed.target * weightBalance + 1
    weighted = indexed.withColumn('weight', weight_column)

    # Model training.
    estimator = LogisticRegression(regParam=0.01, weightCol="weight")
    lrModel = estimator.fit(weighted)
    trainSetWithProba = lrModel.transform(weighted)

    # Save the model and the related fitted transformers.
    assembler_path = savePath + '{}/{}/vecAseembler'.format(curDate, fn)
    vecAseembler.write().overwrite().save(assembler_path)
    indexer_path = savePath + '{}/{}/strInd'.format(curDate, fn)
    strInd.write().overwrite().save(indexer_path)
    model_path = savePath + '{}/{}/lrModel'.format(curDate, fn)
    lrModel.write().overwrite().save(model_path)

    coefficient_values = lrModel.coefficients.toArray()
    coefNotNegtive = np.where(coefficient_values > 0)[0]
    return (trainSetWithProba, coefNotNegtive)
def test_multinomial_logistic_regression_with_bound(self):
    """Bound-constrained multinomial fit must reproduce the reference coefficients."""
    data_path = "data/mllib/sample_multiclass_classification_data.txt"
    reader = self.spark.read.format("libsvm")
    df = reader.load(data_path)

    lor = LogisticRegression(
        regParam=0.01,
        lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
        upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
    model = lor.fit(df)

    expected = [
        [4.593, 4.5516, 9.0099, 12.2904],
        [1.0, 8.1093, 7.0, 10.0],
        [3.041, 5.0, 8.0, 11.0],
    ]
    # One assertion per row of the coefficient matrix.
    for idx, reference_row in enumerate(expected):
        row_matches = np.allclose(
            model.coefficientMatrix.toArray()[idx], reference_row, atol=1E-4)
        self.assertTrue(row_matches)

    intercepts_match = np.allclose(
        model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4)
    self.assertTrue(intercepts_match)
def modeloLogistico(data, labelCol="label", featuresCol="features", weightCol="classWeights"):
    """
    Fit a logistic model on a Spark DataFrame.

    The original documentation states that the DataFrame is expected to have
    the schema produced by ``dataProcessing()``.

    :param data: spark dataframe.
    :param labelCol: string, name of the column holding the response variable.
    :param featuresCol: string, name of the column holding the covariate vectors.
    :param weightCol: string, name of the column passed to the estimator as ``weightCol``.
    :returns: the fitted model.
    """
    estimator = LogisticRegression(
        featuresCol=featuresCol,
        labelCol=labelCol,
        weightCol=weightCol,
    )
    fitted = estimator.fit(data)
    return fitted
def test_binomial_logistic_regression_with_bound(self):
    """Bound-constrained weighted binomial fit must reproduce the reference values."""
    columns = ["label", "weight", "features"]
    samples = [
        (1.0, 1.0, Vectors.dense(0.0, 5.0)),
        (0.0, 2.0, Vectors.dense(1.0, 2.0)),
        (1.0, 3.0, Vectors.dense(2.0, 1.0)),
        (0.0, 4.0, Vectors.dense(3.0, 3.0)),
    ]
    df = self.spark.createDataFrame(samples, columns)

    lor = LogisticRegression(
        regParam=0.01,
        weightCol="weight",
        lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
        upperBoundsOnIntercepts=Vectors.dense(0.0),
    )
    model = lor.fit(df)

    fitted_coefficients = model.coefficients.toArray()
    self.assertTrue(np.allclose(fitted_coefficients, [-0.2944, -0.0484], atol=1e-4))
    fitted_intercept = model.intercept
    self.assertTrue(np.isclose(fitted_intercept, 0.0, atol=1e-4))
def retrain_full_model(data, model_type, paramMap):
    '''
    Retrain the given kind of model on the whole dataset with the best parameters.

    Arguments:
        data {PySpark Dataframe} -- A PySpark Dataframe containing feature vectors and labels
        model_type {str} -- The type of model to train: 'logistic' or 'decisiontree'
        paramMap {dict} -- A dictionary of the best parameter values, passed to ``fit``

    Returns:
        model -- Returns the model retrained on full dataset

    Raises:
        ValueError -- if ``model_type`` is not one of the supported values
    '''
    if model_type == 'logistic':
        estimator = LogisticRegression()
    elif model_type == 'decisiontree':
        estimator = DecisionTreeClassifier()
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            "Unsupported model_type {!r}; expected 'logistic' or 'decisiontree'".format(model_type))
    return estimator.fit(data, paramMap)
def lr(ss, data, label_index, feature_indexs, project_url):
    """Train a multinomial logistic regression on ``data`` and print diagnostics.

    :param ss: not used by this function.
    :param data: DataFrame whose rows are converted to lists and indexed by position.
    :param label_index: position of the label value within each row.
    :param feature_indexs: positions of the feature values within each row.
    :param project_url: not used by this function (only referenced by the
        disabled save/load steps below).

    Values for which ``is_number`` is false are replaced by ``0.0``.
    """
    # 1. Build the training data set.
    def func(x):
        features_data = [
            float(x[feature]) if is_number(x[feature]) else 0.0
            for feature in feature_indexs
        ]
        label_data = float(x[label_index]) if is_number(x[label_index]) else 0.0
        return Row(label=label_data, features=Vectors.dense(features_data))

    training_set = data.rdd.map(list).map(func).toDF()

    # 2. Train the model.
    lr_param = LogisticRegression(regParam=0.01, family='multinomial')
    lr_model = lr_param.fit(training_set)
    print(lr_model.coefficientMatrix)  # coefficients
    print(lr_model.interceptVector)  # intercepts
    # print(lr_model.explainParams())  # parameters and their documentation

    # 3. Save the model (disabled).
    # model_path = project_url + '/model/multipleClassification/lr'
    # lr_model.write().overwrite().save(model_path)
    #
    # 4. Load the model (disabled).
    # lr2 = lr_model.load(model_path)

    # 5. Predict.
    result = lr_model.transform(training_set).head()
    print(result.prediction)

    # 6. Evaluate. The summary object is not a DataFrame, so display the
    # predictions DataFrame it exposes.
    summary = lr_model.evaluate(training_set)
    summary.predictions.show()
def buil_lrmodel(path):
    """Fit a logistic regression on the data at ``path`` and tune it by cross-validation.

    :param path: location handed to ``load_data``.
    :return: tuple ``(cvModel, avg_age)`` with the cross-validated model and
        the average age returned by ``find_avg_age``.

    Evaluation is performed on the same DataFrame used for fitting.
    """
    df = load_data(path)

    # -------------------- preparing the dataset --------------------
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    # ``format`` keeps the printed text identical on Python 2 and Python 3.
    print("count =  {}".format(df.count()))

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    # -------------------- build a model --------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)

    prediction = model.transform(df)
    prediction.show(truncate=False)

    evaluator = BinaryClassificationEvaluator()
    print("classification evaluation : {}".format(evaluator.evaluate(prediction)))

    # -------------------- selecting models with cross validation --------------------
    lr = LogisticRegression()
    grid = (ParamGridBuilder()
            .addGrid(lr.maxIter, [1, 10, 50, 150, 200, 500, 1000])
            .addGrid(lr.regParam, [0.01, 0.05, 0.1])
            .build())
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print("classification evaluation : {}".format(evaluator.evaluate(prediction)))

    return cvModel, avg_age
def buil_lrmodel(path):
    """Fit a logistic regression on the data at ``path`` and tune it by cross-validation.

    :param path: location handed to ``load_data``.
    :return: tuple ``(cvModel, avg_age)`` with the cross-validated model and
        the average age returned by ``find_avg_age``.

    Evaluation is performed on the same DataFrame used for fitting.
    """
    df = load_data(path)

    # -------------------- preparing the dataset --------------------
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    # ``format`` keeps the printed text identical on Python 2 and Python 3.
    print("count =  {}".format(df.count()))

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    # -------------------- build a model --------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)

    prediction = model.transform(df)
    prediction.show(truncate=False)

    evaluator = BinaryClassificationEvaluator()
    print("classification evaluation : {}".format(evaluator.evaluate(prediction)))

    # -------------------- selecting models with cross validation --------------------
    lr = LogisticRegression()
    grid = (ParamGridBuilder()
            .addGrid(lr.maxIter, [1, 10, 50, 150, 200, 500, 1000])
            .addGrid(lr.regParam, [0.01, 0.05, 0.1])
            .build())
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print("classification evaluation : {}".format(evaluator.evaluate(prediction)))

    return cvModel, avg_age
def build_model(df):
    """
    Train and report three classifiers: logistic regression, decision tree
    and random forest.

    The data is split 80/20 with a fixed seed; each model is fitted on the
    training part and its evaluator score on the test part is printed.

    :df: processed dataframe
    :result: None
    """
    # Split the data into train and test sets
    train_data, test_data = df.randomSplit([.8, .2], seed=7)
    print("Training Dataset Count: {0}".format(train_data.count()))
    print("Test Dataset Count: {0}".format(test_data.count()))
    print('')

    def _report(template, fitted_model):
        # Score the held-out data with a fresh evaluator and print the result.
        scored = fitted_model.transform(test_data)
        evaluator = BinaryClassificationEvaluator()
        print(template.format(evaluator.evaluate(scored)))

    print('training Logitic Regression')
    lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
    lr_model = lr.fit(train_data)
    training_summary = lr_model.summary
    _report('Logistic Regression Test Area Under ROC = {0}', lr_model)

    print('\ntraining Decission Tree')
    dt = DecisionTreeClassifier(featuresCol='features', labelCol='label', maxDepth=3)
    _report('Decission Tree Test Area Under ROC = {0}', dt.fit(train_data))

    print('\ntraining Random Forest')
    rf = RandomForestClassifier(featuresCol='features', labelCol='label')
    _report('Random Forest Test Area Under ROC = {0}', rf.fit(train_data))
def binomial_logistic_regression(trainingDataFrame, maxIter=100, regParam=0.0,
                                 elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
                                 standardization=True, aggregationDepth=2):
    """Fit a logistic regression and return the model with its key attributes.

    All keyword arguments are forwarded unchanged to ``LogisticRegression``.

    :param trainingDataFrame: DataFrame to fit on.
    :return: dict with the keys ``"model"``, ``"summary"``, ``"intercept"``
        and ``"coefficients"``.
    """
    estimator = LogisticRegression(
        maxIter=maxIter,
        regParam=regParam,
        elasticNetParam=elasticNetParam,
        tol=tol,
        fitIntercept=fitIntercept,
        standardization=standardization,
        aggregationDepth=aggregationDepth,
    )
    fitted = estimator.fit(trainingDataFrame)
    return {
        "model": fitted,
        "summary": fitted.summary,  # https://goo.gl/i5UFA6
        "intercept": fitted.intercept,
        "coefficients": fitted.coefficients,
    }
def linear_classifier_run(df_training, df_test, whichModel, isSmallSet=False):
    """Train the selected classifier, predict the test set and write the predictions.

    :param df_training: training DataFrame, passed through ``gather_features``.
    :param df_test: test DataFrame, passed through ``gather_features``.
    :param whichModel: one of ``'logisticRegression'``, ``'onevsall'``,
        ``'decisionTree'``, ``'randomForest'``, ``'gbt'`` or ``'nb'``.
    :param isSmallSet: when true the test set keeps its ``Sex`` column and
        the accuracy of the predictions is printed.
    :raises NameError: if ``whichModel`` is not one of the supported names.

    Predictions are written as text to the module-level ``output_file``.
    """
    # Gather train and test sets; a small set keeps Sex for accuracy testing.
    train = gather_features(df_training).select("Scaled_features", "Sex")
    if isSmallSet:
        test = gather_features(df_test).select("Scaled_features", "Sex")
    else:
        test = gather_features(df_test, isTestSet=True).select("Scaled_features")

    # Select classifier.
    if whichModel == 'logisticRegression':
        classifier = LogisticRegression(labelCol="Sex", featuresCol="Scaled_features", maxIter=10)
    elif whichModel == 'onevsall':
        lr = LogisticRegression(labelCol="Sex", featuresCol="Scaled_features", maxIter=10)
        classifier = OneVsRest(classifier=lr, labelCol="Sex", featuresCol="Scaled_features")
    elif whichModel == 'decisionTree':
        classifier = DecisionTreeClassifier(labelCol="Sex", featuresCol="Scaled_features",
                                            maxDepth=3)
    elif whichModel == 'randomForest':
        # Imported locally so this branch does not rely on a file-level import.
        from pyspark.ml.classification import RandomForestClassifier
        classifier = RandomForestClassifier(labelCol="Sex", featuresCol="Scaled_features")
    elif whichModel == 'gbt':
        classifier = GBTClassifier(labelCol="Sex", featuresCol="Scaled_features", maxIter=10)
    elif whichModel == 'nb':
        classifier = NaiveBayes(labelCol="Sex", featuresCol="Scaled_features",
                                smoothing=1.0, modelType="multinomial")
    else:
        raise NameError("Model must be one of the following: logisticRegression, onevsall, decisionTree, randomForest, gbt or nb")

    # Train the model with the selected classifier.
    model = classifier.fit(train)

    # Predict the test set.
    print('Predicting with ', input_linear_method)
    predict_test = model.transform(test)

    # Write to a text file.
    predict_test.select('prediction').rdd.map(lambda x: str(int(x[0]))).saveAsTextFile(output_file)
    print('Output has been written to txt file')

    # Test accuracy if small set.
    if isSmallSet:
        results = predict_test.select("Sex", "prediction").withColumn(
            'Success', (predict_test['Sex'] == predict_test['prediction']))
        print('Accuracy of', whichModel, '= ',
              results.select("Success").where("Success == true").count() / results.count())
def logstic_regression_usecase():
    """
    Fit a logistic regression on a libsvm file and print its training summary.

    Estimator parameters used here:
    maxIter: maximum number of iterations
    regParam: regularization strength
    elasticNetParam: weight that balances the L1 and L2 penalties
    """
    spark = getSparkSession()
    training = spark.read.format("libsvm").load("../data/lib_svm.txt")

    # ``family`` is not set, so the estimator's default is used.
    estimator = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Fit the model
    mlrModel = estimator.fit(training)

    # Print the coefficient matrix and intercept vector of the fitted model
    coefficient_text = str(mlrModel.coefficientMatrix)
    print("Multinomial coefficients: " + coefficient_text)
    intercept_text = str(mlrModel.interceptVector)
    print("Multinomial intercepts: " + intercept_text)

    trainingSummary = mlrModel.summary

    # Objective value per iteration
    history = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for value in history:
        print(value)

    # Receiver-operating characteristic as a DataFrame
    # (false positive rate versus recall).
    trainingSummary.roc.show()
    # Area under the ROC curve
    print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

    # F-measure (harmonic mean) at the different thresholds
    f_measure_by_threshold = trainingSummary.fMeasureByThreshold
    f_measure_by_threshold.show()
def anom_with_lr():
    """Train a logistic regression, report test metrics and export scored data.

    Uses ``split_data()`` for the train / test / ``for_finding_more`` sets
    and the module-level ``home_folder`` and ``test_data_size``. Any
    exception is caught, its traceback printed to stdout, and the function
    returns normally (best-effort behaviour kept from the original).
    """
    try:
        prepared_data = split_data()
        train = prepared_data['train']
        test = prepared_data['test']
        for_finding_more = prepared_data['for_finding_more']

        # regParam = 0 keeps this comparable with the LogisticRegressionWithSGD
        # run used before, which does no regularization by default. With
        # regParam = 0 the value of elasticNetParam should not matter
        # (0 is Ridge/L2 and keeps all features, 1 is LASSO/L1 and performs
        # feature selection).
        # Recorded result with regParam = 0: test accuracy 0.9454, fpr 0.0713,
        # fnr 0.0375, on a sample of 50K test data points.
        lr = LogisticRegression(maxIter=10, regParam=0.0, elasticNetParam=0.0)

        t0 = time()
        model = lr.fit(train)
        tt = time() - t0
        print("Classifier trained in {0} seconds".format(round(tt, 3)))

        t0 = time()
        # Feed the test DataFrame as-is; no need to feed the features only.
        predictions = model.transform(test)
        tt = time() - t0
        print("Prediction made in {0} seconds".format(round(tt, 3)))

        # Add the probability to the test data set for calibration.
        # Each element is (label, predicted_label, predicted_prob).
        labelsAndPreds = predictions.rdd.map(
            lambda p: (p.label, p.prediction, round(p.probability[1], 5)))
        labelsAndPreds.toDF(["label", "predicted_label", "predicted_prob"]).write.format(
            'com.databricks.spark.csv').save(
            home_folder + '/healthcare/data/cloudera_challenge/labelsAndPreds/logistic_regression')

        # float() forces true division; without it the rates truncate to 0
        # under Python 2 integer division.
        test_accuracy = (labelsAndPreds.filter(lambda t: t[0] == t[1]).count()
                         / float(test_data_size))
        fpr = (labelsAndPreds.filter(lambda t: t[0] == 0 and t[1] == 1).count()
               / float(labelsAndPreds.filter(lambda t: t[0] == 0).count()))
        fnr = (labelsAndPreds.filter(lambda t: t[0] == 1 and t[1] == 0).count()
               / float(labelsAndPreds.filter(lambda t: t[0] == 1).count()))
        print("Test accuracy is {0}, fpr is {1}, fnr is {2}".format(
            round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)))

        # toDF() below did not work without round(): some issue with float.
        for_finding_more = model.transform(for_finding_more).rdd.map(
            lambda p: (p.label, round(p.probability[1], 5)))
        for_finding_more = for_finding_more.toDF(["label", "predicted_prob"])
        for_finding_more = for_finding_more.orderBy(for_finding_more.predicted_prob.desc())
        # Recorded result: top one has probability 0.9999, last one 0.05159,
        # 75 of them above 0.99.
        for_finding_more.select('predicted_prob').limit(10000).write.format(
            'com.databricks.spark.csv').save(
            home_folder + '/healthcare/data/cloudera_challenge/additional_10000_from_spark')
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return
def run_logistic_regression(tn_data, ts_data):
    """Fit a weighted logistic regression and report the evaluator score for both sets.

    :param tn_data: training DataFrame.
    :param ts_data: test DataFrame.

    Results go through ``print_to_output_file``; the training summary is
    passed to ``print_perf_summary``.
    """
    estimator = LogisticRegression(
        elasticNetParam=0.5,
        regParam=0.01,
        featuresCol="scaled_features",
        labelCol="output",
        weightCol="classWeights",
        predictionCol="prediction",
    )

    # Fit the model
    fitted = estimator.fit(tn_data)

    scored_train = fitted.transform(tn_data)
    scored_test = fitted.transform(ts_data)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                              labelCol='output')
    train_score = evaluator.evaluate(scored_train)
    print_to_output_file("The area under ROC for train set is " + str(train_score))
    test_score = evaluator.evaluate(scored_test)
    print_to_output_file("The area under ROC for test set is " + str(test_score))

    print_perf_summary(fitted.summary)
def test_should_log_model_with_wildcards_in_allowlist(dataset_binomial, dataset_multinomial):
    """Check ``_should_log_model`` against an allowlist containing wildcard entries."""
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()
    one_vs_rest = OneVsRest(classifier=lor)
    ova1_model = one_vs_rest.fit(dataset_multinomial)

    patched_allowlist = {
        "pyspark.ml.regression.*",
        "pyspark.ml.classification.LogisticRegressionModel",
        "pyspark.ml.feature.*",
    }
    with mock.patch("mlflow.pyspark.ml._log_model_allowlist", patched_allowlist):
        linear = LinearRegression()
        with mlflow.start_run():
            linear_model = linear.fit(dataset_binomial)
            assert _should_log_model(linear_model)
        with mlflow.start_run():
            lor_model = lor.fit(dataset_binomial)
            assert _should_log_model(lor_model)
        assert not _should_log_model(ova1_model)
def test_multiclass_logistic_regression_summary(self):
    """Check types and values exposed by the summary of a three-class model."""
    rows = [
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], [])),
        (2.0, 2.0, Vectors.dense(2.0)),
        (2.0, 2.0, Vectors.dense(1.9)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "weight", "features"])
    estimator = LogisticRegression(
        maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = estimator.fit(df)
    self.assertTrue(model.hasSummary)
    summary = model.summary

    # The API must be callable and return the expected types.
    self.assertIsInstance(summary.predictions, DataFrame)
    self.assertEqual(summary.probabilityCol, "probability")
    self.assertEqual(summary.labelCol, "label")
    self.assertEqual(summary.featuresCol, "features")
    self.assertEqual(summary.predictionCol, "prediction")
    history = summary.objectiveHistory
    self.assertIsInstance(history, list)
    self.assertIsInstance(history[0], float)
    self.assertGreater(summary.totalIterations, 0)
    self.assertIsInstance(summary.labels, list)
    self.assertIsInstance(summary.truePositiveRateByLabel, list)
    self.assertIsInstance(summary.falsePositiveRateByLabel, list)
    self.assertIsInstance(summary.precisionByLabel, list)
    self.assertIsInstance(summary.recallByLabel, list)
    self.assertIsInstance(summary.fMeasureByLabel(), list)
    self.assertIsInstance(summary.fMeasureByLabel(1.0), list)
    self.assertAlmostEqual(summary.accuracy, 0.75, 2)
    self.assertAlmostEqual(summary.weightedTruePositiveRate, 0.75, 2)
    self.assertAlmostEqual(summary.weightedFalsePositiveRate, 0.25, 2)
    self.assertAlmostEqual(summary.weightedRecall, 0.75, 2)
    self.assertAlmostEqual(summary.weightedPrecision, 0.583, 2)
    self.assertAlmostEqual(summary.weightedFMeasure(), 0.65, 2)
    self.assertAlmostEqual(summary.weightedFMeasure(1.0), 0.65, 2)

    # Evaluating on the training dataset must yield a summary with the same
    # values; one check is enough here, the Scala version runs the full test.
    same_summary = model.evaluate(df)
    self.assertAlmostEqual(same_summary.accuracy, summary.accuracy)
def logregelastic(self, cor, ip):
    """Time one elastic-net logistic regression fit on a sample of the RCV1 data.

    Args:
        cor: value converted with ``int`` and used as the number of partitions
            the sampled data is coalesced into.
        ip: value converted with ``float`` and used as the sampling fraction.

    Side effects:
        Stores the wall-clock start and end of the fit in ``self.start_t`` and
        ``self.end_t`` and prints a one-line timing report.
    """
    session_builder = SparkSession.builder.appName("LogisticRegressionWithElasticNet")
    session_builder = session_builder.master("spark://spark-master:7077")
    session_builder = session_builder.config("spark.executor.memory", "512m")
    spark = session_builder.getOrCreate()

    sample_frac = float(ip)
    num_parts = int(cor)

    context = spark.sparkContext
    context.setLogLevel("WARN")
    sql = SQLContext(context)

    # Load the training data, then keep only the requested fraction of it.
    full_set = sql.read.format("libsvm").load("/data/rcv1_train.binary/rcv1_train")
    sampled = full_set.sample(False, sample_frac).coalesce(num_parts)

    estimator = LogisticRegression(maxIter=10, elasticNetParam=0.8)

    # Only the fit itself is timed.
    began = time.time()
    self.start_t = began
    estimator.fit(sampled)
    finished = time.time()
    self.end_t = finished

    print("Cores ", num_parts, "LR sample: ", sample_frac, " took ", (finished - began))
def run_logistic(self):
    '''
    Fit a logistic regression on ``self.trainingData`` and score ``self.testData``.

    A section header and the AUC score are appended to ``self.filename``;
    the AUC is also printed.

    Input:
    -------
    None

    Output:
    -------
    Whatever ``self.create_confusion_matrix`` returns for the predictions
    (described by the original author as a dictionary of confusion-matrix
    scores for this model).
    '''
    estimator = LogisticRegression(maxIter=30, regParam=0.3, elasticNetParam=0)
    scored = estimator.fit(self.trainingData).transform(self.testData)

    # Record which model the following results belong to.
    with open(self.filename, 'a') as report:
        report.write("\n\nLogistic Regression:")

    # Confusion matrix first, then AUC, matching the order of the report.
    matrix = self.create_confusion_matrix(scored)

    auc = self.evaluator.evaluate(scored)
    auc_text = str(auc)
    print("AUC Score: ", auc_text)

    with open(self.filename, 'a') as report:
        report.write("\nAUC Score: " + auc_text)

    return matrix
def main():
    """Train a binomial logistic regression and print its ROC and PR areas.

    Loads and splits the dataset through the sibling helpers, fits the model
    on the training part, prints coefficients and intercept, then evaluates
    the test predictions with two ``BinaryClassificationEvaluator`` metrics.
    """
    sc = init()
    spark = SparkSession(sc)
    train_data, test_data = prepare_dataset(load_dataset(spark))

    print("Encontrando h ....")
    estimator = LogisticRegression(
        maxIter=100,
        regParam=0.3,
        elasticNetParam=0.8,
        labelCol='CLASS',
        family='binomial',
    )
    fitted = estimator.fit(train_data)
    print("Coeficientes: " + str(fitted.coefficients))
    print("Intercept: " + str(fitted.intercept))

    print("Testing model...")
    scored = fitted.transform(test_data)

    # Same evaluator configuration for both metrics; only the metric name varies.
    for metric in ("areaUnderROC", "areaUnderPR"):
        evaluator = BinaryClassificationEvaluator(
            labelCol='CLASS',
            metricName=metric,
            rawPredictionCol='rawPrediction',
        )
        print("{}:{}".format(metric, evaluator.evaluate(scored)))
def get_logistic_regression(self, train_data, test_data, col_to_check):
    """Fit a logistic regression, print diagnostics, and evaluate on ``test_data``.

    Args:
        train_data: data passed to ``LogisticRegression.fit``.
        test_data: data passed to the fitted model's ``evaluate``.
        col_to_check: name of the label column.

    Returns:
        The object returned by ``fitted_model.evaluate(test_data)``.
    """
    rule = "----------------------------------------------------------------"

    print(rule)
    print("")

    estimator = LogisticRegression(labelCol=col_to_check)
    print("this is lr_churn: {}".format(estimator))
    print("this is lr_churn type: {}".format(type(estimator)))

    fitted = estimator.fit(train_data)
    print("this is fitted_churn_model: {}".format(fitted))
    print("this is fitted_churn_model type: {}".format(
        type(fitted)))
    print("what is here:{}".format(dir(fitted)))

    training_summary = fitted.summary
    print("trainning sum: {}".format(training_summary))
    print("trainning sum type: {}".format(type(training_summary)))
    training_summary.predictions.describe().show()
    print("should show the predictions above")

    evaluation = fitted.evaluate(test_data)
    print("this is fitted_churn_model: {}".format(evaluation))
    print("this si fitted_chur_model after evaluate on train data type: {}".format(
        type(evaluation)))
    print("")
    print("showing predictions and labels count: ")
    # show() prints the rows itself; the surrounding print then emits its return value.
    print(evaluation.predictions.show(351))

    print(rule)
    print("")
    return evaluation
def train(train_path, model_name, elasticNetParam=0):
    """Fit a logistic regression on a CSV file and save the model to disk.

    Args:
        train_path: path of the CSV file (read with a header row).
        model_name: directory name for the saved model; ``None`` means ``'model'``.
        elasticNetParam: forwarded to ``LogisticRegression``.

    The model is written to ``<parent of cwd>/models/<model_name>``; a
    directory already at that location is deleted first.
    """
    chosen_name = 'model' if model_name is None else model_name
    target_dir = os.path.join(dirname(os.getcwd()), 'models', chosen_name)
    if os.path.isdir(target_dir):
        shutil.rmtree(target_dir)

    session = (
        SparkSession.builder
        .master('local')
        .appName('Logistic App')
        .getOrCreate()
    )

    # todo Delete the next line
    session.sparkContext.setLogLevel('OFF')

    prepared = mature_data(session.read.csv(train_path, header=True))

    estimator = LogisticRegression(maxIter=10, elasticNetParam=elasticNetParam)
    fitted = estimator.fit(prepared)
    fitted.save(path=target_dir)
def LogisticRegression(trainingData, testData, schemaNames):
    """Fit a logistic regression and report feature names, accuracy, AUC and fit time.

    Args:
        trainingData: data with ``features`` and ``label`` columns, used to fit.
        testData: data the fitted model is applied to for evaluation.
        schemaNames: sequence of feature names, indexed in parallel with the
            model's coefficients.

    Returns:
        A tuple ``(feat, accuracy, areaUC, timer)`` where ``feat`` holds the
        names of the last three coefficients (last one first), ``accuracy``
        and ``areaUC`` are the evaluator results on ``testData``, and
        ``timer`` is the fit duration in minutes.
    """
    # This function shares its name with the estimator class, so the class is
    # imported locally and shadows the function inside this scope.
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    import time

    lr = LogisticRegression(featuresCol='features', labelCol='label',
                            regParam=0.1, maxIter=7)

    start = time.time()
    model = lr.fit(trainingData)
    end = time.time()
    timer = (end - start) / 60

    prediction = model.transform(testData)

    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(prediction)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    areaUC = evaluator.evaluate(prediction)

    weights = model.coefficients.tolist()

    # Select by position rather than by value: looking a coefficient up with
    # list.index() returns the first equal value, which names the wrong
    # feature whenever two coefficients are identical.
    # NOTE(review): this takes the last three coefficients by position, not the
    # three largest — confirm that is the intended selection.
    positions = list(range(len(weights)))[-3:][::-1]
    feat = [schemaNames[position] for position in positions]

    return feat, accuracy, areaUC, timer
def logistic_regression_generator(training_data, deal_id):
    """Fit and save a binomial logistic regression for one deal.

    Args:
        training_data: a training data set, as generated by ``data_prep()``.
        deal_id: the deal to model; the column of that name becomes the label.

    Returns:
        A status message naming the deal whose model was saved.

    See also: https://spark.apache.org/docs/latest/ml-classification-regression.html
    Note: this currently uses LASSO to select parameters.
    """
    labelled = training_data.withColumnRenamed(deal_id, 'label')

    estimator = LogisticRegression(
        maxIter=100,
        regParam=0.0001,
        elasticNetParam=1,
        family="binomial",
    )
    fitted = estimator.fit(labelled)

    # An existing model at the destination is overwritten.
    writer = fitted.write().overwrite()
    writer.save(
        f"s3://rtl-databricks-datascience/lpater/logistic_regression/{deal_id}/"
    )

    return "Saved a Logistic Regression model for " + deal_id + "."
# NOTE(review): this fragment opens with the tail of a statement whose
# beginning is not visible here; the line below is reproduced verbatim.
(161.6, 61.2, 28)]).toDF("height", "weight", "age")

training.show(truncate=False)

# Combine the three numeric columns into a single "features" vector column.
assembler = VectorAssembler(inputCols=["height", "weight", "age"], outputCol="features")

# Add the features column to the training data
assembled_training = assembler.transform(training)

assembled_training.show(truncate=False)

# Model-building algorithm (logistic regression estimator)
# NOTE(review): labelCol "gender" is not among the column names in the
# visible toDF(...) call above — confirm the frame really carries it.
lr = LogisticRegression(maxIter=10, regParam=0.01, labelCol="gender")

# Build the model
model = lr.fit(assembled_training)

# Generate predictions
model.transform(assembled_training).show()

# Pipeline
pipeline = Pipeline(stages=[assembler, lr])

# Build the pipeline model
pipelineModel = pipeline.fit(training)

# Generate predictions using the pipeline model
pipelineModel.transform(training).show()

# Target locations; they are only assigned in this fragment, not yet used.
path1 = "/Users/beginspark/Temp/regression-model"
path2 = "/Users/beginspark/Temp/pipelinemodel"
# Obtain (or reuse) the session for this example.
spark = (
    SparkSession.builder
    .appName("MulticlassLogisticRegressionWithElasticNet")
    .getOrCreate()
)

# $example on$
# Read the multiclass sample data in libsvm format.
libsvm_reader = spark.read.format("libsvm")
training = libsvm_reader.load("data/mllib/sample_multiclass_classification_data.txt")

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Train on the loaded data.
lrModel = lr.fit(training)

# Multinomial models expose a coefficient matrix and an intercept vector.
coefficient_text = str(lrModel.coefficientMatrix)
intercept_text = str(lrModel.interceptVector)
print("Coefficients: \n" + coefficient_text)
print("Intercept: " + intercept_text)

trainingSummary = lrModel.summary

# One objective value per training iteration.
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Per-label metrics follow for the multiclass case.
print("False positive rate by label:")
# COMMAND ---------- from pyspark.ml.classification import LogisticRegression lr = LogisticRegression(labelCol="label",featuresCol="features") # COMMAND ---------- print lr.explainParams() # COMMAND ---------- fittedLR = lr.fit(train) # COMMAND ---------- train, test = df.randomSplit([0.7, 0.3]) # COMMAND ---------- rForm = RFormula() lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features") # COMMAND ----------
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# --- Training -----------------------------------------------------------
print("Fitting the classifier on selected features")
t0 = time()

# Labels are indexed into 'target_indexed', which both the classifier and
# the evaluator read.
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(
    featuresCol='selectedFeatures',
    labelCol='target_indexed',
    maxIter=30,
    regParam=0.01,
)
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='target_indexed',
    metricName='precision',
)

string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print("Done in {} second".format(round(tt,3)))


# In[19]:

# --- Validation ---------------------------------------------------------
print("Testing precision of the model")
t0 = time()

# Vectorise the validation rows with the broadcast dictionary, then reuse the
# indexer and the model fitted on the training data.
vectorize_row = partial(vectorizeBi, dico=dictSel_broad.value)
dfValidSelect = dfValid.map(vectorize_row).toDF(['selectedFeatures','label']).cache()
dfValidIndexed = string_indexer_model.transform(dfValidSelect).cache()
df_valid_pred = lrModel.transform(dfValidIndexed).cache()
res = evaluator.evaluate(df_valid_pred)
print(res)
# One-hot encode the categorical columns.
categorical_columns = ["workclass", "education", "marital_status", "occupation",
                       "relationship", "race", "native_country"]
dfhot = oneHotEncodeColumns(dfnumeric, categorical_columns)

from pyspark.ml.feature import VectorAssembler
# Every column except the last one goes into the feature vector.
va = VectorAssembler(outputCol="features", inputCols=dfhot.columns[:-1])
assembled = va.transform(dfhot)
lpoints = assembled.select("features", "income").withColumnRenamed("income", "label")

#section 8.2.3
# 80/20 split into cached training and validation sets.
splits = lpoints.randomSplit([0.8, 0.2])
adulttrain = splits[0].cache()
adultvalid = splits[1].cache()

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
# Refit after lowering maxIter; setParams updates `lr` in place.
lr.setParams(regParam=0.01, maxIter=500, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
# Bare expressions: displayed when run interactively.
# NOTE(review): `weights` looks like the older name for the coefficient
# vector — confirm it exists on the Spark version in use.
lrmodel.weights
lrmodel.intercept

#section 8.2.3
validpredicts = lrmodel.transform(adultvalid)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
# Evaluate with the evaluator's default metric, then with area under PR.
bceval.evaluate(validpredicts)
bceval.getMetricName()

bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)
print("Features created")

from pyspark.ml.feature import StringIndexer

# Index the string labels; the fitted indexer is reused for the test set below.
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)

print("labels indexed")

# The classifier reads the indexer's output column as its label.
indexed_label_column = string_indexer.getOutputCol()
lr = LogisticRegression(featuresCol='Vectors', labelCol=indexed_label_column)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='target_indexed',
    metricName='precision',
)

lr_model = lr.fit(featIndexed)

# Tokenise and vectorise the test set, index its labels, predict and score.
dfTestTok = tokenizer.transform(dfTest)
vectorize_row = partial(vectorize, dico=dict_broad.value)
featuresTest = dfTestTok.map(vectorize_row).toDF(schema)
testIndexed = string_indexer_model.transform(featuresTest)
df_test_pred = lr_model.transform(testIndexed)
res = evaluator.evaluate(df_test_pred)
print(res)

#test,names=lf.loadUknown('./data/test')
#name_text=zip(names,test)
##for each doc :(name,text):
##apply the model on the vector representation of the text
elastic_net_param = 0.1

# The grid search over regParam is disabled: it is kept below as a bare
# string literal, so none of it runs.
"""
for reg_param in RP:
    lr = LogisticRegression(maxIter = max_iter, regParam=reg_param,elasticNetParam = elastic_net_param,standardization = stand)
    lr = lr.fit(trainDF)
    validateDF_prob = add_probability(validateDF,lr,sc)
    print "======================"
    print "averaged log_loss: ",
    temp = log_loss(validateDF_prob)
    print temp
    if temp < Opt:
        Opt = temp
        reg_param_opt = reg_param
        elastic_net_param_opt = elastic_net_param
"""

# Hard-coded hyper-parameters used in place of the search result.
elastic_net_param_opt = 5e-3
reg_param_opt = 1e-6

lr = LogisticRegression(maxIter = max_iter, regParam=reg_param_opt,elasticNetParam = elastic_net_param_opt,standardization = stand)
# From here on `lr` is the fitted model, not the estimator.
lr = lr.fit(trainDF)

predictions = add_probability(testDF,lr,sc).select("activity_id","outcome")
# Attach the leakage table; the model's own outcome is renamed to "p".
predictions = predictions.join(leakageTest,"activity_id","left_outer").withColumnRenamed("outcome","p")
# NOTE(review): as written, rows whose `leak` is null keep that null and all
# other rows take `p`; this looks inverted relative to "use the leak when
# present, otherwise the prediction" — confirm the intended condition.
predictions = predictions.withColumn("outcome", when( isNull(predictions.leak), predictions.leak).otherwise(predictions.p).alias("outcome"))
predictions.show(5)
# Fixed column name: the frame has "outcome" (created just above), not "outome".
predictions = predictions.select("activity_id","outcome")
predictions.toPandas().to_csv(datapath+"lr.csv",index = False)

#predictions = predictions.select(predictions.probability.values)
#predictions.show(3)
#predictions = predictions.select("activity_id",predictions.outcome.getItem(1).alias("outcome"))
# Model-selection switches (not read anywhere in this fragment).
USE_SVM = True
USE_LR = False
USE_DT = False

# Read Data
sqlContext = SQLContext(sc)
# Training and test CSVs are read with identical options.
csv_options = dict(header='true', inferschema='true', nullValue='NA')
trainData = sqlContext.read.format('com.databricks.spark.csv').options(**csv_options).load('flight/*.csv')
testData = sqlContext.read.format('com.databricks.spark.csv').options(**csv_options).load('test/*.csv')

#Preprocess Data
trainData = preprocess(trainData)
testData = preprocess(testData)

#Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(trainData)
lrprediction = lrModel.transform(testData)
# NOTE(review): the value reported as "Accuracy" is the first probability
# component of the first prediction row, not an accuracy metric.
first_prediction = lrprediction.select("probability").first()
lrselected = first_prediction.probability[0]
result="Logistic Regression Accuracy:"+str(lrselected)+'\n'

#Decision Tree Regression
# The indexers are fitted on training and test data together.
dataset = trainData.unionAll(testData)
label_indexer_stage = StringIndexer(inputCol="label", outputCol="indexedLabel")
labelIndexer = label_indexer_stage.fit(dataset)
feature_indexer_stage = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)
featureIndexer = feature_indexer_stage.fit(dataset)

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model. This also runs the indexers.
dcmodel = pipeline.fit(trainData)
test = test.select('Pclass', 'Sex', 'SibSp', 'Parch')

train = sex_to_bin(train)
test = sex_to_bin(test)

# Sum the 'Sex' column of each set for the sanity-check message below.
men_in_train = train.select('Sex').map(lambda row: row.Sex).sum()
men_in_test = test.select('Sex').map(lambda row: row.Sex).sum()
print("number of men in train and test resp. : %d, %d" % (men_in_train, men_in_test))

# Shape the training rows as (label, features) for Logistic Regression.
# Cached because the fit is iterative.
ntrain = train.map(
    lambda row: Row(label=float(row[0]), features=Vectors.dense(row[1:]))
).toDF().cache()
ntest = test.map(lambda row: Row(features=Vectors.dense(row[0:]))).toDF()

lr = LogisticRegression(maxIter = 100, regParam = 0.1)
model = lr.fit(ntrain)
scored = model.transform(ntest)
pred = scored.select('prediction').map(lambda row: row.prediction)

# Build the submission frame by pairing passenger ids with predictions.
submit = sqlCtx.createDataFrame(testPassengerId.zip(pred), ["PassengerId", "Survived"])
"""
NOTE: rdd1.zip(rdd2) works provided that both RDDs have the same partitioner and the same number
of elements per partition, otherwise should either repartition or can do:
submit = sqlCtx.createDataFrame(pred.zipWithIndex().map(lambda x: (x[1]+892L, x[0])), ["PassengerId", "Survived"])
where 891L is the number training samples
"""

os.chdir(DATADIR)
# The result is small, so it is written through a pandas.DataFrame as csv.
submit.toPandas().to_csv("prediction.csv", index = False)
# Otherwise use saveAsTextFile:
# submit.rdd.saveAsTextFile("/home/ehsan/Python/PySpark/Titanic/data/prediction")
#choose estimator and grid
#estima = NaiveBayes()
#grid = ParamGridBuilder().addGrid(5, [0, 2]).build()
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
    maxIter=20,
)  # choose the model

grid_builder = ParamGridBuilder()
grid_builder.addGrid(lr.maxIter, [0, 1])
grid = grid_builder.build()
# The grid is built to find the best 'alpha' parameter for the model's
# regularisation term: it is an 'elastic net'.
# max.iter defaults to 30; its value could be changed.
# So we will try 30 values between 0 and 1.
# alpha=0 is an L2 regularisation,
# alpha=1 is an L1 regularisation.
# NOTE(review): the notes above describe a search over alpha, but the grid
# built here is over maxIter with the values [0, 1] — confirm which is meant.

print("Cross validation debut")

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName='precision',
)  # choose the evaluator

# NOTE(review): `grid` is not passed to the CrossValidator, and the
# cross-validated fit below is commented out.
cv = CrossValidator(estimator=lr, evaluator=evaluator)
#perform the cross validation and keeps the best value of maxIter
#cvModel = cv.fit(dfTrain)

# Train the model on the whole training set.
model = lr.fit(dfTrain)

# Score the test set to obtain the success rate.
scored_test = model.transform(dfTest)
resultat = evaluator.evaluate(scored_test)
# Joined with a single space so the output matches the two-item print statement.
print(" ".join(["Pourcentage de bonne classification(0-1): ", str(resultat)]))

##Train NaiveBayes
#model=NaiveBayes.train(labeledRDD)
##broadcast the model
#mb=sc.broadcast(model)
#
#test,names=lf.loadUknown('./data/test')
#name_text=zip(names,test)
##for each doc :(name,text):
##apply the model on the vector representation of the text
##return the name and the class
#predictions=sc.parallelize(name_text).map(partial(Predict,dictionary=dict_broad.value,model=mb.value)).collect()
#
def train_logistic(df):
    """Fit a logistic regression on ``df``.

    The estimator is configured from the module-level ``LR_MAX_ITER`` and
    ``LR_REG_PARAM`` settings.

    Returns:
        A tuple ``(estimator, fitted_model)``.
    """
    estimator = LogisticRegression(maxIter=LR_MAX_ITER, regParam=LR_REG_PARAM)
    fitted = estimator.fit(df)
    return estimator, fitted
# NOTE(review): this fragment opens in the middle of a list literal whose
# beginning is not visible here; the rows below are reproduced verbatim.
Row(label=1.0, features=DenseVector([0.0, 1.1, 0.1])),
Row(label=0.0, features=DenseVector([2.0, 1.0, -1.0])),
Row(label=0.0, features=DenseVector([2.0, 1.3, 1.0])),
Row(label=1.0, features=DenseVector([0.0, 1.2, -0.5]))])

# Create a LogisticRegression instance with maxIter = 10.
# This instance is an Estimator.
lr = LogisticRegression(maxIter=10)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# We may also set parameters using setter methods.
lr.setRegParam(0.01)

# Learn a LogisticRegression model.  This uses the parameters stored in lr.
model1 = lr.fit(training)

# Since model1 is a Model (i.e., a Transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters:\n")
pprint.pprint(model1.extractParamMap())

# We may alternatively specify parameters using a parameter map.
# paramMap overrides all lr parameters set earlier.
paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5], lr.probabilityCol: "myProbability"}

# Now learn a new model using the new parameters.
# The map is passed as the second argument to fit().
model2 = lr.fit(training, paramMap)
print("Model 2 was fit using parameters:\n")
sqlContext = SQLContext(sc)

# Training data: four (label, features) tuples with three features each.
training_rows = [
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5])),
]
training = sqlContext.createDataFrame(training_rows, ["label", "features"])

# The estimator; nothing is trained until fit() is called.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Show each parameter with its documentation and default value.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Train with the parameters currently stored on lr.
model1 = lr.fit(training)

# model1 is a Model (a transformer produced by an Estimator), so the
# parameters used during fit() can be inspected.  The output lists
# (name: value) pairs whose names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())

# Parameters can also be supplied as a plain Python dictionary.
paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30  # Replaces the maxIter entry set on the line above.
extra_params = {lr.regParam: 0.1, lr.threshold: 0.55}
paramMap.update(extra_params)  # Several Params at once.

# Param maps are ordinary dictionaries and can be combined.
paramMap2 = {lr.probabilityCol: "myProbability"}  # Renames the output column.
from logreg import collect_one

with SparkController() as sc:
    data_path, npar = './data/a9a', 5
    dataset = MLUtils.loadLibSVMFile(sc, data_path, minPartitions=npar).cache()
    # A driver-side copy of the full data set for the single-node baseline.
    local_data = Worker.from_rows(dataset.collect(), dense=False)
    n, d = local_data.n_samples, local_data.n_features
    print('#samples: {n}; #features: {d}'.format(n=n, d=d))

    # Every descend() run below uses the same optimiser settings.
    descend_options = dict(verbose=1, max_iter=30, l1_reg=0.005, precision='f')

    print('Baseline: training in single node mode...')
    prob = Executor(local_data, n, d, collect_one, logreg_local, cached=True, l2_reg=0.01)
    descend(prob, **descend_options)

    print('Spark ({} partitions): training using peregrine...'.format(npar))
    prob = logistic_regression(dataset, dense=False, l2_reg=0.01)
    descend(prob, **descend_options)

    print('Spark ({} partitions): training using mllib...'.format(npar))
    # NOTE(review): sqlContext is not referenced again in this fragment;
    # presumably constructing it is what makes dataset.toDF() available —
    # confirm before removing.
    sqlContext = SQLContext(sc)
    lr = LogisticRegression(maxIter=300, regParam=0.02, elasticNetParam=0.5, fitIntercept=False)
    # Labels of -1 are mapped to 0 before fitting.
    relabelled = dataset.toDF().replace(-1, 0, 'label').cache()
    lr.fit(relabelled)

    print('Spark/Tensorflow ({} partitions): training using peregrine...'.format(npar))
    prob = logistic_regression(dataset, l2_reg=0.01, tensorflow=True)
    descend(prob, **descend_options)
# MAGIC # MAGIC You can read more about Logistic Regression from the Programming Guide [here](http://spark.apache.org/docs/latest/mllib-linear-methods.html#logistic-regression). In the new Pipelines API, we are now able to perform Elastic net regularization with Logistic Regression, as well as other linear methods. # MAGIC # MAGIC # MAGIC Note: As of Spark 1.5.0, The Python API does not yet support multiclass classification for Logistic Regression, but will be available in future. # COMMAND ---------- from pyspark.ml.classification import LogisticRegression from pyspark.ml.param import Param, Params # Create initial LogisticRegression model lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10) # Train model with Training Data lrModel = lr.fit(trainingData) # COMMAND ---------- # Make predictions on test data using the Transformer.transform() method. # LogisticRegression.transform() will only use the 'features' column. predictions = lrModel.transform(testData) # COMMAND ---------- predictions.printSchema() # COMMAND ---------- # View model's predictions and probabilities of each prediction class # You can select any columns in the above schema to view as well. For example's sake we will choose age & occupation
from __future__ import print_function

# $example on$
from pyspark.ml.classification import LogisticRegression
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Obtain (or reuse) the session for this example.
    spark = (
        SparkSession.builder
        .appName("LogisticRegressionWithElasticNet")
        .getOrCreate()
    )

    # $example on$
    # Read the sample data in libsvm format.
    libsvm_reader = spark.read.format("libsvm")
    training = libsvm_reader.load("data/mllib/sample_libsvm_data.txt")

    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Train on the loaded data.
    lrModel = lr.fit(training)

    # Report the fitted coefficients and intercept.
    coefficient_text = str(lrModel.coefficients)
    intercept_text = str(lrModel.intercept)
    print("Coefficients: " + coefficient_text)
    print("Intercept: " + intercept_text)
    # $example off$

    spark.stop()