def train_lg(training_data, collection):
        # Configure an ML pipeline, which consists of the following stages: hashingTF, idf, and lr.
        hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
        pipeline1 = Pipeline(stages=[hashingTF, idf])

        # Fit the pipeline1 to training documents.
        model1 = pipeline1.fit(training_data)

        lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
        pipeline2 = Pipeline(stages=[model1, lr])

        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
            .addGrid(lr.regParam, [0.1, 0.01]) \
            .build()

        crossval = CrossValidator(estimator=pipeline2,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=BinaryClassificationEvaluator(),
                                  numFolds=5)

        # Run cross-validation, and choose the best set of parameters.
        cvModel = crossval.fit(training_data)

    #     model_path = os.path.join(models_dir , time.strftime("%Y%m%d-%H%M%S") + '_'
    #                             + collection["Id"] + '_'
    #                             + collection["name"])
    #     cvModel.save(sc, model_path)
        return cvModel
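# A minimal usage sketch (an illustration, not part of the original function; training_data
# and collection are assumed to be in scope): the fitted CrossValidatorModel is persisted
# with the ml writer API (the commented-out mllib-style save(sc, path) above is not the ml
# interface) and its per-ParamMap metrics are inspected. The save path is a placeholder.
from pyspark.ml.tuning import CrossValidatorModel

cvModel = train_lg(training_data, collection)
cvModel.write().overwrite().save("models/lg_cv_model")
reloaded = CrossValidatorModel.load("models/lg_cv_model")
print(cvModel.avgMetrics)            # mean AUC for each ParamMap in the grid
print(cvModel.bestModel.stages[-1])  # the fitted LogisticRegressionModel stage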
def build_decisionTree(path):

    df = load_data(path)
    avg_age=find_avg_age(df)
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)

    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1,2,3,5,6,8,10]).build()

    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print "classification evaluation :" , evaluator.evaluate(prediction)

    return cvModel,avg_age
Example no. 3
def train_with_tune(input_df):
    # https://spark.apache.org/docs/latest/ml-tuning.html
    # build the model-training pipeline
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[lr])

    # build the hyperparameter search space
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    # single train/validation split only (instead of k-fold)
    # tvs = TrainValidationSplit(estimator=pipeline,
    #                            estimatorParamMaps=paramGrid,
    #                            evaluator=BinaryClassificationEvaluator(),
    #                            # 80% of the data will be used for training, 20% for validation.
    #                            trainRatio=0.8)

    # k-fold cross validation
    cross_val = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=paramGrid,
                               evaluator=BinaryClassificationEvaluator(),
                               numFolds=3)

    # train and find the best
    cvModel = cross_val.fit(input_df)
    return cvModel.bestModel
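# A minimal usage sketch (train_df and valid_df are hypothetical DataFrames with
# "features"/"label" columns, not defined in the original snippet): score the
# returned best model on held-out data and report its AUC.
best_model = train_with_tune(train_df)
print(BinaryClassificationEvaluator().evaluate(best_model.transform(valid_df)))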
Example no. 4
def buildModel(data, label):
    """
    Build a pipeline to classify `label` against the rest of the classes using binary logistic regression (one-vs-rest)

    :param data: the training data as a DataFrame
    :param label: 0..C-1 where C is the number of classes
    :return: the model as a Transformer
    """
    logging.info('building model for label = %d, type = %s' % (label, type(label)))
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[lr])

    paramGrid = ParamGridBuilder()\
        .addGrid(lr.maxIter, [100])\
        .addGrid(lr.elasticNetParam, [0.0, 1.0])\
        .addGrid(lr.fitIntercept, [True, False])\
        .build()
    crossValidator = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                                    evaluator=BinaryClassificationEvaluator(), numFolds=15)

    dataDF = data.map(lambda point: LabeledPoint(0 if point.label == label else 1, point.features)).toDF()
    model = crossValidator.fit(dataDF)

    return model
Example no. 5
    def test_save_load_simple_estimator(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
        self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
Example no. 6
def pipelineRF(dataDF):
    """

    :param train_data:
    :return:
    """

    print('pipeline starting...')
    labelIndexer_transModel = StringIndexer(inputCol='label',outputCol='indexLabel').fit(dataDF)
    featIndexer_transModel = VectorIndexer(inputCol="features", outputCol="indexed_features",maxCategories=37)\
                                    .fit(dataDF)

    #dtEstimator = DecisionTreeClassifier(featuresCol='indexed_features',labelCol='indexLabel',maxDepth=5,
    #                                      maxBins=40,minInstancesPerNode=1,minInfoGain=0.0,impurity='entropy')

    rfEstimator = RandomForestClassifier(labelCol='indexLabel',featuresCol='indexed_features',
                                         maxBins=40,seed=13)

    pipeline = Pipeline(stages=[labelIndexer_transModel,featIndexer_transModel,rfEstimator])

    paramGrid = ParamGridBuilder()\
        .addGrid(rfEstimator.maxDepth,[5,10,30])\
        .addGrid(rfEstimator.numTrees,[20,50,100]).build()

    evaluator =BinaryClassificationEvaluator(labelCol='indexLabel',
                                             rawPredictionCol='rawPrediction',
                                             metricName='areaUnderROC')
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=10)

    cvModel = cv.fit(dataDF)
    print("pipeline end..., cvModel  was fit using parameters:\n")
    pprint(cvModel.explainParams())


    predictionDF = cvModel.transform(dataDF)

    selected = predictionDF\
        .select('label','indexLabel','prediction','rawPrediction','probability')
    for row in selected.take(5):
        print(row)

    aucMetric = evaluator.evaluate(selected)
    print("auc of test data is:%.3f" % aucMetric)
Example no. 7
def create_models(sqlContext, modelDataframe):
    modelDataframe.registerTempTable("modelDataframeTable")

    # Create dataframes to use on the positive and negative models
    pos = sqlContext.sql("SELECT pos_label AS label, features FROM modelDataframeTable")
    neg = sqlContext.sql("SELECT neg_label AS label, features FROM modelDataframeTable")

    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.2)
    neglr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.25)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize two 2-fold cross-validators.
    posCrossval = CrossValidator(
        estimator=poslr,
        evaluator=posEvaluator,
        estimatorParamMaps=posParamGrid,
        numFolds=2)
    negCrossval = CrossValidator(
        estimator=neglr,
        evaluator=negEvaluator,
        estimatorParamMaps=negParamGrid,
        numFolds=2)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.write().overwrite().save("models/posModel")
    negModel.write().overwrite().save("models/negModel")
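
    # A possible follow-up (a sketch, not in the original): since the folds built by
    # the cross-validator are not accessible, the held-out 50% splits above are used
    # to report AUC for each classifier.
    print("Positive classifier AUC:", posEvaluator.evaluate(posModel.transform(posTest)))
    print("Negative classifier AUC:", negEvaluator.evaluate(negModel.transform(negTest)))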
Example no. 8
    def test_expose_sub_models(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        numFolds = 3
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                            numFolds=numFolds, collectSubModels=True)

        def checkSubModels(subModels):
            self.assertEqual(len(subModels), numFolds)
            for i in range(numFolds):
                self.assertEqual(len(subModels[i]), len(grid))

        cvModel = cv.fit(dataset)
        checkSubModels(cvModel.subModels)

        # Test that the default value for option "persistSubModels" is "true"
        testSubPath = temp_path + "/testCrossValidatorSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        cvModel.save(savingPathWithSubModels)
        cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
        checkSubModels(cvModel3.subModels)
        cvModel4 = cvModel3.copy()
        checkSubModels(cvModel4.subModels)

        savingPathWithoutSubModels = testSubPath + "cvModel2"
        cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
        cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
        self.assertEqual(cvModel2.subModels, None)

        for i in range(numFolds):
            for j in range(len(grid)):
                self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid)
def build_lrmodel(path):

    df = load_data(path)

    #-------------------- preparing the dataset -------------------------------------------

    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    print "count = " , df.count()

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    #------------------ Build a model ----------------------------------------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)

    prediction = model.transform(df)
    prediction.show(truncate=False)

    evaluator = BinaryClassificationEvaluator()
    print "classification evaluation :" , evaluator.evaluate(prediction)


    #-------------- selecting models with cross validation -----------------------------------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1,10,50,150,200,500,1000])\
                            .addGrid(lr.regParam, [0.01, 0.05, 0.1,]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print "classification evaluation :" , evaluator.evaluate(prediction)


    return cvModel,avg_age
def main():
    '''
    Reads the 20-newsgroups training and test Parquet files and prints the area
    under the ROC curve for both the untuned model and the cross-validated model.
    '''

    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures).addGrid(lr.regParam, regParam).build()


    cv = CrossValidator() \
        .setEstimator(pipeline) \
        .setEvaluator(BinaryClassificationEvaluator()) \
        .setEstimatorParamMaps(paramGrid) \
        .setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()


    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
Example no. 11
    def test_fit_minimize_metric(self):
        dataset = self.spark.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="rmse")

        grid = (ParamGridBuilder()
                .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
                .build())
        cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        bestModel = cvModel.bestModel
        bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))

        self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0")
Example no. 12
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
Example no. 13
    def test_save_load_trained_model(self):
        # This tests saving and loading the trained model only.
        # Save/load for CrossValidator will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        lrModel = cvModel.bestModel

        cvModelPath = temp_path + "/cvModel"
        lrModel.save(cvModelPath)
        loadedLrModel = LogisticRegressionModel.load(cvModelPath)
        self.assertEqual(loadedLrModel.uid, lrModel.uid)
        self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
Example no. 14
    def test_copy(self):
        dataset = self.spark.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="rmse")

        grid = (ParamGridBuilder()
                .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
                .build())
        cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        cvCopied = cv.copy()
        self.assertEqual(cv.getEstimator().uid, cvCopied.getEstimator().uid)

        cvModel = cv.fit(dataset)
        cvModelCopied = cvModel.copy()
        for index in range(len(cvModel.avgMetrics)):
            self.assertTrue(abs(cvModel.avgMetrics[index] - cvModelCopied.avgMetrics[index])
                            < 0.0001)
Example no. 15
def train_with_tune(input_df):
    # https://spark.apache.org/docs/latest/ml-tuning.html
    # build a model with GridSearch
    xgboost_params = {
        "eta": 0.023,
        "max_depth": 10,
        "min_child_weight": 0.3,
        "subsample": 0.7,
        "colsample_bytree": 0.82,
        "colsample_bylevel": 0.9,
        "eval_metric": "auc",
        "seed": 49,
        "silent": 1,
        "objective": "binary:logistic",
        "round": 10,
        "nWorkers": 2
    }
    xgb_model = XGBoostClassifier(xgboost_params)
    pipeline = Pipeline(stages=[xgb_model])

    # build the hyperparameter space
    paramGrid = ParamGridBuilder() \
        .addGrid(xgb_model.max_depth, [3, 7]) \
        .addGrid(xgb_model.min_child_weight, [0.1, 0.2, 0.3]) \
        .build()


    # k-fold cross validation
    cross_val = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=paramGrid,
                               evaluator=BinaryClassificationEvaluator(rawPredictionCol="probabilities"),
                               numFolds=3)

    # train and find the best
    cvModel = cross_val.fit(input_df)
    return cvModel.bestModel
Example no. 16
    def test_save_load_nested_estimator(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        ova = OneVsRest(classifier=LogisticRegression())
        lr1 = LogisticRegression().setMaxIter(100)
        lr2 = LogisticRegression().setMaxIter(150)
        grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
        evaluator = MulticlassClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)

        originalParamMap = cv.getEstimatorParamMaps()
        loadedParamMap = loadedCV.getEstimatorParamMaps()
        for i, param in enumerate(loadedParamMap):
            for p in param:
                if p.name == "classifier":
                    self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
                else:
                    self.assertEqual(param[p], originalParamMap[i][p])

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
Example no. 17
    def test_parallel_evaluation(self):
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
        evaluator = BinaryClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cv.setParallelism(1)
        cvSerialModel = cv.fit(dataset)
        cv.setParallelism(2)
        cvParallelModel = cv.fit(dataset)
        self.assertEqual(cvSerialModel.avgMetrics, cvParallelModel.avgMetrics)
Example no. 18
    lr = LogisticRegression(maxIter=10)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
    # This will allow us to jointly choose parameters for all Pipeline stages.
    # A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    # We use a ParamGridBuilder to construct a grid of parameters to search over.
    # With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
    # this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=2)  # use 3+ folds in practice

    # Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(training)

    # Prepare test documents, which are unlabeled.
    Document = Row("id", "text")
    test = sc.parallelize([(4, "spark i j k"),
                           (5, "l m n"),
                           (6, "mapreduce spark"),
                           (7, "apache hadoop")]) \
        .map(lambda x: Document(*x)).toDF()

    # Make predictions on test documents. cvModel uses the best model found (lrModel).
    prediction = cvModel.transform(test)
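
    # A possible follow-up, mirroring the Spark ML tuning guide (a sketch, not in the
    # original snippet): inspect the predicted label and probability per test document.
    selected = prediction.select("id", "text", "probability", "prediction")
    for row in selected.collect():
        print(row)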
Example no. 19
# In[330]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')


# In[331]:

from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
grid=(ParamGridBuilder()
     .baseOn([evaluator.metricName,'precision'])
     .addGrid(dt.maxDepth, [10,20])
     .build())
cv = CrossValidator(estimator=dt, estimatorParamMaps=grid,evaluator=evaluator)


# In[332]:

print "Fitting the decision tree on selected features"
t0 = time()
cv_model = cv.fit(dfTrainIndexed)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[302]:

dfTestIndexed = string_indexer_model.transform(dfTestSelect)
# COMMAND ----------

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# COMMAND ----------

grid = ParamGridBuilder() \
  .addGrid(dtc.maxDepth, [2, 3, 4, 5, 6, 7, 8]) \
  .addGrid(dtc.maxBins, [2, 4, 8]) \
  .build()

# COMMAND ----------

cv = CrossValidator(estimator=pipeline,
                    evaluator=evaluator,
                    estimatorParamMaps=grid,
                    numFolds=3)

# COMMAND ----------

# MAGIC %md Run `CrossValidator`.  `CrossValidator` checks to see if an MLflow tracking server is available.  If so, it logs runs to MLflow:
# MAGIC
# MAGIC * Under the current active run, log info for `CrossValidator`.  (Create a new run if none are active.)
# MAGIC * For each submodel (number of folds of cross-validation x number of ParamMaps tested)
# MAGIC   * Log a run for this submodel, along with the evaluation metric on the held-out data.

# COMMAND ----------

# Explicitly create a new run.
# This allows this cell to be run multiple times.
# If you omit mlflow.start_run(), then this cell could run once,
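
# A sketch of the pattern described above (an illustration; `train` is assumed to be
# the training DataFrame prepared earlier in the notebook):
import mlflow

with mlflow.start_run():
    # CrossValidator logs the parent run plus one child run per submodel.
    cvModel = cv.fit(train)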
Example no. 21
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Setting SVM parameters from user input
user_svm_param_maxIter = [16, 32, 64, 128]
user_svm_param_numFolds = 3

# Settings for SVM - parameter grid search
svm_paramGrid = ParamGridBuilder().addGrid(svmclassifier.maxIter,
                                           user_svm_param_maxIter).build()
evaluator = BinaryClassificationEvaluator()
multiEvaluator = MulticlassClassificationEvaluator()

# Setting parameters for cross-validation
svm_cv = CrossValidator(estimator=pipeline,
                        evaluator=evaluator,
                        estimatorParamMaps=svm_paramGrid,
                        numFolds=user_svm_param_numFolds)
svm_cvmodel = svm_cv.fit(train)

# Evaluating SVM model performance
from pyspark.sql.functions import udf

svm_predictions = svm_cvmodel.transform(test)
auroc = evaluator.evaluate(svm_predictions,
                           {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(svm_predictions,
                          {evaluator.metricName: "areaUnderPR"})
"The AUROC is %s and the AUPR is %s" % (auroc, aupr)

f1score = multiEvaluator.evaluate(svm_predictions,
                                  {multiEvaluator.metricName: "f1"})
Example no. 22

dt = DecisionTreeClassifier(featuresCol='bigramVectors', labelCol=string_indexer.getOutputCol(), maxDepth=10)


from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')


from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
grid=(ParamGridBuilder()
     .baseOn([evaluator.metricName,'precision'])
     .addGrid(dt.maxDepth, [10,20])
     .build())
cv = CrossValidator(estimator=dt, estimatorParamMaps=grid,evaluator=evaluator)

from time import time
print "Start fitting"
t0 = time()
cv_model = cv.fit(featIndexed)
tt = time() - t0

print "Classifier trained in {} seconds".format(round(tt,3))

print "Start preprocessing test data"
t0 = time()

dfTestTok = tokenizer.transform(dfTest)
dfTestBigram = bigram.transform(dfTestTok)
featuresTest=dfTestBigram.map(partial(vectorizeBi,dico=dict_broad.value)).toDF(schema)
Example no. 23
pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)

grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()

print "Grid is build"

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)

print "CV Estimator is defined"

cv_model = cv.fit(dfTrain)

print "Model is fitted"

df_test_pred = cv_model.transform(dfTest)

print "Labels are predicted"

print(evaluator.evaluate(df_test_pred))
Example no. 24
rfModel = model.stages[2]
print(rfModel)  # summary only

### LR

lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [100, 1000, 10000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train)


# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
prediction

selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    print(row)

accuracy = evaluator.evaluate(prediction)
Example no. 25
from pyspark.sql import SQLContext
from pyspark import SparkContext

sc = SparkContext("local", "Pipeline")
sqlContext = SQLContext(sc)
dataset = sqlContext.createDataFrame(
    [(Vectors.dense([0.0]), 0.0),
     (Vectors.dense([0.4]), 1.0),
     (Vectors.dense([0.5]), 0.0),
     (Vectors.dense([0.6]), 1.0),
     (Vectors.dense([1.0]), 1.0)] * 10,
    ["features", "label"])
lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
cvModel = cv.fit(dataset)
print(cvModel.avgMetrics)  # CrossValidator has no `metrics`; the fitted model exposes avgMetrics


Params map: [ 2.80026035  2.77896443  2.52157438  2.77129878  2.68407165  2.29883198]
Metrics: [
    {Param(parent='Tokenizzzer_47c4ad546cc0174c5bf9', name='tokenizer', doc=''): <nltk.tokenize.casual.TweetTokenizer object at 0x105452128>, Param(parent='NGram_499e8ba0c19d556e369c', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='HashingTF_489387460f2680f2d6f8', name='numFeatures', doc='number of features.'): 1048576},
    {Param(parent='Tokenizzzer_47c4ad546cc0174c5bf9', name='tokenizer', doc=''): <nltk.tokenize.casual.TweetTokenizer object at 0x105452128>, Param(parent='NGram_499e8ba0c19d556e369c', name='n', doc='number of elements per n-gram (>=1)'): 2, Param(parent='HashingTF_489387460f2680f2d6f8', name='numFeatures', doc='number of features.'): 1048576},
    {Param(parent='Tokenizzzer_47c4ad546cc0174c5bf9', name='tokenizer', doc=''): <nltk.tokenize.casual.TweetTokenizer object at 0x105452128>, Param(parent='NGram_499e8ba0c19d556e369c', name='n', doc='number of elements per n-gram (>=1)'): 3, Param(parent='HashingTF_489387460f2680f2d6f8', name='numFeatures', doc='number of features.'): 1048576},
    {Param(parent='Tokenizzzer_47c4ad546cc0174c5bf9', name='tokenizer', doc=''): WhitespaceTokenizer(pattern='\\s+', gaps=True, discard_empty=True, flags=56), Param(parent='NGram_499e8ba0c19d556e369c', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='HashingTF_489387460f2680f2d6f8', name='numFeatures', doc='number of features.'): 1048576},
    {Param(parent='Tokenizzzer_47c4ad546cc0174c5bf9', name='tokenizer', doc=''): WhitespaceTokenizer(pattern='\\s+', gaps=True, discard_empty=True, flags=56), Param(parent='NGram_499e8ba0c19d556e369c', name='n', doc='number of elements per n-gram (>=1)'): 2, Param(parent='HashingTF_489387460f2680f2d6f8', name='numFeatures', doc='number of features.'): 1048576},
    {Param(parent='Tokenizzzer_47c4ad546cc0174c5bf9', name='tokenizer', doc=''): WhitespaceTokenizer(pattern='\\s+', gaps=True, discard_empty=True, flags=56), Param(parent='NGram_499e8ba0c19d556e369c', name='n', doc='number of elements per n-gram (>=1)'): 3, Param(parent='HashingTF_489387460f2680f2d6f8', name='numFeatures', doc='number of features.'): 1048576}]

Params map: [ 2.80151508]
Metrics: [{Param(parent='NGram_4b338738c901a197c6db', name='n', doc='number of elements per n-gram (>=1)'): 1, Param(parent='Tokenizzzer_4ef1808e516e9e78d783', name='tokenizer', doc=''): <nltk.tokenize.casual.TweetTokenizer object at 0x1054524a8>, Param(parent='HashingTF_4623a4e0650badd052c3', name='numFeatures', doc='number of features.'): 1048576}]
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

model_new = ALS(userCol="userId",
                itemCol="movieId",
                ratingCol="rating",
                nonnegative=True,
                coldStartStrategy="drop")

# Parameters for tuning
paramGrid = ParamGridBuilder().addGrid(model_new.regParam,
                                       [0.1, 0.01, 0.001]).addGrid(
                                           model_new.rank,
                                           [5, 10, 15]).build()

crossvalidation = CrossValidator(estimator=model_new,
                                 estimatorParamMaps=paramGrid,
                                 evaluator=evaluator,
                                 numFolds=10)

#Using the Best Model
model_cv = crossvalidation.fit(training).bestModel

#Evaluate and print the predictions
print("RMSE value after solving cold start problem is: ",
      evaluator.evaluate(model_cv.transform(test)))

# As we can see, even after CV there isn't much improvement.

# # Step 5. Top 10 movies for all the users

# In[33]:
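# A sketch of what this cell presumably computes (an assumption based on the heading
# above): top-10 movie recommendations per user from the tuned ALS model.
model_cv.recommendForAllUsers(10).show(10, truncate=False)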
pl_rff = basePipeline + [rff]
pg_rff = ParamGridBuilder()\
      .baseOn({pipeline.stages: pl_rff})\
      .build()

# One grid from the individual grids
paramGrid = pg_lr + pg_dt + pg_rff

# COMMAND ----------

# The regression metric can be rmse or r2
# See the metrics here https://spark.apache.org/docs/latest/mllib-evaluation-metrics.html#regression-model-evaluation
# In practice use more than 3 folds; we keep it at 3 here so the run completes quickly
cv = CrossValidator()\
      .setEstimator(pipeline)\
      .setEvaluator(RegressionEvaluator()\
                       .setMetricName("r2"))\
      .setEstimatorParamMaps(paramGrid)\
      .setNumFolds(3)

cvModel = cv.fit(df1)

# COMMAND ----------

# MAGIC %md ## Best and Worst Model

# COMMAND ----------

import numpy as np
# RegressionEvaluator metric name is r2, so higher is better
# http://gim.unmc.edu/dxtests/roc3.htm
print("Best Model")
Example no. 28
def main(context):
    """Main Function takes a Spark SQL Context."""
    #---------------------------------------------------------------------------
    # TASK 1
    # Code for task 1...
    # df = context.read.csv('labeled_data.csv')
    # df.write.parquet("labeled_data.parquet")
    # comments = context.read.json("comments-minimal.json.bz2")
    # comments.write.parquet("comments.parquet")
    # submissions = context.read.json("submissions.json.bz2")
    # submissions.write.parquet("submissions.parquet")
    labeled_data = context.read.parquet('labeled_data.parquet')
    labeled_data = labeled_data.withColumnRenamed("_c0", "Input_id")\
                               .withColumnRenamed("_c1", "labeldem")\
                               .withColumnRenamed("_c2", "labelgop")\
                               .withColumnRenamed("_c3", "labeldjt")
    # labeled_data.show()
    comments = context.read.parquet('comments.parquet')
    # comments.show()
    submissions = context.read.parquet('submissions.parquet')
    # submissions.show()

    #---------------------------------------------------------------------------
    # TASK 2
    # Code for task 2...
    labeled_comments = labeled_data.join(comments,
                                         comments.id == labeled_data.Input_id)
    labeled_comments = labeled_comments.select('Input_id', 'labeldjt', 'body')
    # labeled_comments.show()

    #---------------------------------------------------------------------------
    # TASK 4
    # Code for task 4...
    sanitize_udf = udf(sanitize, ArrayType(StringType()))

    #---------------------------------------------------------------------------
    # TASK 5
    # Code for task 5...
    sanitized_labeled_comments = labeled_comments.select(
        'Input_id', 'labeldjt',
        sanitize_udf('body').alias('raw'))

    #---------------------------------------------------------------------------
    # TASK 6A
    # Code for task 6A...
    cv = CountVectorizer(binary=True,
                         minDF=10.0,
                         inputCol="raw",
                         outputCol="features")
    model = cv.fit(sanitized_labeled_comments)
    sanitized_labeled_comments = model.transform(sanitized_labeled_comments)
    sanitized_labeled_comments.show(truncate=False)
    countVectorizerPath = "count_vectorizer_model"
    model.save(countVectorizerPath)

    #---------------------------------------------------------------------------
    # TASK 6B
    # Code for task 6B...
    # Labels: {1, 0, -1, -99}
    pos = sanitized_labeled_comments.select(
        sanitized_labeled_comments.features,
        sanitized_labeled_comments.labeldjt.cast(IntegerType()))
    pos = pos.withColumnRenamed("labeldjt", "label")
    pos = pos.replace(-1, 0)
    pos = pos.replace(-99, 0)
    # pos.show()

    neg = sanitized_labeled_comments.select(
        sanitized_labeled_comments.features,
        sanitized_labeled_comments.labeldjt.cast(IntegerType()))
    neg = neg.withColumnRenamed("labeldjt", "label")
    neg = neg.replace(1, 0)
    neg = neg.replace(-99, 0)
    neg = neg.replace(-1, 1)
    # neg.show()

    #---------------------------------------------------------------------------
    # TASK 7
    # Code for task 7...
    # ... MACHINE LEARNING PORTION TO TRAIN MODELS - Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)
    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")

    # Positive Model: posModel
    # Negative Model: negModel

    #---------------------------------------------------------------------------
    # TASK 8
    # Code for task 8...
    # ... Make Final Deliverable for Unseen Data - We don't need labeled_data anymore
    strip_t3_udf = udf(strip_t3, StringType())
    sarcastic_or_quote_udf = udf(sarcastic_or_quote, BooleanType())
    # Get Unseen Data
    sanitized_final_deliverable = comments.select('created_utc', strip_t3_udf(comments.link_id).alias('link_id'), 'author_flair_text', 'id', 'body', 'gilded', sanitize_udf('body').alias('raw'), comments.score.alias('c_score'))\
        .filter(sarcastic_or_quote_udf(comments['body'])) #F.when(comments["body"].rlike('^&gt|\/s'), False).otherwise(True))
    # sanitized_final_deliverable.show()

    #---------------------------------------------------------------------------
    # TASK 9
    # Code for task 9...
    # Load models that we saved on previous runs of this script
    model = CountVectorizerModel.load("count_vectorizer_model")
    posModel = CrossValidatorModel.load("project2/pos.model")
    negModel = CrossValidatorModel.load("project2/neg.model")

    # Sanitize TASK 8 - Run the CountVectorizerModel on TASK 8 Relation
    sanitized_final_deliverable = model.transform(sanitized_final_deliverable)

    # Run classifier on unseen data to get positive labels
    posResult = posModel.transform(sanitized_final_deliverable)
    # Rename the 3 new columns to prevent name conflicts
    posResult = posResult.withColumnRenamed("probability", "probability_pos")\
                         .withColumnRenamed("rawPrediction", "rawPrediction_pos")\
                         .withColumnRenamed("prediction", "prediction_pos")
    # Run the classifier on previous positive result to get negative labels too
    result = negModel.transform(posResult)
    # Rename the 3 new columns to make it easier to see which is which
    result = result.withColumnRenamed("probability", "probability_neg")\
                    .withColumnRenamed("rawPrediction", "rawPrediction_neg")\
                    .withColumnRenamed("prediction", "prediction_neg")

    # UDF functions for predicting label based on thresholds
    predict_pos_udf = udf(predict_pos, IntegerType())
    predict_neg_udf = udf(predict_neg, IntegerType())
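    # predict_pos/predict_neg are defined elsewhere in the original project; a
    # threshold rule of roughly this shape is assumed (hypothetical illustration):
    #   def predict_pos(probability):
    #       return 1 if probability[1] > 0.2 else 0
    #   def predict_neg(probability):
    #       return 1 if probability[1] > 0.25 else 0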

    # Make predictions based on probability and threshold:
    result = result.select('created_utc', 'author_flair_text', 'link_id', 'id', 'c_score', 'gilded',\
                                 predict_pos_udf(result.probability_pos).alias('pos'),\
                                 predict_neg_udf(result.probability_neg).alias('neg'))

    result.write.parquet("result.parquet")
    # result.show()

    #---------------------------------------------------------------------------
    # TASK 10
    # Code for task 10...
    # ... Perform Analysis on the Predictions
    result = context.read.parquet("result.parquet")
    submissions = submissions.select('id', 'title',
                                     submissions.score.alias('s_score'))
    result = result.join(submissions,
                         result.link_id == submissions.id)  # .explain()
    result.show()
    context.registerDataFrameAsTable(result, "result")

    # 1. Percentage of Comments that Were Positive/Negative Across ALL Submissions
    task_10_1 = context.sql(
        "SELECT title, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY title"
    )
    task_10_1.show()

    task_10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_1.csv")

    # 2. Percentage of Comments that Were Positive/Negative Across ALL Days
    task_10_2 = context.sql(
        "SELECT FROM_UNIXTIME(created_utc, 'Y-M-d') AS day, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY day ORDER BY day asc"
    )
    task_10_2.show()

    task_10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_2.csv")

    # 3. Percentage of Comments that Were Positive/Negative Across ALL States
    context.registerFunction("check_state_udf", check_state, BooleanType())
    task_10_3 = context.sql(
        "SELECT author_flair_text AS state, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result WHERE check_state_udf(author_flair_text) = True GROUP BY state"
    )
    task_10_3.show()

    task_10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_3.csv")

    # 4A. Percentage of Comments that Were Positive/Negative Across ALL Comment Scores
    task_10_4A = context.sql(
        "SELECT c_score AS comment_score, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY comment_score"
    )
    task_10_4A.show()

    task_10_4A.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_4A.csv")

    # 4B. Percentage of Comments that Were Positive/Negative Across ALL Story Scores
    task_10_4B = context.sql(
        "SELECT s_score AS submission_score, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY submission_score"
    )
    task_10_4B.show()

    task_10_4B.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_4B.csv")

    #---------------------------------------------------------------------------
    # Extra Credit (Task 10)
    # 1. Percentage of Comments that Were Positive/Negative For Gilded and Non-Gilded Comments
    task_10_extra_credit = context.sql(
        "SELECT gilded, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY gilded"
    )
    task_10_extra_credit.show()

    task_10_extra_credit.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("task_10_extra_credit.csv")
Example no. 29
def train(inputs_path: str):

    spark = SparkUtils.build_or_get_session('training')
    df_kids = spark.read.parquet(inputs_path)
    label_col = 'final_status'

    mlflow_tracking_ui = 'http://35.246.84.226'
    mlflow_experiment_name = 'kickstarter'

    mlflow.set_tracking_uri(mlflow_tracking_ui)
    mlflow.set_experiment(experiment_name=mlflow_experiment_name)

    numerical_columns = ['days_campaign', 'hours_prepa', 'goal']
    categorical_columns = ['country_clean', 'currency_clean']
    features = numerical_columns + categorical_columns

    df = df_kids.select(features + [label_col])

    max_iter = 15
    model_specs: Pipeline = build_model(
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
        label_col=label_col,
        max_iter=max_iter)

    df_train, df_test = df.randomSplit([0.8, 0.2], seed=12345)
    df_train = df_train.cache()

    evaluator = BinaryClassificationEvaluator() \
        .setMetricName('areaUnderROC') \
        .setRawPredictionCol('rawPrediction') \
        .setLabelCol('final_status')

    gbt = model_specs.getStages()[-1]

    params_grid = ParamGridBuilder()\
        .addGrid(gbt.maxDepth, [6]) \
        .addGrid(gbt.maxIter, [15]) \
        .addGrid(gbt.maxBins, [32])\
        .build()

    cross_val = CrossValidator(estimator=model_specs,
                               estimatorParamMaps=params_grid,
                               evaluator=evaluator,
                               numFolds=2)

    with mlflow.start_run() as active_run:
        logger.info(f'Cross evaluating model on {df_train.count()} lines')

        cross_val_model: CrossValidatorModel = cross_val.fit(df_train)
        model = cross_val_model.bestModel

        logger.info('Evaluating model')
        train_metrics = evaluator.evaluate(model.transform(df_train))
        metrics = {'train_auc': train_metrics}

        test_metrics = evaluator.evaluate(model.transform(df_test))
        metrics.update({'test_auc': test_metrics})
        logger.info(f'Model metrics: {metrics}')

        logger.info('Logging to mlflow')
        mlflow_params = {'model_class': 'gbt', 'max_iter': max_iter}
        mlflow.log_params(mlflow_params)
        mlflow.log_metrics(metrics)
        log_model(model, 'model')
        model_uri = mlflow.get_artifact_uri(artifact_path='model')
        logger.info(f'Model successfully trained and saved @ {model_uri}')
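
# A minimal sketch (assumptions: `log_model` above refers to mlflow.spark.log_model,
# and `model_uri`/`df_test` come from a run of train()): reload the logged model and
# score the test set with it.
import mlflow.spark

reloaded_model = mlflow.spark.load_model(model_uri)
reloaded_model.transform(df_test).show(5)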
Example no. 30
import pandas as pd
from sklearn.metrics import make_scorer, mean_absolute_error

from pyspark.ml.tuning import CrossValidator
from mmlspark import LightGBMRegressor

# today_str = datetime.date.today().strftime("%Y%m%d")
today_str = "20190126"


def mae_metric(y_true, predict):
    mae = mean_absolute_error(y_true, predict)
    return 1 / (1 + mae)


my_score = make_scorer(mae_metric, greater_is_better=True)
train_path = "E:/lgb/train_" + today_str + ".csv"
test_path = "E:/lgb/test_" + today_str + ".csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

X_train, y_train = train_df.drop(['user_id', 'target'],
                                 axis=1), train_df['target']
X_test = test_df.drop(['user_id'], axis=1).values
X_train = X_train.values
y_train = y_train.values

lgb = LightGBMRegressor(objective="quantile",
                        alpha=0.2,
                        learningRate=0.01,
                        numLeaves=31)

cv = CrossValidator(estimator=lgb, numFolds=5)
Example no. 31
param_grid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 50, 75, 100]) \
    .addGrid(als.maxIter, [5, 50, 75, 100]) \
    .addGrid(als.regParam, [.01, .05, .1, .15]) \
    .build()
# Define evaluator as RMSE
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
# Print the number of parameter combinations to be tested
print("Num models to be tested using param_grid:", len(param_grid))

# COMMAND ----------

# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=5)
model = als.fit(training)
predictions = model.transform(test)
predictions.show(n = 10)

# COMMAND ----------

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# COMMAND ----------

# Generate n recommendations for all users
ALS_recommendations = model.recommendForAllUsers(numItems=10)  # top 10 items per user
ALS_recommendations.show(n = 10)
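
# A sketch (not part of the original cell): the grid search defined above is never
# fit; actually running it and scoring the winning model would look like this
# (expensive: 64 parameter combinations x 5 folds).
cv_model = cv.fit(training)
best_als = cv_model.bestModel
print("Best-model RMSE: " + str(evaluator.evaluate(best_als.transform(test))))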
# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.1, 0.5, 1.0, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.5, 0.8, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10, 20])
             .build())

# COMMAND ----------

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross validations
cvModel = cv.fit(trainingData)

# COMMAND ----------

# Use test set here so we can measure the accuracy of our model on new data
predictions = cvModel.transform(testData)

# COMMAND ----------

# cvModel uses the best model found from the Cross Validation
# Evaluate best model
evaluator.evaluate(predictions)
Example no. 33
#trainingDataDF, testingDataDF = trainingData2.randomSplit([0.8, 0.2], seed=0L)

# COMMAND ----------

pipeline = Pipeline(stages=[glm])
pipeline2 = Pipeline(stages=[rfr])

# COMMAND ----------

paramGrid = ParamGridBuilder().addGrid(glm.maxIter, [8, 10, 12]).addGrid(glm.regParam, [0.4, 0.6, 0.8]).build()
paramGrid2 = ParamGridBuilder().addGrid(rfr.maxDepth, [20, 25]).addGrid(rfr.maxBins, [32, 48]).build()

# COMMAND ----------

crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=RegressionEvaluator(metricName = "mae"), numFolds=5)
crossval2 = CrossValidator(estimator=pipeline2, estimatorParamMaps=paramGrid2, evaluator=RegressionEvaluator(metricName = "mae"), numFolds=5)

# COMMAND ----------

trainingDataSJ = trainingData2.filter("city == 'sj'")
trainingDataIQ = trainingData2.filter("city == 'iq'")
testingDataSJ = testingData2.filter("city == 'sj'")
testingDataIQ = testingData2.filter("city == 'iq'")
#testingData2SJ = testingData2.filter("city == 'sj'")
#testingData2IQ = testingData2.filter("city == 'iq'")

# COMMAND ----------

cvModel = crossval2.fit(trainingDataSJ)#RFR
cvModel2 = crossval2.fit(trainingDataIQ)#RFR
# COMMAND ----------

#Define Pipeline
pipeline = Pipeline(stages=[
    Neighborhood_indexer, YearBuilt_indexer, MoSold_indexer, YrSold_indexer,
    assembler, lr
])

# COMMAND ----------

paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.05, 0.01])\
    .addGrid(lr.fitIntercept, [False, True])\
    .addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0]).build()
evaluator = RegressionEvaluator(metricName="rmse", labelCol="label")
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)
cvModel = crossval.fit(train)

# COMMAND ----------

prediction = cvModel.transform(test)

# COMMAND ----------

display(prediction.selectExpr("id as  Id", "prediction as SalePrice"))

# COMMAND ----------
def random_forest_classifier(training_data, test_data, validation_data):
    # ROC: 0.73
    # rf = RandomForestClassifier(featuresCol='scaled_features', labelCol='label')

    # ROC: 0.75
    # rf = RandomForestClassifier(featuresCol='scaled_features', labelCol='label', numTrees=50, maxDepth=30, maxBins=32)

    # ROC: 0.75
    # rf = RandomForestClassifier(featuresCol='scaled_features', labelCol='label', weightCol='classWeights', numTrees=50,
    #                             maxDepth=30, maxBins=32)

    # ROC: 0.75
    # rf = RandomForestClassifier(featuresCol='scaled_features', labelCol='label', numTrees=50, impurity='entropy',
    #                             maxDepth=30, maxBins=32)

    # ROC: 0.70
    # rf = RandomForestClassifier(featuresCol='scaled_features', labelCol='label', numTrees=50, impurity='entropy',
    #                             maxDepth=30, maxBins=2)

    # ROC: 0.76
    # rf = RandomForestClassifier(featuresCol='scaled_features', labelCol='label', numTrees=100,
    #                             maxDepth=30, maxBins=100)

    rf = RandomForestClassifier(featuresCol='scaled_features',
                                labelCol='label',
                                weightCol='classWeights',
                                numTrees=25,
                                maxDepth=5,
                                maxBins=32)

    rfModel = rf.fit(training_data)

    # print(rfModel.featureImportances)

    # Plot roc curve
    roc_plot(rfModel)

    predict_valid = rfModel.transform(validation_data)
    # predict_train = rfModel.transform(training_data)
    predict_valid.show(5)

    evaluate_metrics(predict_valid)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                              labelCol='label',
                                              metricName="areaUnderROC")

    model_evaluator(evaluator=evaluator,
                    evaluator_name="areaUnderROC",
                    data=predict_valid,
                    data_type="valid_data")

    # predict_final = rfModel.transform(test_data)
    #
    # model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=predict_final,
    #                 data_type="test_data")

    # print("\n\nParameter Grid and cross validation")
    paramGrid = ParamGridBuilder() \
        .addGrid(rf.maxDepth, [2, 4, 6]) \
        .addGrid(rf.maxBins, [20, 60]) \
        .addGrid(rf.numTrees, [5, 20]) \
        .build()

    # Create 5-fold CrossValidator
    cv = CrossValidator(estimator=rf,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=5)

    # Run cross validations.  This can take about 6 minutes since it is training over 20 trees!
    cvModel = cv.fit(training_data)

    predict_cross_valid = cvModel.transform(validation_data)

    model_evaluator(evaluator=evaluator,
                    evaluator_name="areaUnderROC",
                    data=predict_cross_valid,
                    data_type="valid_data")

    predict_final = cvModel.bestModel.transform(test_data)

    model_evaluator(evaluator=evaluator,
                    evaluator_name="areaUnderROC",
                    data=predict_final,
                    data_type="test_data")


# K fold Cross Validation

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = ParamGridBuilder()\
    .addGrid(logR.aggregationDepth,[2,5,10])\
    .addGrid(logR.elasticNetParam,[0.0, 0.5, 1.0])\
    .addGrid(logR.fitIntercept,[False, True])\
    .addGrid(logR.maxIter,[10, 100])\
    .addGrid(logR.regParam,[0.01, 0.5, 2.0]) \
    .build()

CV = CrossValidator(estimator= logR, estimatorParamMaps=paramGrid, evaluator=evaluator_AUC, numFolds= 5)
CVModel = CV.fit(train)

# Best Model 

Best_Logm = CVModel.bestModel


print(Best_Logm.coefficients)
print(Best_Logm.intercept)

predict_train_cv=CVModel.transform(train)
predict_test_cv=CVModel.transform(test)

predict_train_cv_pd = predict_train_cv.toPandas()
predict_test_cv_pd = predict_test_cv.toPandas()
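# Sketch: evaluator_AUC (defined above for the cross-validation) can also score the
# Spark prediction DataFrames directly, before the pandas conversion is used elsewhere.
print("Train AUC:", evaluator_AUC.evaluate(predict_train_cv))
print("Test AUC:", evaluator_AUC.evaluate(predict_test_cv))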
def decision_tree_classifier(training_data, test_data, validation_data):
    # ROC 0.69
    # dt = DecisionTreeClassifier(featuresCol='scaled_features', labelCol='label', maxDepth=3)

    # ROC 0.46
    # dt = DecisionTreeClassifier(featuresCol='scaled_features', labelCol='label', maxDepth=10)

    # ROC 0.68
    dt = DecisionTreeClassifier(featuresCol='scaled_features',
                                labelCol='label',
                                maxDepth=3,
                                impurity='entropy')

    model = dt.fit(training_data)

    predict_valid = model.transform(validation_data)
    # predict_train = model.transform(training_data)
    # predict_valid.show(10)

    evaluate_metrics(predict_valid)

    predict_valid.select('*').show(10)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                              labelCol='label',
                                              metricName="areaUnderROC")

    model_evaluator(evaluator=evaluator,
                    evaluator_name="areaUnderROC",
                    data=predict_valid,
                    data_type="valid_data")

    print("\n\nParameter Grid and cross validation")

    paramGrid = ParamGridBuilder() \
        .addGrid(dt.maxDepth, [1, 2, 6, 10]) \
        .addGrid(dt.maxBins, [20, 40, 80]) \
        .build()

    # Create 5-fold CrossValidator
    cv = CrossValidator(estimator=dt,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=5)

    # Run cross validations
    cvModel = cv.fit(training_data)

    print("numNodes = ", cvModel.bestModel.numNodes)
    print("depth = ", cvModel.bestModel.depth)

    # Use the validation set to measure how the model does on new data
    predict_cross_valid = cvModel.transform(validation_data)
    # cvModel uses the best model found from the Cross Validation
    # Evaluate best model

    # ROC 0.706, slightly better than Logistic Regression
    model_evaluator(evaluator=evaluator,
                    evaluator_name="areaUnderROC",
                    data=predict_cross_valid,
                    data_type="valid_data")
Exemplo n.º 38
pandas_df = pd.DataFrame(X_train_t)
pandas_df["label"] = y_train
spark_df = spark.createDataFrame(pandas_df)
assembler = VectorAssembler(inputCols=[str(a) for a in pandas_df.columns[:-1]],
                            outputCol="features")

# spark ML logistic regression w/ grid search
start = time.time()
lr = LR()
pipeline = Pipeline(stages=[assembler, lr])
paramGrid = ParamGridBuilder().addGrid(lr.regParam,
                                       [10.0, 1.0, 0.1, 0.01]).build()
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(),
    numFolds=5,
    parallelism=8,
)
cvModel = crossval.fit(spark_df)
print("-- spark ML LR --")
print("Train Time: {0}".format(time.time() - start))
print("Best Model CV Score: {0}".format(np.mean(cvModel.avgMetrics)))

# test holdout
pandas_df = pd.DataFrame(X_test_t)
pandas_df["label"] = y_test
eval_df = spark.createDataFrame(pandas_df)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("Holdout F1: {0}".format(evaluator.evaluate(
    cvModel.transform(spark_df))))
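# Sketch: MulticlassClassificationEvaluator defaults to f1; an explicit accuracy check
# on the same holdout frame (eval_df) for comparison.
acc_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="accuracy")
print("Holdout accuracy: {0}".format(acc_evaluator.evaluate(cvModel.transform(eval_df))))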
def weighted_logistic_regression(training_data, test_data, validation_data):
    # ROC: 0.69
    lr = LogisticRegression(featuresCol='scaled_features',
                            labelCol='label',
                            weightCol='classWeights',
                            maxIter=100)
    # Always the same prediction. ROC: 0.5
    # lr = LogisticRegression(featuresCol='scaled_features',labelCol='label',maxIter=100, regParam=0.3, elasticNetParam=0.8)
    # A little better, still not very good. ROC: 0.663
    # lr = LogisticRegression(featuresCol='scaled_features', labelCol='label', maxIter=100, regParam=0.1,
    #                         elasticNetParam=0.8)
    # ROC: 0.60
    # lr = LogisticRegression(featuresCol='scaled_features', labelCol='label', maxIter=20, regParam=0.01,
    #                         elasticNetParam=0.3)
    # ROC: 0.61
    # lr = LogisticRegression(featuresCol='scaled_features', labelCol='label', maxIter=20, regParam=1e-10,
    #                         elasticNetParam=0.2)
    # ROC 0.65. It seems that in the most iterations things are getting improved
    # lr = LogisticRegression(featuresCol='scaled_features', labelCol='label', maxIter=100, regParam=1e-10,
    #                         elasticNetParam=0.2)
    # ROC: 0.66 The best one. However, to improve we must move to other choices
    # lr = LogisticRegression(featuresCol='scaled_features', labelCol='label',weightCol='classWeights', maxIter=100, regParam=0.01,
    #                         elasticNetParam=0.2)

    # Train model using training Data
    model = lr.fit(training_data)

    metric_plotting(model)

    print("\nCoefficients: " + str(model.coefficients))
    print("Intercept: " + str(model.intercept))

    # Make predictions on test data using the transform method
    # LogisticRegression.transform() will only use the features column
    # predict_train = model.transform(training_data)
    # predict_test = model.transform(test_data)
    predict_valid = model.transform(validation_data)

    evaluate_metrics(predict_valid)

    # View the predictions
    predict_valid.select('*').show(10)

    # Evaluate the model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                              labelCol='label',
                                              metricName="areaUnderROC")

    # After experimenting and reading more about different metrics, areaUnderROC metric seems to be proper for our purposes
    # evaluator3 = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='label',
    #                                            metricName="areaUnderPR")
    # evaluator2 = MulticlassClassificationEvaluator(labelCol='label', metricName='accuracy')

    model_evaluator(evaluator=evaluator,
                    evaluator_name="areaUnderROC",
                    data=predict_valid,
                    data_type="valid_data")
    # model_evaluator(evaluator=evaluator2, evaluator_name="accuracy", data=predict_valid, data_type="valid_data")
    # model_evaluator(evaluator=evaluator3, evaluator_name="areaUnderPR", data=predict_valid, data_type="valid_data")

    # Create ParamGrid for Cross Validation
    # This grid takes a while. Choose another one for the next implementation
    # ROC: 0.694. Took for ever with these parameters. Execute with different parameters.
    # This seems to be the best we are going to get with this model
    # paramGrid = ParamGridBuilder() \
    #     .addGrid(lr.aggregationDepth, [2, 5, 10]) \
    #     .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    #     .addGrid(lr.fitIntercept, [False, True]) \
    #     .addGrid(lr.maxIter, [10, 100, 1000]) \
    #     .addGrid(lr.regParam, [0.01, 0.5, 2.0]) \
    #     .build()

    print("\n\nParameter Grid and cross validation")
    # ROC: 0.694
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.aggregationDepth, [2, 5, 10]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
        .addGrid(lr.fitIntercept, [False, True]) \
        .addGrid(lr.maxIter, [20, 50, 100]) \
        .addGrid(lr.regParam, [0.01, 0.2, 1.0]) \
        .build()

    # Create 5-fold CrossValidator
    cv = CrossValidator(estimator=lr,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=5)
    # Run cross validations
    cvModel = cv.fit(training_data)

    # predict_train = cvModel.transform(training_data)
    predict_cross_valid = cvModel.transform(validation_data)
    model_evaluator(evaluator=evaluator,
                    evaluator_name="areaUnderROC",
                    data=predict_cross_valid,
                    data_type="valid_data")
])
evaluator = MulticlassClassificationEvaluator(labelCol="HasDetections",
                                              predictionCol="prediction",
                                              metricName="accuracy")

print("Configuring CrossValidation")
params = ParamGridBuilder() \
   .addGrid(categorical_hasher.numFeatures, [2048]) \
   .addGrid(regression.fitIntercept, [True]) \
   .addGrid(regression.maxIter, [100]) \
   .addGrid(regression.threshold, [0.5]) \
   .addGrid(regression.standardization, [False]) \
   .build()

validator = CrossValidator(estimator=pipeline,
                           estimatorParamMaps=params,
                           evaluator=evaluator,
                           numFolds=3)

print("Fitting -> Training Data")
pipeline_model = validator.fit(train)

print("Fitting -> Test Data")
predictions = pipeline_model.transform(test)
predictions.select("HasDetections", "MachineIdentifier", "probability",
                   "prediction").show(truncate=False)

print("Computing Accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = {0}".format(accuracy))

print("Saving Pipeline Model")
    # Generate top 10 movie recommendations for a specified set of users
    # take 3 users, keeping the userIdInt column
    print('==============recommendForUserSubset==============')
    users = ratingSamples.select(als.getUserCol()).distinct().limit(3)
    userSubsetRecs = model.recommendForUserSubset(users, 10)  # recommend movies only for these three users
    userSubsetRecs.show(5, False)

    # Generate top 10 user recommendations for a specified set of movies
    # take 3 movies, keeping the movieIdInt column
    print('==============recommendForItemSubset==============')
    movies = ratingSamples.select(als.getItemCol()).distinct().limit(3)
    movieSubSetRecs = model.recommendForItemSubset(movies, 10)  # recommend users only for these three movies
    movieSubSetRecs.show(5, False)

    # hyperparameter search for the model, using K-fold cross-validation
    print('==============CrossValidator==============')
    paramGrid = ParamGridBuilder().addGrid(als.regParam, [0.01]).build()
    cv = CrossValidator(estimator=als,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=10)
    cvModel = cv.fit(ratingSamples)

    # average metric for each hyperparameter combination in the grid
    avgMetrics = cvModel.avgMetrics
    print('avgMetrics:', avgMetrics)
    # best model
    print('bestModel:', cvModel.bestModel)
    cvModel.bestModel.recommendForAllUsers(10).show(10, False)
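    # Sketch: score the tuned model on the rating sample with the evaluator defined earlier
    # for this example (its metric and label/prediction columns are reused as-is).
    print('best model metric on ratingSamples:', evaluator.evaluate(cvModel.bestModel.transform(ratingSamples)))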

    spark.stop()
Exemplo n.º 42
# Building a RF model

rf = RandomForestRegressor(labelCol="x4", featuresCol="indexedFeatures",numTrees=3, maxDepth=29, maxBins=32, featureSubsetStrategy="auto")

# Pipeline

pipeline = Pipeline(stages=[featureIndexer, rf])

# Cross Validation

paramGrid = ParamGridBuilder().build()

evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="x4",metricName="rmse")
evaluator2 = RegressionEvaluator(predictionCol="prediction", labelCol="x4",metricName="r2")

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5) # 5 fold CV

cvModel = cv.fit(trainDF)
cvModel1 = cv.fit(trainingData)

predict = cvModel.transform(testDF)
file1 = open('predictionFile.txt', 'w')
predict.select("prediction").show(10)
file1.write('\n'.join(map(str, predict.select("prediction").collect())))
file1.close()

predict_cv = cvModel1.transform(testData)

rmse = evaluator.evaluate(predict_cv)

print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
glr = GeneralizedLinearRegression().setFamily("gaussian").setLink("identity")
pipeline = Pipeline().setStages([glr])
params = ParamGridBuilder().addGrid(glr.regParam, [0, 0.5, 1]).build()
evaluator = RegressionEvaluator()\
  .setMetricName("rmse")\
  .setPredictionCol("prediction")\
  .setLabelCol("label")
cv = CrossValidator()\
  .setEstimator(pipeline)\
  .setEvaluator(evaluator)\
  .setEstimatorParamMaps(params)\
  .setNumFolds(2) # should always be 3 or more but this dataset is small
model = cv.fit(df)


# COMMAND ----------

from pyspark.mllib.evaluation import RegressionMetrics
out = model.transform(df)\
  .select("prediction", "label").rdd.map(lambda x: (float(x[0]), float(x[1])))
metrics = RegressionMetrics(out)
print "MSE: " + str(metrics.meanSquaredError)
print "RMSE: " + str(metrics.rootMeanSquaredError)
print "R-squared: " + str(metrics.r2)
print "MAE: " + str(metrics.meanAbsoluteError)
Exemplo n.º 44
#Further, we incur a cost in bias and overfitting by artificially balancing our training set.
#88% of ads appear in both sets, so a small amount of overfitting will not be crucial.

train = train.sampleBy('label', fractions={0: .24, 1: 1.0}).cache()
rf = RandomForestClassifier()

stratified_CV_data = CV_data.sampleBy('Churn',
                                      fractions={
                                          0: 388. / 2278,
                                          1: 1.0
                                      }).cache()

#TODO: Add ParamGrid
grid = ParamGridBuilder().addGrid(rf.maxDepth, [3, 5, 8]).build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=rf,  # the grid tunes rf.maxDepth, so the estimator must be the random forest
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=5)
cvModel = cv.fit(train)
score = evaluator.evaluate(cvModel.transform(train))

#Saving the model
bestModel = cvModel.bestModel
os.system('mkdir rf')
os.chdir(os.getcwd() + '/rf')
bestModel.save(os.getcwd() + '/rfModel')
np.save('score', score)
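# Sketch: the saved model can be reloaded later with the matching model class
# (RandomForestClassificationModel, assuming bestModel is the fitted random forest).
from pyspark.ml.classification import RandomForestClassificationModel
reloaded = RandomForestClassificationModel.load(os.getcwd() + '/rfModel')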

# TODO: Make predictions on test_transformed and use APK
Exemplo n.º 45
                           baggingFraction=0.7
                           )

evaluator = BinaryClassificationEvaluator(rawPredictionCol="features",
                                          labelCol="label",
                                          metricName="areaUnderPR")

paramGrid = ParamGridBuilder()\
    .addGrid(model.maxDepth, [9, 13]) \
    .addGrid(model.featureFraction, [0.9, 0.7, 0.5]) \
    .build()

print('Creating cross-validator with {} folds'.format(kFolds))
crossval = CrossValidator(
    estimator=model,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=kFolds,
    seed=1)

print('Training model...')
model_trained = crossval.fit(train_data)
print('Training finished')

try:
    for i, j in zip(model_trained.avgMetrics, paramGrid):
        print("Score {} with parameters {}".format(i, j))
except Exception:
    print("Could not zip(model.avgMetrics, paramGrid)")

# FEATURE IMPORTANCES
for i, j in zip(model_trained.bestModel.getFeatureImportances(), train_cols):
    print("{}: {}".format(j, i))  # assumed loop body: report each training column with its importance
# Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = dfFinal.randomSplit([0.8, 0.2])


# Train the model.
#rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
nb = NaiveBayes(smoothing = 1.0, labelCol="indexedLabel", featuresCol="indexedFeatures")

#pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, nb])
paramGrid = ParamGridBuilder().build()

crossval = CrossValidator(
     estimator=pipeline,
     estimatorParamMaps=paramGrid,
     evaluator=BinaryClassificationEvaluator(labelCol="indexedLabel"),  # match the label column produced by labelIndexer
     numFolds=5)
     
model = crossval.fit(trainingData)

# Compute raw scores on the test set
predictions = model.transform(testData)
predictions.select("prediction", "indexedLabel", "features").show(5)
rddPredictions = predictions.select("prediction", "indexedLabel").rdd
accuracy = rddPredictions.filter(lambda p: (p['prediction'] == p['indexedLabel'])).count() / float(testData.count())
TP = rddPredictions.filter(lambda p: (p['prediction'] == 1 and p['prediction'] == p['indexedLabel'])).count()
TN = rddPredictions.filter(lambda p: (p['prediction'] == 0 and p['prediction'] == p['indexedLabel'])).count()
FP = rddPredictions.filter(lambda p: (p['indexedLabel'] == 0 and p['prediction'] != p['indexedLabel'])).count()
FN = rddPredictions.filter(lambda p: (p['indexedLabel'] == 1 and p['prediction'] != p['indexedLabel'])).count()
print("TP = ", TP)
Exemplo n.º 47
#build labelled Points from data
data_class=zip(data,Y)#if a=[1,2,3] & b=['a','b','c'] then zip(a,b)=[(1,'a'),(2, 'b'), (3, 'c')]
dcRDD=sc.parallelize(data_class,numSlices=16)
#get the labelled points
labeledRDD=dcRDD.map(partial(createBinaryLabeledPoint,dictionary=dict_broad.value))


#****************************************************************
#*********************CROSS VALIDATION: 80%/20%******************
#*******************Model: logistic regression*******************
#*****************************************************************

#create a data frame from an RDD -> features must be Vectors.sparse from pyspark.mllib.linalg
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(labeledRDD, ['features','label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])
dfTrain.show()
#choose estimator and grid
lr = LogisticRegression()	#choose the model
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
#this grid only searches over maxIter; to tune the elastic-net mixing parameter 'alpha'
#(alpha=0 for L2 regularization, alpha=1 for L1), grid lr.elasticNetParam instead
print("Start Cross validation")

evaluator = BinaryClassificationEvaluator()	#choose the evaluator
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) #perform the cross validation and keeps the best value of maxIter
cvModel = cv.fit(dfTrain)	#train the model on the whole training set
resultat = evaluator.evaluate(cvModel.transform(dfTest))	#BinaryClassificationEvaluator reports area under ROC on the test set
print("Area under ROC on the test set (0-1): ", resultat)
vectorizedData = training_data.toDF()
print("Creating MultilayerPerceptronClassifier...")
MLP = MultilayerPerceptronClassifier(labelCol='indexedLabel', featuresCol='indexedFeatures')
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(vectorizedData)
featureIndexer = VectorIndexer(inputCol='features',
                               outputCol='indexedFeatures',
                               maxCategories=2).fit(data.toDF())
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, MLP])

paramGrid_MLP = ParamGridBuilder().addGrid(MLP.layers,[[3072, neuron, 10] for neuron in [200, 500]]).build()
evaluator = MulticlassClassificationEvaluator(labelCol='indexedLabel',
                                      predictionCol='prediction', metricName='f1')
print("Processing crossvalidation with 3-fold & 200/500 hidden layer units")
crossval = CrossValidator(estimator=pipeline,
                  estimatorParamMaps=paramGrid_MLP,
                  evaluator=evaluator,
                  numFolds=3)
starttime = datetime.datetime.now()
CV_model = crossval.fit(vectorizedData)
print(CV_model.bestModel.stages[2])
print('Done on fitting model:%s'%(datetime.datetime.now()-starttime))

print("Transforming testing data...")
vectorized_test_data = testing_data.toDF()

#transformed_data1 = CV_model.transform(vectorizedData)
#print evaluator.getMetricName(), 'accuracy:', evaluator.evaluate(transformed_data1)
transformed_data = CV_model.transform(vectorized_test_data)
#print transformed_data.first()
print("Fitting testing data into model...")
print evaluator.getMetricName(), 'accuracy:', evaluator.evaluate(transformed_data)
                              .addGrid(lr.tol, (1e-4, 1e-5))\
                              .addGrid(lr.elasticNetParam, (0.25,0.75))\
                              .build() 

# DEFINE PIPELINE 
# SIMPLY THE MODEL HERE, WITHOUT TRANSFORMATIONS
pipeline = Pipeline(stages=[lr])

# DEFINE CV WITH PARAMETER SWEEP
# splitting the dataset into a set of folds which are used as separate training and test datasets
# generate 3 (training, test) dataset pairs, each of which uses 2/3 of 
# the data for training and 1/3 for testing
# 8 params x 3 folds
# See: https://spark.apache.org/docs/latest/ml-tuning.html#cross-validation
cv = CrossValidator(estimator= lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=RegressionEvaluator(),
                    numFolds=3)

# CONVERT TO DATA FRAME, AS CROSSVALIDATOR WON'T RUN ON RDDS
#trainDataFrame = sqlContext.createDataFrame(oneHotTRAINreg, ["features", "label"])

# TRAIN WITH CROSS-VALIDATION
#cv_model = cv.fit(trainDataFrame)
cv_model = cv.fit(trainReg.toDF(['label','features']))


# EVALUATE MODEL ON TEST SET
#testDataFrame = sqlContext.createDataFrame(oneHotTESTreg, ["features", "label"])
testDataFrame = testReg.toDF(['label','features'])

# MAKE PREDICTIONS ON TEST DOCUMENTS
# MAGIC We will create a 5-fold cross validator.

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.5, 2.0]).addGrid(
    lr.elasticNetParam, [0.0, 0.5, 1.0]).addGrid(lr.maxIter,
                                                 [1, 5, 10]).build())

# COMMAND ----------

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

# Run cross validations
cvModel = cv.fit(trainingData)
# this will likely take a fair amount of time because of the amount of models that we're creating and testing

# COMMAND ----------

# Use test set to measure the accuracy of our model on new data
predictions = cvModel.transform(testData)

# COMMAND ----------

# cvModel uses the best model found from the Cross Validation
# Evaluate best model
Exemplo n.º 51
assembler = VectorAssembler(inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"], outputCol="features")

# COMMAND ----------

rfmodel = RandomForestClassifier()\
  .setLabelCol("label")\
  .setFeaturesCol("features")
#print (rfmodel.explainParams())

# COMMAND ----------

paramGrid = ParamGridBuilder().addGrid(rfmodel.maxBins, [10,20]).addGrid(rfmodel.maxDepth, [5,10]).build()
pipeline = Pipeline().setStages([assembler,rfmodel])
evaluator = MulticlassClassificationEvaluator()
tvs = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid).setNumFolds(4)

# COMMAND ----------

training, test = new_data.randomSplit([0.75, 0.25], seed = 12345)
model = tvs.fit(training)
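# Sketch: the held-out split from randomSplit above is never scored; evaluate it here.
# MulticlassClassificationEvaluator defaults to the f1 metric.
test_predictions = model.transform(test)
print("test f1:", evaluator.evaluate(test_predictions))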

# COMMAND ----------

from pyspark.sql import Row
newtest = Row(sepal_length=3.50, sepal_width=1.0, petal_length=2.00, petal_width=0.30)
df4 = sc.parallelize([newtest]).toDF()
dff = model.transform(df4)
display(dff)

# COMMAND ----------
Exemplo n.º 52
 def RandomForestClassifier(self):
   print("********************************************************************************************************************************************")
   print("Random Forest")
   self.t0 = time()
   rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees = 100, maxDepth = 4, maxBins = 32,impurity="entropy")
   pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, rf, self.labelConverter])
   model = pipeline.fit(self.trainingData)
   self.tm = time() - self.t0
   print ("Modeli egitme zamani {} saniye ".format(self.tm))

   self.t0 = time()
   self.predictions = model.transform(self.testData)
   self.tt = time() - self.t0
   print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt))

   self.t0 = time()
   predictions_train = model.transform(self.trainingData)
   self.te = time() - self.t0
   print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te))
   
   self.predictions.select("features", "label", "predictedLabel", "probability").show(5)
   evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
   
   self.t0 = time()
   self.accuracy = evaluator.evaluate(self.predictions)
   self.tt2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, self.accuracy))
   
   self.t0 = time()
   self.train_accuracy = evaluator.evaluate(predictions_train)
   self.te2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Egitim Verisinin dogrulanmasi {} saniye ".format(self.te2, self.train_accuracy))
   
   print("Test Dogruluk = %g" % (self.accuracy))
   self.testError = (1.0 - self.accuracy)
   print("Test Test Error = %g" % (1.0 - self.accuracy))

   print("Egitim Dogruluk = %g" % (self.train_accuracy))
   self.train_Error = (1.0 - self.train_accuracy)
   print("Egitim Error = %g" % (1.0 - self.train_accuracy))

   rfModel = model.stages[2]
   evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
   self.f1 = evaluatorf1.evaluate(self.predictions)
   self.train_f1 = evaluatorf1.evaluate(predictions_train)
   print("test f1 = %g" % self.f1)
   print("egitim f1 = %g" % self.train_f1)
 
   evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision")
   self.wp = evaluatorwp.evaluate(self.predictions)
   self.train_wp = evaluatorwp.evaluate(predictions_train)
   print("test weightedPrecision = %g" % self.wp)
   print("egitim weightedPrecision = %g" % self.train_wp)
 
   evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall")
   self.wr = evaluatorwr.evaluate(self.predictions)
   self.train_wr = evaluatorwr.evaluate(predictions_train)
   print("test weightedRecall = %g" % self.wr)
   print("egitim weightedRecall = %g" % self.train_wr)

   rfModel = model.stages[2]
   #print (rfModel._call_java('toDebugString'))
   messagebox.showinfo("Başarılı","Model Eğitildi")
   self.skorEkle()
   self.ModelBtn.grid_remove()
   self.SonucBtn.grid(row=7,column=2)
   self.ExportCsvBtn.grid(row=8,column=2)
   
   svm = LinearSVC(maxIter=5, regParam=0.01)
   LSVC = LinearSVC()
   ovr = OneVsRest(classifier=LSVC)
   paramGrid = ParamGridBuilder().addGrid(LSVC.maxIter, [10, 100]).addGrid(LSVC.regParam,[0.001, 0.01, 1.0,10.0]).build()
   crossval = CrossValidator(estimator=ovr,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=MulticlassClassificationEvaluator(metricName="f1"),
                                  numFolds=2)
   Train_sparkframe = self.trainingData.select("features", "label")
   cvModel = crossval.fit(Train_sparkframe)
   bestModel = cvModel.bestModel
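   # Sketch: score the tuned one-vs-rest LinearSVC on the class's held-out test split
   # with the same F1 metric used during cross-validation.
   Test_sparkframe = self.testData.select("features", "label")
   f1_eval = MulticlassClassificationEvaluator(metricName="f1")
   print("test f1 (OvR LinearSVC) = %g" % f1_eval.evaluate(bestModel.transform(Test_sparkframe)))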