Example #1
def cv_metrics(model: CrossValidatorModel, output_dir: Path = None) -> dict:
    num_folds = model.getNumFolds()
    evaluator_metric = model.getEvaluator().getMetricName()

    param_maps = model.getEstimatorParamMaps()
    cv_avg_metrics = model.avgMetrics

    grid = []
    for p, m in zip(param_maps, cv_avg_metrics):
        grid_item = {
            "params": {str(param): value for param, value in p.items()},
            evaluator_metric: m,
        }
        grid.append(grid_item)

    metrics = {
        "cross_validation_metrics": {
            "num_folds": num_folds,
            "evaluator_metric": evaluator_metric,
            "grid_search": grid
        }
    }

    if output_dir is not None:
        save_metrics(metrics, output_dir / "cv_metrics.json")

    return metrics
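A minimal usage sketch for the helper above (not from the original source): it assumes a local SparkSession and an illustrative LogisticRegression grid, and it omits output_dir so the separate save_metrics helper is not needed.

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

spark = SparkSession.builder.getOrCreate()
train_df = spark.createDataFrame(
    [(Vectors.dense([0.0]), 0.0), (Vectors.dense([1.0]), 1.0)] * 20,
    ["features", "label"],
)

lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=grid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=3,
)
cv_model = cv.fit(train_df)

# Returns the per-grid-point average metrics as a dict; nothing is written to disk.
print(cv_metrics(cv_model))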
Example #2
    def _run_test_save_load_trained_model(self, LogisticRegressionCls,
                                          LogisticRegressionModelCls):
        # This tests saving and loading the trained model only.
        # Save/load for CrossValidator will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [
                (Vectors.dense([0.0]), 0.0),
                (Vectors.dense([0.4]), 1.0),
                (Vectors.dense([0.5]), 0.0),
                (Vectors.dense([0.6]), 1.0),
                (Vectors.dense([1.0]), 1.0),
            ] * 10,
            ["features", "label"],
        )
        lr = LogisticRegressionCls()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        cv = CrossValidator(
            estimator=lr,
            estimatorParamMaps=grid,
            evaluator=evaluator,
            collectSubModels=True,
            numFolds=4,
            seed=42,
        )
        cvModel = cv.fit(dataset)
        lrModel = cvModel.bestModel

        lrModelPath = temp_path + "/lrModel"
        lrModel.save(lrModelPath)
        loadedLrModel = LogisticRegressionModelCls.load(lrModelPath)
        self.assertEqual(loadedLrModel.uid, lrModel.uid)
        self.assertEqual(loadedLrModel.intercept, lrModel.intercept)

        # SPARK-32092: Saving and then loading CrossValidatorModel should not change the params
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedCvModel = CrossValidatorModel.load(cvModelPath)
        for param in [
                lambda x: x.getNumFolds(),
                lambda x: x.getFoldCol(),
                lambda x: x.getSeed(),
                lambda x: len(x.subModels),
        ]:
            self.assertEqual(param(cvModel), param(loadedCvModel))

        self.assertTrue(
            all(loadedCvModel.isSet(param) for param in loadedCvModel.params))

        # mimic old version CrossValidatorModel (without stdMetrics attribute)
        # test loading model backwards compatibility
        cvModel2 = cvModel.copy()
        cvModel2.stdMetrics = []
        cvModelPath2 = temp_path + "/cvModel2"
        cvModel2.save(cvModelPath2)
        loadedCvModel2 = CrossValidatorModel.load(cvModelPath2)
        assert loadedCvModel2.stdMetrics == []
Example #3
    def fit(self, dataset):
        java_estimator, java_epms, java_evaluator = self._to_java_impl()
        self._java_obj.setEstimator(java_estimator)
        self._java_obj.setEvaluator(java_evaluator)
        self._java_obj.setEstimatorParamMaps(java_epms)

        cv_java_model = self._java_obj.fit(dataset._jdf)
        cv_py_model = CrossValidatorModel._from_java(cv_java_model)
        xgbModel = self.getEstimator()._create_model(cv_java_model.bestModel())
        # return CrossValidatorModel
        return CrossValidatorModel(xgbModel, cv_py_model.avgMetrics, cv_py_model.subModels)
Example #4
    def test_expose_sub_models(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [
                (Vectors.dense([0.0]), 0.0),
                (Vectors.dense([0.4]), 1.0),
                (Vectors.dense([0.5]), 0.0),
                (Vectors.dense([0.6]), 1.0),
                (Vectors.dense([1.0]), 1.0),
            ] * 10,
            ["features", "label"],
        )

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        numFolds = 3
        cv = CrossValidator(
            estimator=lr,
            estimatorParamMaps=grid,
            evaluator=evaluator,
            numFolds=numFolds,
            collectSubModels=True,
        )

        def checkSubModels(subModels):
            self.assertEqual(len(subModels), numFolds)
            for i in range(numFolds):
                self.assertEqual(len(subModels[i]), len(grid))

        cvModel = cv.fit(dataset)
        checkSubModels(cvModel.subModels)

        # Test the default value for option "persistSubModel" to be "true"
        testSubPath = temp_path + "/testCrossValidatorSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        cvModel.save(savingPathWithSubModels)
        cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
        checkSubModels(cvModel3.subModels)
        cvModel4 = cvModel3.copy()
        checkSubModels(cvModel4.subModels)

        savingPathWithoutSubModels = testSubPath + "cvModel2"
        cvModel.write().option("persistSubModels",
                               "false").save(savingPathWithoutSubModels)
        cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
        self.assertEqual(cvModel2.subModels, None)

        for i in range(numFolds):
            for j in range(len(grid)):
                self.assertEqual(cvModel.subModels[i][j].uid,
                                 cvModel3.subModels[i][j].uid)
Example #5
def DecisionTree(data):
    path = 'modelo_DecisionTree/modelDecisionTree'
    DecisionTree = CrossValidatorModel.load(path)
    predictions = DecisionTree.transform(data)
    print("DECISION TREE")
    predictions.select('Email', 'Identificador', 'Burnout_Antes', 'prediction',
                       'probability').show(truncate=False)
Example #6
    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        metrics = [0.0] * numModels

        stratified_data = self.stratify_data(dataset)

        for i in range(nFolds):
            train_arr = [x for j, x in enumerate(stratified_data) if j != i]
            train = reduce((lambda x, y: x.unionAll(y)), train_arr)
            validation = stratified_data[i]

            models = est.fit(train, epm)

            for j in range(numModels):
                model = models[j]
                metric = eva.evaluate(model.transform(validation, epm[j]))
                metrics[j] += metric / nFolds

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)

        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(CrossValidatorModel(bestModel, metrics))
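The _fit above calls a stratify_data method that the snippet does not include. A plausible minimal sketch, assuming a binary 0/1 "label" column and that the method returns one DataFrame per fold with roughly the original class balance; this is an assumption, not the original author's implementation.

from pyspark.sql import DataFrame, functions as F

def stratify_data(self, dataset: DataFrame) -> list:
    """Assumed helper: return numFolds DataFrames, each keeping the overall label ratio."""
    n_folds = self.getOrDefault(self.numFolds)
    seed = self.getOrDefault(self.seed)
    weights = [1.0 / n_folds] * n_folds

    # Split each class separately, then rebuild fold i from the i-th piece of every class.
    positives = dataset.filter(F.col("label") == 1.0).randomSplit(weights, seed)
    negatives = dataset.filter(F.col("label") == 0.0).randomSplit(weights, seed)
    return [pos.unionAll(neg) for pos, neg in zip(positives, negatives)]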
Example #7
def regression(df, column, name):
    try:
        model = CrossValidatorModel.load("data/{}.model".format(name))
    except Exception:

        LR = LogisticRegression(labelCol="label",
                                featuresCol="features",
                                maxIter=10)
        if name[3] == 'P':
            LR.setThreshold(0.2)
        else:
            LR.setThreshold(0.25)

        evaluator = BinaryClassificationEvaluator()
        paramGrid = ParamGridBuilder().addGrid(LR.regParam, [1.0]).build()
        crossval = CrossValidator(estimator=LR,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=5)
        # train, test = df.select('features', func.col(column).alias("label")).randomSplit([0.5, 0.5])
        print("Training '{}' classifier... Please wait".format(name))

        model = crossval.fit(df.select("*", func.col(column).alias("label")))
        model.save("data/{}.model".format(name))
    # df_test = model.transform(df)
    # df_test.filter(df_test.prediction == 1).show()
    return model
Example #8
    def test_save_load_simple_estimator(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
        self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
Example #9
def load_model(model_path):
    """
    Load a pretrained model
    """
    model = CrossValidatorModel.load(model_path)
    
    return model
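A short usage sketch for load_model; the paths and columns are placeholders, and it assumes CrossValidatorModel has been imported from pyspark.ml.tuning where load_model is defined.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Hypothetical model directory and scoring data.
model = load_model("models/churn_cv_model")
scored = model.transform(spark.read.parquet("data/new_customers.parquet"))
scored.select("prediction", "probability").show(5, truncate=False)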
Example #10
    def _fit(self, dataset):
        est = self.estimator
        epm = self.estimatorParamMaps
        numModels = len(epm)
        eva = self.evaluator
        metricName = eva.getMetricName()
        nFolds = self.numFolds
        metrics = [0.0] * numModels
        stratified_data = self.stratify_data(dataset)

        for i in range(nFolds):

            print(f"Initiating Training for fold {i + 1}")

            train = stratified_data.filter(stratified_data["bucket_fold"] != i)
            validation = stratified_data.filter(
                stratified_data["bucket_fold"] == i)

            models = est.fit(train, epm)

            for j in range(numModels):
                model = models[j]
                metric = eva.evaluate(model.transform(validation, epm[j]))
                metrics[j] += metric / nFolds

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)

        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(CrossValidatorModel(bestModel, metrics))
Example #11
    def test_save_load_simple_estimator(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
        self.assert_param_maps_equal(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
Example #12
    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        metricName = eva.getMetricName()

        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        h = 1.0 / nFolds

        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = [0.0] * numModels
        # Accumulators for the param names, values, and per-fold metrics logged below.
        a, b, c = [], [], []

        for i in range(nFolds):
            foldNum = i + 1
            print("Comparing models on fold %d" % foldNum)

            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] <
                                                       validateUB)
            validation = df.filter(condition)
            train = df.filter(~condition)

            for j in range(numModels):
                paramMap = epm[j]
                model = est.fit(train, paramMap)
                # TODO: duplicate evaluator to take extra params from input
                metric = eva.evaluate(model.transform(validation, paramMap))
                metrics[j] += metric

                avgSoFar = metrics[j] / foldNum
                print("params: %s\t%s: %f\tavg: %f" %
                      ({param.name: val
                        for (param, val) in paramMap.items()
                        }, metricName, metric, avgSoFar))

                for (param, val) in paramMap.items():
                    a.append(param.name)
                    b.append(val)
                    c.append(metric)

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)

        bestParams = epm[bestIndex]
        bestModel = est.fit(dataset, bestParams)
        avgMetrics = [m / nFolds for m in metrics]
        bestAvg = avgMetrics[bestIndex]
        print("Best model:\nparams: %s\t%s: %f" %
              ({param.name: val
                for (param, val) in bestParams.items()}, metricName, bestAvg))

        return self._copyValues(CrossValidatorModel(bestModel, avgMetrics))
Example #13
def process(spark, input_path, output_file):

    ads_test = spark.read.parquet(input_path)

    # Load the model, run scoring, and write the result
    output = CrossValidatorModel.load('spark_ml_model').transform(ads_test)
    cols = ['ad_id', 'prediction']
    output = output.select(*cols)
    output.write.option('header', 'true').csv(str(output_file))
Example #14
def LinearEvaluation(data):
    path = 'modelo_LogisticRegression/modelLogisticRegression'
    lrModel = CrossValidatorModel.load(path)
    #print(lrModel.coefficientMatrix)
    #predictions=lrModel.transform(data)
    predictions = lrModel.transform(data)  # TRUE (VERDADERO) = 0, FALSE (FALSO) = 1
    print("LINEAR EVALUATION")
    predictions.select('Email', 'Identificador', 'Burnout_Antes', 'prediction',
                       'probability').show(truncate=False)
Example #15
def main(context):
    """Main function takes a Spark SQL context."""
    comments_df = context.read.parquet("comments.parquet")
    submissions_df = context.read.parquet("submissions.parquet")
    labeled_data_df = context.read.parquet("labeled_data.parquet")

    if path.exists("df_label.parquet"):
        labeled_df = context.read.parquet("df_label.parquet")
        comments_df = cleanedCommentsDF(comments_df)

    else:
        labeled_df = createLabeledDF(comments_df, labeled_data_df)
        comments_df = cleanedCommentsDF(comments_df)

    if path.exists("cvModel"):
        cvModel = CountVectorizerModel.load("cvModel")
        posModel = CrossValidatorModel.load("pos.model")
        negModel = CrossValidatorModel.load("neg.model")
    else:
        cvModel, posModel, negModel = train(labeled_df)

    # the "final join" without actually joining!
    output = cvModel.transform(comments_df)
    output = output.drop('score')
    output = output.drop('ngrams_combined')
    output = output.drop('link_id_cleaned')
    posResult = posModel.transform(output)
    posResult = posResult.drop('rawPrediction')
    posResult = posResult.drop('prediction')
    posResult = posResult.withColumnRenamed('probability', 'pos_prob')
    fullResult = negModel.transform(posResult)
    fullResult = fullResult.withColumnRenamed('probability', 'neg_prob')
    fullResult = fullResult.drop('rawPrediction')
    fullResult = fullResult.drop('prediction')
    fullResult = fullResult.withColumn(
        'neg',
        when(get_probability_udf(fullResult.neg_prob) > 0.25, 1).otherwise(0))
    fullResult = fullResult.withColumn(
        'pos',
        when(get_probability_udf(fullResult.pos_prob) > 0.2, 1).otherwise(0))
    fullResult.write.parquet("resulting_df.parquet")
    #fullResult_df = context.read.parquet("resulting_df.parquet")

    print(fullResult.count())
Example #16
def task9(context, entire_file, model):
    posModel = CrossValidatorModel.load("project2/pos.model")
    negModel = CrossValidatorModel.load("project2/neg.model")
    data = task45(context, entire_file)
    rr_data = model.transform(data)
    rr_data.createOrReplaceTempView("Data")
    new_data = context.sql(
        "SELECT * FROM Data WHERE Data.body NOT LIKE '%&gt%' OR Data.body NOT LIKE '%/s%'"
    )
    posResult = posModel.transform(new_data)
    posResult.createOrReplaceTempView("Pos")
    new_data_with_pos = context.sql(
        "SELECT author_flair_text, created_utc, submissionsScore, commentsScore, title, id, body, grams, count_vectors, probability as prob_pos FROM Pos"
    )
    result = negModel.transform(new_data_with_pos)
    result.createOrReplaceTempView("Result")
    formatted_result = context.sql(
        "SELECT author_flair_text, created_utc, title, submissionsScore, commentsScore, id, body, grams, count_vectors, prob_pos, probability as prob_neg FROM Result"
    )

    # Used this link to get the first element in the probability column
    # https://stackoverflow.com/questions/44425159/access-element-of-a-vector-in-a-spark-dataframe-logistic-regression-probability?noredirect=1&lq=1
    firstelement = udf(lambda v: float(v[1]), FloatType())

    pos_udf = udf(getPosProbs, IntegerType())
    neg_udf = udf(getNegProbs, IntegerType())

    res = formatted_result.select(firstelement('prob_neg'),
                                  firstelement('prob_pos'),
                                  'author_flair_text', 'created_utc', 'title',
                                  'id', 'body', 'count_vectors',
                                  'commentsScore', 'submissionsScore')
    nResult = res.withColumn("pos", pos_udf(res["<lambda>(prob_pos)"]))
    new_result = nResult.withColumn("neg",
                                    neg_udf(nResult["<lambda>(prob_neg)"]))

    new_result.createOrReplaceTempView("NewResult")

    actual_result = context.sql(
        "SELECT author_flair_text, created_utc, title, id, body, count_vectors, commentsScore, submissionsScore, pos, neg FROM NewResult"
    )

    #new_data.show(n=10)
    return actual_result
Example #17
    def test_expose_sub_models(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        numFolds = 3
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                            numFolds=numFolds, collectSubModels=True)

        def checkSubModels(subModels):
            self.assertEqual(len(subModels), numFolds)
            for i in range(numFolds):
                self.assertEqual(len(subModels[i]), len(grid))

        cvModel = cv.fit(dataset)
        checkSubModels(cvModel.subModels)

        # Test the default value for option "persistSubModel" to be "true"
        testSubPath = temp_path + "/testCrossValidatorSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        cvModel.save(savingPathWithSubModels)
        cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
        checkSubModels(cvModel3.subModels)
        cvModel4 = cvModel3.copy()
        checkSubModels(cvModel4.subModels)

        savingPathWithoutSubModels = testSubPath + "cvModel2"
        cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
        cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
        self.assertEqual(cvModel2.subModels, None)

        for i in range(numFolds):
            for j in range(len(grid)):
                self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid)
Example #18
def main(sqlContext):

    # Check if the parquet file has already been written
    if not os.path.exists("superbowl_comments.parquet"):
        write_parquet(sqlContext)

    # Read the parquets
    comments = sqlContext.read.parquet("superbowl_comments.parquet")
    comments.registerTempTable("commentsTable")

    # Read the labels csv
    labels = sqlContext.read.format('csv').options(header='true', inferSchema='true').load("labels.csv")
    labels.registerTempTable("labelsTable")

    # Create Dataframe to Train the Model
    modelDataframe = sqlContext.sql("SELECT commentsTable.comments_id AS id, commentsTable.comments_body AS body, commentsTable.comments_author AS author, commentsTable.comments_created_utc AS created_utc, commentsTable.comments_subreddit_id AS subreddit_id, commentsTable.comments_link_id AS link_id, commentsTable.comments_parent_id AS parent_id, commentsTable.comments_score AS score, commentsTable.comments_controversiality AS controversiality, commentsTable.comments_gilded AS gilded FROM commentsTable INNER JOIN labelsTable ON commentsTable.comments_id = labelsTable.label_id")
    modelDataframe = create_dataframe(sqlContext, modelDataframe)

    # Fit the CountVectorizer model
    if(not os.path.exists("models/cvModel")):
        train_cv_model(modelDataframe)

    # Use model to transform the data
    modelDataframe = transform_model(sqlContext, modelDataframe)
    modelDataframe.registerTempTable("modelDataframeTable")
    modelDataframe = sqlContext.sql("SELECT modelDataframeTable.*, IF(labelsTable.label=1, 1, 0) AS pos_label, IF(labelsTable.label=-1, 1, 0) AS neg_label FROM modelDataframeTable INNER JOIN labelsTable ON modelDataframeTable.id = labelsTable.label_id")

    if(not os.path.exists("models/negModel") or not os.path.exists("models/posModel")):
        create_models(sqlContext, modelDataframe)

    # Load the positive and negative models back in
    posModel = CrossValidatorModel.load("models/posModel")
    negModel = CrossValidatorModel.load("models/negModel")

    if(not os.path.exists("fullDataframe.parquet")):
        create_fullDataframe(sqlContext, comments)

    # Load the full dataframe back in
    fullDataframe = sqlContext.read.parquet("fullDataframe.parquet")
    fullDataframe.registerTempTable("fullDataframeTable")

    # Get rid of comments that are sarcastic or removed
    fullDataframe = sqlContext.sql("SELECT * FROM fullDataframeTable WHERE fullDataframeTable.body NOT LIKE '%/s%' AND fullDataframeTable.body NOT LIKE '&gt%' AND fullDataframeTable.body NOT LIKE '%[removed]%'")
Example #19
def RandomForest(data):
    path = 'modelo_RandomForest/modelRandomForest'
    randomModel = CrossValidatorModel.load(path)
    predictions = randomModel.transform(data)
    prediccion = predictions.select('prediction', 'probability').rdd.flatMap(lambda x: x).collect()
    if prediccion[0] == 1.0:
        prediccionLabel = 'FALSO'
    else:
        prediccionLabel = 'VERDADERO'

    return prediccionLabel, prediccion[1][0] * 100
Example #20
    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        h = 1.0 / nFolds
        randCol = self.uid + "_rand"
        df = dataset.select("*", F.rand(seed).alias(randCol))
        metrics = np.zeros((numModels, nFolds))

        pool = ThreadPool(processes=min(self.getParallelism(), numModels))
        subModels = None
        collectSubModelsParam = self.getCollectSubModels()
        if collectSubModelsParam:
            subModels = [[None for j in range(numModels)]
                         for i in range(nFolds)]

        for i in range(nFolds):
            if self.sequentialIndex:
                pass
                # todo pass a column name to base the split on. make sure the split conforms to sklearn norms.
                # idx = [1,2,3,4]
                # training.where(~col("id").isin(idx)).show()
            else:
                validateLB = i * h
                validateUB = (i + 1) * h
                condition = (df[randCol] >= validateLB) & (df[randCol] <
                                                           validateUB)
                validation = df.filter(condition).cache()
                train = df.filter(~condition).cache()

            tasks = self._parallelFitTasks(est, train, eva, validation, epm,
                                           collectSubModelsParam)
            for j, metric, subModel in pool.imap_unordered(
                    lambda f: f(), tasks):
                metrics[j, i] = metric
                if collectSubModelsParam:
                    subModels[i][j] = subModel

            validation.unpersist()
            train.unpersist()

        avgMetrics = np.mean(metrics, axis=1)

        if eva.isLargerBetter():
            bestIndex = np.argmax(avgMetrics)
        else:
            bestIndex = np.argmin(avgMetrics)
        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(
            CrossValidatorModel(bestModel, avgMetrics.tolist(),
                                subModels)), metrics
Example #21
def LinearEvaluation(data):
    path = 'modelo_LogisticRegression/modelLogisticRegression'
    lrModel = CrossValidatorModel.load(path)
    predictions = lrModel.transform(data)  # TRUE (VERDADERO) = 0, FALSE (FALSO) = 1
    prediccion = predictions.select('prediction', 'probability').rdd.flatMap(lambda x: x).collect()
    if prediccion[0] == 1.0:
        prediccionLabel = 'FALSO'
    else:
        prediccionLabel = 'VERDADERO'

    return prediccionLabel, prediccion[1][0] * 100
Example #22
 def predict(self, mysqldetails, method, churn_data_semifinal1):
     X = churn_data_semifinal1
     if method.lower() == 'logistic_regression':
         params_res = CrossValidatorModel.load('./' + mysqldetails +
                                               '/model_logistic')
     elif method.lower() == 'naive_bayes':
         with open('./' + mysqldetails + '/model_naivebayes.pkl',
                   'rb') as f:
             params_res = pickle.load(f)
     #elif method.lower() == 'xgboost':
     #params_res=CrossValidatorModel.load('./model_xgboost')
     elif method.lower() == 'svm':
         params_res = CrossValidatorModel.load('./' + mysqldetails +
                                               '/model_svm')
     elif method.lower() == 'lightgbm':
         params_res = CrossValidatorModel.load('./' + mysqldetails +
                                               '/model_lightgbm')
     predict_test = params_res.transform(X)
     return predict_test.select('mobile', 'prediction')
Example #23
def DecisionTree(data):
    path = 'modelo_DecisionTree/modelDecisionTree'
    DecisionTree = CrossValidatorModel.load(path)
    predictions = DecisionTree.transform(data)
    prediccion = predictions.select('prediction', 'probability').rdd.flatMap(lambda x: x).collect()
    print(prediccion[0])
    if prediccion[0] == 1.0:
        prediccionLabel = 'FALSO'
    else:
        prediccionLabel = 'VERDADERO'

    return prediccionLabel, prediccion[1][0] * 100
Example #24
def task9(task6model):
    querytask9_0 = spark.sql(
        "SELECT id,comment_timestamp,title,state,comment_body FROM task8_table  WHERE comment_body NOT LIKE '&gt%' AND comment_body NOT LIKE '%/s%'"
    )
    querytask9_0.write.saveAsTable("task9_table1")
    querytask9_1 = spark.sql(
        "SELECT id, connect_all_string(sanitize(comment_body)) AS n_grams, comment_timestamp,title,state,comment_body  FROM task9_table1"
    )
    querytask9_2 = querytask9_1.select(
        split(col("n_grams"), ",\s*").alias("n_grams"), col("id"),
        col("comment_timestamp"), col("title"), col("state"),
        col("comment_body"))

    task9df = task6model.transform(querytask9_2)
    task9df.printSchema()
    task9df = task9df.write.saveAsTable("task9_table2")
    querytask9_3 = spark.sql(
        "SELECT id,  n_grams, comment_timestamp,title,state,comment_body, features, features AS features_backup  FROM task9_table2"
    )

    model_pos = CrossValidatorModel.load("www/pos.model")
    model_neg = CrossValidatorModel.load("www/neg.model")
    pos_ans = model_pos.transform(querytask9_3).write.saveAsTable("pos_table")

    task9df_withPos = spark.sql(
        "SELECT id,comment_timestamp,title,state,comment_body,prediction AS pos, features_backup AS features, probability AS pos_probability  FROM pos_table"
    )
    task9df_withPos.show()
    neg_ans = model_neg.transform(task9df_withPos).write.saveAsTable(
        "neg_table")

    task9result = spark.sql(
        "SELECT id,comment_timestamp,title,state,comment_body, pos , prediction AS neg FROM neg_table"
    )

    task9result.write.parquet("task9result_parquet")  #store parquet

    final_task9result = spark.read.parquet("task9result_parquet")
    final_task9result.write.saveAsTable("task9_table")
    spark.sql("SELECT * FROM task9_table").show()
Example #25
    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        metrics = [0.0] * numModels

        # Use rolling window instead of random folds
        rowNumCol = self.uid + "_rownum"
        w = Window().orderBy(lit('A'))  # Dummy window to create row number
        df = dataset.select("*", row_number().over(w).alias(rowNumCol))
        h = df.count() / (nFolds + 1)

        pool = ThreadPool(processes=self.getParallelism())
        subModels = None
        collectSubModelsParam = self.getCollectSubModels()
        if collectSubModelsParam:
            subModels = [[None for j in range(numModels)]
                         for i in range(nFolds)]

        for i in range(nFolds):
            # Get rolling (increasing) window
            validateLB = (i + 1) * h
            validateUB = (i + 2) * h
            validation = df.filter((df[rowNumCol] >= validateLB)
                                   & (df[rowNumCol] < validateUB)).cache()
            train = df.filter(df[rowNumCol] < validateLB).cache()

            tasks = _parallelFitTasks(est, train, eva, validation, epm,
                                      collectSubModelsParam)
            for j, metric, subModel in pool.imap_unordered(
                    lambda f: f(), tasks):
                metrics[j] += (metric / nFolds)
                if collectSubModelsParam:
                    subModels[i][j] = subModel

            validation.unpersist()
            train.unpersist()

        bestIndex = np.argmax(metrics)

        # if eva.isLargerBetter():
        #     bestIndex = np.argmax(metrics)
        # else:
        #     bestIndex = np.argmin(metrics)

        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(
            CrossValidatorModel(bestModel, metrics, subModels))
Example #26
    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)

        folds = dataset.select('fold').distinct().collect()
        nFolds = len(folds)

        metrics = [0.0] * numModels

        pool = ThreadPool(processes=min(self.getParallelism(), numModels))
        subModels = None

        allMetrics = [[None for i in range(nFolds)] for j in range(numModels)]

        collectSubModelsParam = self.getCollectSubModels()
        if collectSubModelsParam:
            subModels = [[None for j in range(numModels)]
                         for i in range(nFolds)]

        for i in range(nFolds):

            validation = dataset.filter((dataset['fold'] == i)
                                        & (dataset['test'] == 1)).cache()
            train = dataset.filter((dataset['fold'] == i)
                                   & (dataset['test'] == 0)).cache()

            tasks = _parallelFitTasks(est, train, eva, validation, epm,
                                      collectSubModelsParam)
            for j, metric, subModel in pool.imap(lambda f: f(), tasks):

                allMetrics[j][i] = metric

                metrics[j] += (metric / nFolds)
                if collectSubModelsParam:
                    subModels[i][j] = subModel

            validation.unpersist()
            train.unpersist()

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)
        bestModel = est.fit(dataset, epm[bestIndex])

        return (self._copyValues(
            CrossValidatorModel(bestModel, metrics, subModels)), allMetrics)
Example #27
def loadModel(conf, path):
    """
        input : conf [dictionary], path [string]
        output: model [CrossValidatorModel, TrainValidationSplitModel, GeneralizedLinearRegressionModel]
    """
    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            loading_model = CrossValidatorModel.load(path)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            loading_model = TrainValidationSplitModel.load(path)

    elif conf["tuning"] == None:
        loading_model = GeneralizedLinearRegressionModel.load(path)

    return loading_model
Example #28
def loadaftRegression(conf, path):
    '''Loading model from path.
       Input  : - Path
       Output : - Loaded model
    '''
    # Load the model if cross-validation tuning was used
    if conf["tuning"].get("method") == "crossval":
        loaded_model = CrossValidatorModel.load(path)
    # Load the model if train-validation-split tuning was used
    elif conf["tuning"].get("method") == "trainval":
        loaded_model = TrainValidationSplitModel.load(path)
    # Load the model if no tuning was used
    elif conf["tuning"].get("method") is None:
        loaded_model = AFTSurvivalRegressionModel.load(path)
    return loaded_model
Example #29
    def _fit(self, dataset):
        current_estimator = self.getEstimator()

        # Not a Pipeline, use standard CrossValidator
        if not isinstance(current_estimator, Pipeline):
            return super(DagCrossValidator, self)._fit(dataset)
        # Delegate parallelism to DagPipeline
        elif not isinstance(current_estimator, DagPipeline):
            dag_pipeline = DagPipeline(stages=current_estimator.getStages(),
                                       parallelism=self.getParallelism())
        # Already a DagPipeline
        else:
            dag_pipeline = current_estimator

        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        h = 1.0 / nFolds
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = [0.0] * numModels

        for i in range(nFolds):
            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] <
                                                       validateUB)
            validation = df.filter(condition).cache()
            train = df.filter(~condition).cache()

            fold_metrics = dag_pipeline.evaluate(epm, train, validation, eva)

            for j in range(len(metrics)):
                metrics[j] += fold_metrics[j] / nFolds

            validation.unpersist()
            train.unpersist()

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)

        bestModel = current_estimator.fit(dataset, epm[bestIndex])

        return self._copyValues(CrossValidatorModel(bestModel, metrics))
Example #30
def make_and_save_predictions(input_df, config):
    print("*" * 10)
    dt = input_df.select("refresh_date").first()[0]
    print("*" * 10)

    model = CrossValidatorModel.load(config["model_path"])
    final_output_df = model.transform(input_df).withColumnRenamed(
        "id", "JobID").withColumnRenamed("prediction", "JobDurationML")

    final_output_df.printSchema()
    prediction_path = config["prediction_path"] + "/year=" + str(
        dt.year) + "/month=" + str(dt.month) + "/day=" + str(dt.day)
    final_output_df.select(
        config["categorical_cols"] + config["numerical_cols"] +
        ["JobID", "JobDurationML"]).coalesce(1).write.mode('overwrite').option(
            "header", 'true').csv(prediction_path)
Example #31
    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        h = 1.0 / nFolds
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = [0.0] * numModels

        dfp = df.toPandas()
        dfp = np.array_split(dfp, nFolds)

        train = self.spark.createDataFrame(data=dfp[0].round(3))
        for i in range(1, len(dfp) - 1):
            p = self.spark.createDataFrame(data=dfp[i].round(3))
            train = train.union(p)
        validation = self.spark.createDataFrame(data=dfp[-1].round(3))
        validation = validation.sort(validation.id.asc())
        train = train.sort(train.id.asc())

        for i in range(nFolds):
            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] <
                                                       validateUB)
            validation = df.filter(condition)
            validation = validation.sort(validation.id.asc())
            validation.show()
            train = df.filter(~condition)
            train = train.sort(train.id.asc())
            train.show()
            models = est.fit(train, epm)
            for j in range(numModels):
                model = models[j]
                # TODO: duplicate evaluator to take extra params from input
                metric = eva.evaluate(model.transform(validation, epm[j]))
                metrics[j] += metric / nFolds

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)
        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(CrossValidatorModel(bestModel, metrics))
Example #32
    def _run_test_save_load_nested_estimator(self, LogisticRegressionCls):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [
                (Vectors.dense([0.0]), 0.0),
                (Vectors.dense([0.4]), 1.0),
                (Vectors.dense([0.5]), 0.0),
                (Vectors.dense([0.6]), 1.0),
                (Vectors.dense([1.0]), 1.0),
            ] * 10,
            ["features", "label"],
        )

        ova = OneVsRest(classifier=LogisticRegressionCls())
        lr1 = LogisticRegressionCls().setMaxIter(100)
        lr2 = LogisticRegressionCls().setMaxIter(150)
        grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
        evaluator = MulticlassClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=ova,
                            estimatorParamMaps=grid,
                            evaluator=evaluator)
        cvModel = cv.fit(dataset)
        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assert_param_maps_equal(loadedCV.getEstimatorParamMaps(), grid)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)

        originalParamMap = cv.getEstimatorParamMaps()
        loadedParamMap = loadedCV.getEstimatorParamMaps()
        for i, param in enumerate(loadedParamMap):
            for p in param:
                if p.name == "classifier":
                    self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
                else:
                    self.assertEqual(param[p], originalParamMap[i][p])

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
Example #33
    def test_save_load_nested_estimator(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        ova = OneVsRest(classifier=LogisticRegression())
        lr1 = LogisticRegression().setMaxIter(100)
        lr2 = LogisticRegression().setMaxIter(150)
        grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
        evaluator = MulticlassClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)

        originalParamMap = cv.getEstimatorParamMaps()
        loadedParamMap = loadedCV.getEstimatorParamMaps()
        for i, param in enumerate(loadedParamMap):
            for p in param:
                if p.name == "classifier":
                    self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
                else:
                    self.assertEqual(param[p], originalParamMap[i][p])

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)