Example #1
    def random_forest(df, columns, input_col, **kwargs):
        """
        Runs a random forest classifier for input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: DataFrame with random forest and prediction run.
        """

        columns = parse_columns(df, columns)

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats, output_col="features")

        model = RandomForestClassifier(**kwargs)
        df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")

        rf_model = model.fit(df)
        df_model = rf_model.transform(df)
        return df_model, rf_model
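A minimal usage sketch (the DataFrame and column names below are hypothetical; extra keyword arguments are forwarded to pyspark.ml.classification.RandomForestClassifier):

    # Hypothetical call: "age" and "fare" are numeric features, "survived" the label.
    df_predicted, rf_model = random_forest(df, ["age", "fare", "survived"],
                                           input_col="survived",
                                           numTrees=20, maxDepth=5)
    df_predicted.select("features", "label", "prediction").show()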
Example #2
    def gbt(df, columns, input_col, **kwargs):
        """
        Runs a gradient boosting tree classifier for input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: DataFrame with gradient boosting tree and prediction run.
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        if not is_str(input_col):
            raise TypeError("Input column must be a string")

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats, output_col="features")

        model = GBTClassifier(**kwargs)

        df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")

        gbt_model = model.fit(df)
        df_model = gbt_model.transform(df)
        return df_model, gbt_model
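For reference, the two helper calls above correspond to the following plain pyspark.ml steps (a sketch with hypothetical column names; the Optimus helpers derive the indexed output column names for you):

    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Index the string label, then pack the numeric features into one vector.
    indexer = StringIndexer(inputCol="survived", outputCol="survived_index")
    assembler = VectorAssembler(inputCols=["age", "fare"], outputCol="features")

    df_indexed = indexer.fit(df).transform(df)
    df_ready = assembler.transform(df_indexed)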
Example #3
def test_string_to_index_kargs():
    df = op.spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
                                 ["id", "category"])

    df_indexed = fe.string_to_index(df, "category", stringOrderType="frequencyAsc")

    assert_spark_df(df_indexed)

    expected_collect = op.sc.parallelize([Row(id=0, category='a', category_index=2.0),
                                          Row(id=1, category='b', category_index=0.0),
                                          Row(id=2, category='c', category_index=1.0),
                                          Row(id=3, category='a', category_index=2.0),
                                          Row(id=4, category='a', category_index=2.0),
                                          Row(id=5, category='c', category_index=1.0)]).toDF()

    assert_equal(df_indexed.select("category", "category_index", "id").collect(), expected_collect.collect())
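With stringOrderType="frequencyAsc" the least frequent label gets index 0.0, which is why "b" (one occurrence) maps to 0.0, "c" (two) to 1.0, and "a" (three) to 2.0. A minimal sketch of the same ordering with plain pyspark.ml:

    from pyspark.ml.feature import StringIndexer

    indexer = StringIndexer(inputCol="category", outputCol="category_index",
                            stringOrderType="frequencyAsc")
    indexer.fit(df).transform(df).show()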
Example #4
    def h2o_xgboost(df, label, columns, **kwargs):
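        """
        Fits an H2O XGBoost classifier on the input DataFrame (binary labels).
        :param df: PySpark DataFrame to analyze.
        :param label: Column to predict; indexed with string_to_index.
        :param columns: List of feature columns.
        :return: Tuple of (DataFrame with 0/1 predictions, fitted model).
        """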

        H2OContext.getOrCreate(Spark.instance.spark)

        df_sti = string_to_index(df, input_cols=label)
        df_va = vector_assembler(df_sti, input_cols=columns)
        h2o_xgboost = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                                 featuresCols=columns,
                                 labelCol=label,
                                 **kwargs)
        model = h2o_xgboost.fit(df_va)
        df_raw = model.transform(df_va)

        df_pred = df_raw.withColumn(
            "prediction",
            F.when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

        return df_pred, model
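A minimal usage sketch (hypothetical column names; keyword arguments are forwarded to Sparkling Water's H2OXGBoost). The transform emits a prediction_output struct whose p1 field is the positive-class probability, thresholded here at 0.5:

    df_pred, xgb_model = h2o_xgboost(df, label="survived", columns=["age", "fare"])
    df_pred.select("survived", "prediction").show()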
Example #5
    def h2o_gbm(df, label, columns, **kwargs):
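        """
        Fits an H2O gradient boosting machine on the input DataFrame.
        :param df: PySpark DataFrame to analyze.
        :param label: Column to predict; indexed with string_to_index.
        :param columns: List of feature columns.
        :return: Tuple of (DataFrame with 0/1 predictions, fitted model).
        """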

        H2OContext.getOrCreate(Spark.instance.spark)

        df_sti = string_to_index(df, input_cols=label)
        df_va = vector_assembler(df_sti, input_cols=columns)
        h2o_gbm = H2OGBM(ratio=0.8,
                         seed=1,
                         featuresCols=columns,
                         labelCol=label,
                         **kwargs)
        model = h2o_gbm.fit(df_va)
        df_raw = model.transform(df_va)

        df_pred = df_raw.withColumn(
            "prediction",
            F.when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

        return df_pred, model
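Note the fixed defaults: ratio=0.8 asks Sparkling Water to train on 80% of the rows and hold out the rest for validation, and seed=1 makes that split reproducible. Because both are already bound as keywords here, passing ratio or seed again through **kwargs raises a duplicate-keyword TypeError.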
Example #6
    def h2o_deeplearning(df, label, columns, **kwargs):
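        """
        Fits an H2O deep learning model on the input DataFrame.
        :param df: PySpark DataFrame to analyze.
        :param label: Column to predict; indexed with string_to_index.
        :param columns: List of feature columns.
        :return: Tuple of (DataFrame with 0/1 predictions, fitted model).
        """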

        H2OContext.getOrCreate(Spark.instance.spark)

        df_sti = string_to_index(df, input_cols=label)
        df_va = vector_assembler(df_sti, input_cols=columns)
        h2o_deeplearning = H2ODeepLearning(epochs=10,
                                           seed=1,
                                           l1=0.001,
                                           l2=0.0,
                                           hidden=[200, 200],
                                           featuresCols=columns,
                                           labelCol=label,
                                           **kwargs)
        model = h2o_deeplearning.fit(df_va)
        df_raw = model.transform(df_va)

        df_pred = df_raw.withColumn(
            "prediction",
            F.when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

        return df_pred, model
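Here hidden=[200, 200] requests two fully connected hidden layers of 200 units each (H2O's default topology), and the small l1 penalty encourages sparse weights; as with the GBM wrapper, these pre-bound keywords cannot be overridden through **kwargs.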
Example #7
    def h2o_automl(df, label, columns, **kwargs):
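        """
        Runs H2O AutoML (up to maxModels candidates within maxRuntimeSecs) on
        the input DataFrame.
        :param df: PySpark DataFrame to analyze.
        :param label: Column to predict; indexed with string_to_index.
        :param columns: List of feature columns.
        :return: Tuple of (DataFrame with 0/1 predictions, fitted AutoML model).
        """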

        H2OContext.getOrCreate(Spark.instance.spark)

        df_sti = string_to_index(df, input_cols=label)
        df_va = vector_assembler(df_sti, input_cols=columns)
        automl = H2OAutoML(
            convertUnknownCategoricalLevelsToNa=True,
            maxRuntimeSecs=60,  # 1 minute
            seed=1,
            maxModels=3,
            labelCol=name_col(label, STRING_TO_INDEX),
            **kwargs)

        model = automl.fit(df_va)
        df_raw = model.transform(df_va)

        df_pred = df_raw.withColumn(
            "prediction",
            F.when(df_raw.prediction_output["value"] > 0.5,
                   1.0).otherwise(0.0))

        return df_pred, model
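A minimal usage sketch (hypothetical column names). Two details differ from the other wrappers: labelCol is the indexed column produced by string_to_index rather than the raw label, and the 0.5 threshold is applied to prediction_output["value"] instead of p1:

    df_pred, automl_model = h2o_automl(df, label="survived", columns=["age", "fare"])
    df_pred.select("survived", "prediction").show()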