示例#1
0
    def gbt(df, columns, input_col, **kwargs):
        """
        Runs a gradient boosting tree classifier for input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: DataFrame with gradient boosting tree and prediction run.
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        if not is_str(input_col):
            raise TypeError("Error, input column must be a string")

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats, output_col="features")

        model = GBTClassifier(**kwargs)

        df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")

        gbt_model = model.fit(df)
        df_model = gbt_model.transform(df)
        return df_model, gbt_model
示例#2
0
    def random_forest(df, columns, input_col, **kwargs):
        """
        Runs a random forest classifier for input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: DataFrame with random forest and prediction run.
        """

        columns = parse_columns(df, columns)

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats, output_col="features")

        model = RandomForestClassifier(**kwargs)
        df.table()
        df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")

        rf_model = model.fit(df)
        df_model = rf_model.transform(df)
        return df_model, rf_model
示例#3
0
    def h2o_xgboost(df, label, columns, **kwargs):

        H2OContext.getOrCreate(Spark.instance.spark)

        df_sti = string_to_index(df, input_cols=label)
        df_va = vector_assembler(df_sti, input_cols=columns)
        h2o_xgboost = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                                 featuresCols=columns,
                                 labelCol=label,
                                 **kwargs)
        model = h2o_xgboost.fit(df_va)
        df_raw = model.transform(df_va)

        df_pred = df_raw.withColumn(
            "prediction",
            F.when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

        return df_pred, model
示例#4
0
    def h2o_gbm(df, label, columns, **kwargs):

        H2OContext.getOrCreate(Spark.instance.spark)

        df_sti = string_to_index(df, input_cols=label)
        df_va = vector_assembler(df_sti, input_cols=columns)
        h2o_gbm = H2OGBM(ratio=0.8,
                         seed=1,
                         featuresCols=columns,
                         labelCol=label,
                         **kwargs)
        model = h2o_gbm.fit(df_va)
        df_raw = model.transform(df_va)

        df_pred = df_raw.withColumn(
            "prediction",
            F.when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

        return df_pred, model
示例#5
0
    def h2o_deeplearning(df, label, columns, **kwargs):

        H2OContext.getOrCreate(Spark.instance.spark)

        df_sti = string_to_index(df, input_cols=label)
        df_va = vector_assembler(df_sti, input_cols=columns)
        h2o_deeplearning = H2ODeepLearning(epochs=10,
                                           seed=1,
                                           l1=0.001,
                                           l2=0.0,
                                           hidden=[200, 200],
                                           featuresCols=columns,
                                           labelCol=label,
                                           **kwargs)
        model = h2o_deeplearning.fit(df_va)
        df_raw = model.transform(df_va)

        df_pred = df_raw.withColumn(
            "prediction",
            F.when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

        return df_pred, model
示例#6
0
    def h2o_automl(df, label, columns, **kwargs):

        H2OContext.getOrCreate(Spark.instance.spark)

        df_sti = string_to_index(df, input_cols=label)
        df_va = vector_assembler(df_sti, input_cols=columns)
        automl = H2OAutoML(
            convertUnknownCategoricalLevelsToNa=True,
            maxRuntimeSecs=60,  # 1 minutes
            seed=1,
            maxModels=3,
            labelCol=name_col(label, STRING_TO_INDEX),
            **kwargs)

        model = automl.fit(df_va)
        df_raw = model.transform(df_va)

        df_pred = df_raw.withColumn(
            "prediction",
            F.when(df_raw.prediction_output["value"] > 0.5,
                   1.0).otherwise(0.0))

        return df_pred, model
示例#7
0
	def test_vector_assembler():
		actual_df =fe.vector_assembler(source_df,input_cols=['id', 'x', 'y'])
		expected_df = op.create.df([('id', LongType(), True),('x', LongType(), True),('y', LongType(), True),('features', VectorUDT(), True),('id_x_y******VECTOR_ASSEMBLER', VectorUDT(), True)], [(0, 1, 2, DenseVector([1.0, 0.5, -1.0]), DenseVector([0.0, 1.0, 2.0])), (1, 2, 3, DenseVector([2.0, 1.0, 1.0]), DenseVector([1.0, 2.0, 3.0])), (2, 3, 4, DenseVector([4.0, 10.0, 2.0]), DenseVector([2.0, 3.0, 4.0]))])
		assert (expected_df.collect() == actual_df.collect())