def random_forest(df, columns, input_col, **kwargs):
    """
    Runs a random forest classifier for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict.
    :return: DataFrame with random forest and prediction run.
    """
    # Validate arguments up front, consistent with gbt()
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")
    if not is_str(input_col):
        raise TypeError("Error, input column must be a string")

    columns = parse_columns(df, columns)

    # Feature columns are every selected column except the target
    data = df.select(columns)
    feats = data.columns
    feats.remove(input_col)

    df = string_to_index(df, input_cols=input_col)
    df = vector_assembler(df, input_cols=feats, output_col="features")

    model = RandomForestClassifier(**kwargs)

    # Rename the indexed label column to the "label" name Spark ML expects.
    # NOTE: removed a stray debug call (df.table()) that printed the dataframe
    # on every invocation.
    df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")

    rf_model = model.fit(df)
    df_model = rf_model.transform(df)
    return df_model, rf_model
def gbt(df, columns, input_col, **kwargs):
    """
    Runs a gradient boosting tree classifier for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict.
    :return: DataFrame with gradient boosting tree and prediction run.
    """
    # Guard clauses: only Spark dataframes and string target columns are valid
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")
    columns = parse_columns(df, columns)
    if not is_str(input_col):
        raise TypeError("Error, input column must be a string")

    # Every selected column except the target becomes a feature
    feature_cols = df.select(columns).columns
    feature_cols.remove(input_col)

    prepared = string_to_index(df, input_cols=input_col)
    prepared = vector_assembler(prepared, input_cols=feature_cols, output_col="features")

    classifier = GBTClassifier(**kwargs)

    # Spark ML expects the target under the "label" column name
    prepared = prepared.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")

    fitted = classifier.fit(prepared)
    predictions = fitted.transform(prepared)
    return predictions, fitted
def test_string_to_index_kargs():
    """string_to_index must forward kwargs (stringOrderType) to StringIndexer."""
    df = op.spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])

    df_indexed = fe.string_to_index(df, "category", stringOrderType="frequencyAsc")
    assert_spark_df(df_indexed)

    # frequencyAsc: b (1 occurrence) -> 0.0, c (2) -> 1.0, a (3) -> 2.0
    expected_collect = op.sc.parallelize([
        Row(id=0, category='a', category_index=2.0),
        Row(id=1, category='b', category_index=0.0),
        Row(id=2, category='c', category_index=1.0),
        Row(id=3, category='a', category_index=2.0),
        Row(id=4, category='a', category_index=2.0),
        Row(id=5, category='c', category_index=1.0)]).toDF()

    # string_to_index() names its output column with the STRING_TO_INDEX
    # suffix (see name_col(..., STRING_TO_INDEX) in the model helpers);
    # the previous "category***INDEX_TO_STRING" selected the wrong column.
    assert_equal(
        df_indexed.select("category", "category***STRING_TO_INDEX", "id").collect(),
        expected_collect.collect())
def h2o_xgboost(df, label, columns, **kwargs):
    """
    Fit an H2O XGBoost model and score the input dataframe.
    :param df: Pyspark dataframe to analyze.
    :param label: Column to predict.
    :param columns: Feature columns.
    :return: Tuple of (dataframe with "prediction" column, fitted model).
    """
    H2OContext.getOrCreate(Spark.instance.spark)

    indexed = string_to_index(df, input_cols=label)
    assembled = vector_assembler(indexed, input_cols=columns)

    estimator = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                           featuresCols=columns,
                           labelCol=label,
                           **kwargs)
    model = estimator.fit(assembled)
    scored = model.transform(assembled)

    # Binarize the positive-class probability at the 0.5 threshold
    binarized = F.when(scored.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0)
    df_pred = scored.withColumn("prediction", binarized)
    return df_pred, model
def h2o_gbm(df, label, columns, **kwargs):
    """
    Fit an H2O gradient boosting machine and score the input dataframe.
    :param df: Pyspark dataframe to analyze.
    :param label: Column to predict.
    :param columns: Feature columns.
    :return: Tuple of (dataframe with "prediction" column, fitted model).
    """
    H2OContext.getOrCreate(Spark.instance.spark)

    indexed = string_to_index(df, input_cols=label)
    assembled = vector_assembler(indexed, input_cols=columns)

    estimator = H2OGBM(ratio=0.8,
                       seed=1,
                       featuresCols=columns,
                       labelCol=label,
                       **kwargs)
    model = estimator.fit(assembled)
    scored = model.transform(assembled)

    # Binarize the positive-class probability at the 0.5 threshold
    binarized = F.when(scored.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0)
    df_pred = scored.withColumn("prediction", binarized)
    return df_pred, model
def h2o_deeplearning(df, label, columns, **kwargs):
    """
    Fit an H2O deep learning model and score the input dataframe.
    :param df: Pyspark dataframe to analyze.
    :param label: Column to predict.
    :param columns: Feature columns.
    :return: Tuple of (dataframe with "prediction" column, fitted model).
    """
    H2OContext.getOrCreate(Spark.instance.spark)

    indexed = string_to_index(df, input_cols=label)
    assembled = vector_assembler(indexed, input_cols=columns)

    estimator = H2ODeepLearning(epochs=10,
                                seed=1,
                                l1=0.001,
                                l2=0.0,
                                hidden=[200, 200],
                                featuresCols=columns,
                                labelCol=label,
                                **kwargs)
    model = estimator.fit(assembled)
    scored = model.transform(assembled)

    # Binarize the positive-class probability at the 0.5 threshold
    binarized = F.when(scored.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0)
    df_pred = scored.withColumn("prediction", binarized)
    return df_pred, model
def h2o_automl(df, label, columns, **kwargs):
    """
    Run H2O AutoML over the input dataframe and score it with the best model.
    :param df: Pyspark dataframe to analyze.
    :param label: Column to predict.
    :param columns: Feature columns.
    :return: Tuple of (dataframe with "prediction" column, fitted model).
    """
    H2OContext.getOrCreate(Spark.instance.spark)

    indexed = string_to_index(df, input_cols=label)
    assembled = vector_assembler(indexed, input_cols=columns)

    # AutoML trains on the indexed label column produced by string_to_index
    estimator = H2OAutoML(
        convertUnknownCategoricalLevelsToNa=True,
        maxRuntimeSecs=60,  # 1 minutes
        seed=1,
        maxModels=3,
        labelCol=name_col(label, STRING_TO_INDEX),
        **kwargs)
    model = estimator.fit(assembled)
    scored = model.transform(assembled)

    # Binarize the predicted value at the 0.5 threshold
    binarized = F.when(scored.prediction_output["value"] > 0.5, 1.0).otherwise(0.0)
    df_pred = scored.withColumn("prediction", binarized)
    return df_pred, model