def random_forest(df, columns, input_col, **kwargs):
    """
    Runs a random forest classifier for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict.
    :return: DataFrame with random forest and prediction run.
    """
    columns = parse_columns(df, columns)

    data = df.select(columns)
    feats = data.columns
    feats.remove(input_col)

    df = string_to_index(df, input_cols=input_col)
    df = vector_assembler(df, input_cols=feats, output_col="features")

    model = RandomForestClassifier(**kwargs)

    df = df.cols.rename(name_col(input_col, "index_to_string"), "label")

    rf_model = model.fit(df)
    df_model = rf_model.transform(df)

    return df_model, rf_model
def gbt(df, columns, input_col, **kwargs):
    """
    Runs a gradient boosting tree classifier for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict.
    :return: DataFrame with gradient boosting tree and prediction run.
    """
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")

    columns = parse_columns(df, columns)

    if not is_str(input_col):
        raise TypeError("Error, input column must be a string")

    data = df.select(columns)
    feats = data.columns
    feats.remove(input_col)

    df = string_to_index(df, input_cols=input_col)
    df = vector_assembler(df, input_cols=feats, output_col="features")

    model = GBTClassifier(**kwargs)

    df = df.cols.rename(name_col(input_col, "index_to_string"), "label")

    gbt_model = model.fit(df)
    df_model = gbt_model.transform(df)

    return df_model, gbt_model
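# Hedged usage sketch (editorial addition, not part of the original module): gbt()
# returns both the scored DataFrame and the fitted model, so its output can be fed
# straight into a standard Spark evaluator. The DataFrame `df_people`, its column
# names, and the hyperparameter values are assumptions for illustration; maxIter and
# maxDepth are regular pyspark.ml.classification.GBTClassifier params forwarded
# through **kwargs.
def _gbt_usage_example(df_people):
    from pyspark.ml.evaluation import BinaryClassificationEvaluator

    df_pred, gbt_fitted = gbt(df_people,
                              columns=["age", "fare", "survived"],
                              input_col="survived",
                              maxIter=20,
                              maxDepth=4)

    # After the pipeline above the indexed target lives in "label" and the model
    # adds a "rawPrediction" column, which the evaluator consumes by default.
    auc = BinaryClassificationEvaluator(labelCol="label").evaluate(df_pred)
    return auc, gbt_fitted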
def decision_tree(df, columns, input_col, **kargs):
    """
    Runs a decision tree classifier for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict.
    :return: DataFrame with decision tree and prediction run.
    """
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")

    columns = parse_columns(df, columns)

    if not is_str(input_col):
        raise TypeError("Error, input column must be a string")

    data = df.select(columns)
    feats = data.columns
    feats.remove(input_col)

    df = string_to_index(df, input_cols=input_col)
    df = vector_assembler(df, input_cols=feats)

    model = DecisionTreeClassifier(**kargs)

    df = df.cols.rename(name_col(input_col, "index"), "label")

    dt_model = model.fit(df)
    df_model = dt_model.transform(df)

    return df_model, dt_model
def random_forest(df, columns, input_col, **kargs):
    """
    Runs a random forest classifier for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict.
    :return: DataFrame with random forest and prediction run.
    """
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")

    columns = parse_columns(df, columns)

    assert isinstance(input_col, str), "Error, input column must be a string"

    data = df.select(columns)
    feats = data.columns
    feats.remove(input_col)

    df = string_to_index(df, input_cols=input_col)
    df = vector_assembler(df, input_cols=feats)

    model = RandomForestClassifier(**kargs)

    df = df.cols.rename([(input_col + "_index", "label")])

    rf_model = model.fit(df)
    df_model = rf_model.transform(df)

    return df_model, rf_model
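# Hedged usage sketch (editorial addition, not part of the original module): shows
# the (df, columns, input_col) calling convention end to end on a tiny in-memory
# DataFrame. The `op` argument is assumed to be an Optimus instance (so op.spark is
# a SparkSession and the .cols accessor is available); column names and data are
# illustrative only.
def _random_forest_usage_example(op):
    df_demo = op.spark.createDataFrame(
        [(1.0, 10.0, "yes"), (2.0, 20.0, "no"), (3.0, 30.0, "yes"), (4.0, 40.0, "no")],
        ["feat_a", "feat_b", "target"])

    # Extra keyword arguments (numTrees here) are forwarded to
    # pyspark.ml.classification.RandomForestClassifier via **kargs.
    df_pred, rf_model = random_forest(df_demo,
                                      columns=["feat_a", "feat_b", "target"],
                                      input_col="target",
                                      numTrees=10)
    return df_pred.select("label", "prediction"), rf_model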
def h2o_xgboost(df, label, columns, **kargs):
    """
    Runs an H2O XGBoost classifier for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param label: Column to predict.
    :param columns: List of feature columns.
    :return: DataFrame with prediction run and the fitted model.
    """
    H2OContext.getOrCreate(Spark.instance.spark)

    df_sti = string_to_index(df, input_cols=label)
    df_va = vector_assembler(df_sti, input_cols=columns)

    h2o_xgboost = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                             featuresCols=columns,
                             labelCol=label,
                             **kargs)
    model = h2o_xgboost.fit(df_va)
    df_raw = model.transform(df_va)

    df_pred = df_raw.withColumn("prediction",
                                when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

    return df_pred, model
def h2o_gbm(df, label, columns, **kargs):
    """
    Runs an H2O gradient boosting machine (GBM) classifier for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param label: Column to predict.
    :param columns: List of feature columns.
    :return: DataFrame with prediction run and the fitted model.
    """
    H2OContext.getOrCreate(Spark.instance.spark)

    df_sti = string_to_index(df, input_cols=label)
    df_va = vector_assembler(df_sti, input_cols=columns)

    h2o_gbm = H2OGBM(ratio=0.8,
                     seed=1,
                     featuresCols=columns,
                     labelCol=label,
                     **kargs)
    model = h2o_gbm.fit(df_va)
    df_raw = model.transform(df_va)

    df_pred = df_raw.withColumn("prediction",
                                when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

    return df_pred, model
def h2o_automl(df, label, columns, **kargs):
    """
    Runs H2O AutoML for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param label: Column to predict.
    :param columns: List of feature columns.
    :return: DataFrame with prediction run and the fitted AutoML model.
    """
    H2OContext.getOrCreate(Spark.instance.spark)

    df_sti = string_to_index(df, input_cols=label)
    df_va = vector_assembler(df_sti, input_cols=columns)

    automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=True,
                       maxRuntimeSecs=60,  # 1 minute
                       seed=1,
                       maxModels=3,
                       labelCol=label + "_index",
                       **kargs)

    model = automl.fit(df_va)
    df_raw = model.transform(df_va)

    df_pred = df_raw.withColumn("prediction",
                                when(df_raw.prediction_output["value"] > 0.5, 1.0).otherwise(0.0))

    return df_pred, model
def h2o_deeplearning(df, label, columns, **kargs):
    """
    Runs an H2O deep learning classifier for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param label: Column to predict.
    :param columns: List of feature columns.
    :return: DataFrame with prediction run and the fitted model.
    """
    H2OContext.getOrCreate(Spark.instance.spark)

    df_sti = string_to_index(df, input_cols=label)
    df_va = vector_assembler(df_sti, input_cols=columns)

    h2o_deeplearning = H2ODeepLearning(epochs=10,
                                       seed=1,
                                       l1=0.001,
                                       l2=0.0,
                                       hidden=[200, 200],
                                       featuresCols=columns,
                                       labelCol=label,
                                       **kargs)
    model = h2o_deeplearning.fit(df_va)
    df_raw = model.transform(df_va)

    df_pred = df_raw.withColumn("prediction",
                                when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

    return df_pred, model
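# Hedged usage sketch (editorial addition, not part of the original module): the four
# H2O wrappers above share the same (df, label, columns, **kargs) calling convention.
# A hypothetical binary-label call to h2o_gbm; the DataFrame and column names are
# assumptions, and ntrees is assumed to map to H2OGBM's ntrees parameter, forwarded
# through **kargs.
def _h2o_gbm_usage_example(df_churn):
    df_pred, gbm_model = h2o_gbm(df_churn,
                                 label="churned",
                                 columns=["tenure", "monthly_charges"],
                                 ntrees=50)

    # h2o_gbm thresholds the positive-class probability (p1) at 0.5 to build the
    # "prediction" column, so it can be inspected directly.
    return df_pred.select("churned", "prediction"), gbm_model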
def test_string_to_index_kargs():
    df = op.spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
                                  ["id", "category"])

    df_indexed = fe.string_to_index(df, "category", stringOrderType="frequencyAsc")
    assert_spark_df(df_indexed)

    expected_collect = op.sc.parallelize([
        Row(id=0, category='a', category_index=2.0),
        Row(id=1, category='b', category_index=0.0),
        Row(id=2, category='c', category_index=1.0),
        Row(id=3, category='a', category_index=2.0),
        Row(id=4, category='a', category_index=2.0),
        Row(id=5, category='c', category_index=1.0)
    ]).toDF()

    assert_equal(df_indexed.select("category", "category_index", "id").collect(),
                 expected_collect.collect())
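# Hedged companion example (editorial addition): the same data indexed with
# StringIndexer's default ordering (frequencyDesc), where the most frequent value 'a'
# receives index 0.0. The expected rows follow from the value counts a=3, c=2, b=1;
# the output column name "category_index" matches the test above.
def test_string_to_index_default_order():
    df = op.spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
                                  ["id", "category"])

    df_indexed = fe.string_to_index(df, "category")
    assert_spark_df(df_indexed)

    expected_collect = op.sc.parallelize([
        Row(id=0, category='a', category_index=0.0),
        Row(id=1, category='b', category_index=2.0),
        Row(id=2, category='c', category_index=1.0),
        Row(id=3, category='a', category_index=0.0),
        Row(id=4, category='a', category_index=0.0),
        Row(id=5, category='c', category_index=1.0)
    ]).toDF()

    assert_equal(df_indexed.select("category", "category_index", "id").collect(),
                 expected_collect.collect())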