from pyspark.sql.functions import col, lit, when, struct
from pyspark.sql.column import Column
from pyspark.sql import functions as F
from pyspark.sql.dataframe import DataFrame
from framework.feature_factory.feature import Feature
from framework.spark_singleton import SparkSingleton
from framework import feature_factory
from datetime import datetime
from datetime import timedelta
import inspect
from collections import OrderedDict
import logging

# Module-level logger plus the shared Spark session obtained from the
# project's SparkSingleton; both are created once at import time.
logger = logging.getLogger(__name__)
spark = SparkSingleton.get_instance()


class Helpers:
    """Internal utility methods for inspecting Spark DataFrames in the
    feature factory (all methods here are underscore-prefixed / non-public)."""

    def __init__(self):
        # No instance state; the class only groups helper methods.
        pass

    def _get_approx_distinct_count_for_col(self, df: DataFrame, _dcol: str, _rsd=0.05):
        """Return the approximate distinct count of column ``_dcol`` in ``df``.

        Uses Spark's ``approx_count_distinct`` aggregate with relative
        standard deviation ``_rsd`` (default 0.05). This triggers a Spark
        job: the single-row aggregate result is collected to the driver
        and its first (only) value is returned.
        """
        return df.select(F.approx_count_distinct(col(_dcol), rsd=_rsd)) \
            .rdd.map(lambda row: row[0]).collect()[0]

    def _get_cat_feature_val_col(self, agg_col):
        # NOTE(review): this method is truncated in the visible chunk —
        # only the first condition survives and its suite (and any further
        # branches) lie outside this view; confirm against the full file.
        if agg_col == 1:
def setUp(self):
    """Prepare per-test fixtures: a Helpers instance and the shared Spark session."""
    self.helpers = Helpers()
    self.spark = SparkSingleton.get_instance()