Example #1
from pyspark.sql.functions import col, lit, when, struct
from pyspark.sql.column import Column
from pyspark.sql import functions as F
from pyspark.sql.dataframe import DataFrame
from framework.feature_factory.feature import Feature
from framework.spark_singleton import SparkSingleton
from framework import feature_factory

from datetime import datetime
from datetime import timedelta
import inspect
from collections import OrderedDict
import logging

logger = logging.getLogger(__name__)
spark = SparkSingleton.get_instance()


class Helpers:
    def __init__(self):
        pass

    def _get_approx_distinct_count_for_col(self,
                                           df: DataFrame,
                                           _dcol: str,
                                           _rsd=0.05):
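        """Estimate the number of distinct values in column `_dcol` using
        Spark's approx_count_distinct (HyperLogLog++) with relative standard
        deviation `_rsd`, and return the single scalar result."""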
        return df.select(F.approx_count_distinct(col(_dcol), rsd=_rsd)) \
            .first()[0]

    def _get_cat_feature_val_col(self, agg_col):
        if agg_col == 1:
            ...  # method body continues beyond this excerpt
Example #2
    def setUp(self):
        self.spark = SparkSingleton.get_instance()
        self.helpers = Helpers()
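
For context, here is a hedged sketch of how this fixture might be exercised in a full test case; the class name HelpersTest, the test data, and the tolerance are assumptions for illustration:

import unittest

class HelpersTest(unittest.TestCase):  # hypothetical enclosing TestCase
    def setUp(self):
        self.spark = SparkSingleton.get_instance()
        self.helpers = Helpers()

    def test_approx_distinct_count(self):
        # 100 rows cycling through 10 keys, so the true distinct count is 10.
        df = self.spark.createDataFrame([(i % 10,) for i in range(100)], ["k"])
        estimate = self.helpers._get_approx_distinct_count_for_col(df, "k")
        # The estimate is probabilistic; allow a small margin around 10.
        self.assertAlmostEqual(estimate, 10, delta=2)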