Example #1
def registry_etl(
    spark: SparkSession,
    config: confuse.core.Configuration,
    icd_to_desc_map: pd.DataFrame,
) -> pd.DataFrame:

    registry_claims_bucket = config["buckets"]["registry_claims"].get(str)
    log.info(f"Registry claim bucket: {registry_claims_bucket}")

    log.info(f"Reading in registry claims data")
    registry_rdd = spark.read.parquet(
        registry_claims_bucket.replace("s3:", "s3a:")).withColumnRenamed(
            "patient_id", "registry_id")

    # Select claims around patient reference date and filter out claims before 2017 to remove ICD9 codes
    registry_rdd = (
        registry_rdd.where(  # Filters to claims falling before reference date
            F.col("claim_date") <
            F.date_sub(F.col("reference_date"), 0)).where(
                F.col("claim_date") >
                F.date_sub(F.col("reference_date"),
                           1 * 365))  # TODO [Low] Move time length into YAML
        .where(F.col("reference_date") >
               F.lit("2017-01-01"))  # TODO [Low] Move cut-off date into YAML
    )
    registry_id_count = registry_rdd.select("registry_id").distinct().count()
    log.info(f"Registry ID Count: {registry_id_count}")

    registry_count_min = config["etl"]["registry_count_min"].get(int)
    log.info(f"ETL of registry data and bringing to pandas")
    registry_df = (registry_rdd.select(
        "registry_id",
        F.explode(F.col("dx_list")).alias("code")).where(
            F.col("code").isNotNull()).groupBy("code").agg(
                F.collect_set(F.col("registry_id")).alias("registry_ids"),
                F.countDistinct(F.col("registry_id")).alias("registry_count"),
            ).where(F.col("registry_count") > registry_count_min).withColumn(
                "registry_rate",
                F.col("registry_count") /
                F.lit(registry_id_count))).toPandas()

    # TODO [Medium] Move below this into Spark
    log.info(f"Registry ETL pandas operations")
    registry_df.drop(
        registry_df.index[~registry_df.code.isin(icd_to_desc_map.code)],
        inplace=True)
    registry_df.sort_values("code").reset_index(drop=True, inplace=True)
    n_total_test = int(
        round(
            registry_df["registry_count"].iloc[0] /
            registry_df["registry_rate"].iloc[0],
            0,
        ))
    log.info(f"N Total Test: {n_total_test}")
    registry_df["n_total_test"] = n_total_test
    return registry_df
Example #2
def get_previous_bus_day(df_calendar: pyspark.sql.DataFrame, p_end_dt, col_name: str):
    """
    Return the previous business day key relative to the current day (p_end_dt).
    """
    df_calendar = df_calendar \
        .select(col_name) \
        .where(F.col("bus_day_flg") == F.lit("Y")) \
        .filter(F.col("cal_day").between(F.to_timestamp(F.date_sub(F.lit(p_end_dt), 7), "yyyy-MM-dd"),
                                         F.to_timestamp(F.date_sub(F.lit(p_end_dt), 1), "yyyy-MM-dd"))) \
        .orderBy(F.col("cal_day").desc()).collect()

    return df_calendar[0][col_name]
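# Illustrative usage sketch (not from the original source): it assumes an active
# SparkSession named `spark`, `pyspark.sql.functions` imported as F, and a hypothetical
# calendar table with cal_day, bus_day_flg and day_key columns (day_key plays col_name).
calendar_rows = [
    ("2021-12-23", "Y", 20211223),
    ("2021-12-24", "Y", 20211224),
    ("2021-12-25", "N", 20211225),
    ("2021-12-26", "N", 20211226),
]
df_cal = spark.createDataFrame(
    calendar_rows, "cal_day string, bus_day_flg string, day_key int"
).withColumn("cal_day", F.to_date("cal_day"))
print(get_previous_bus_day(df_cal, "2021-12-27", "day_key"))  # expected: 20211224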
Example #3
    def test_date_sub_function(self):
        dt = datetime.date(2021, 12, 27)

        # Note: a plain Python int is converted to a LongType column, which date_sub
        # does not support, so cast to integer explicitly
        df = self.spark.createDataFrame([Row(date=dt, sub=2)],
                                        "date date, sub integer")

        self.assertTrue(
            all(
                df.select(
                    date_sub(df.date, df.sub) == datetime.date(2021, 12, 25),
                    date_sub(df.date, "sub") == datetime.date(2021, 12, 25),
                    date_sub(df.date, 3) == datetime.date(2021, 12, 24),
                ).first()))
Example #4
        def cal_F1(submit, fault_tag_df):
            npp = submit.count()
            tmp = submit.join(fault_tag_df,
                              on=['model', 'serial_number'],
                              how='left')
            ntpp = tmp.filter(F.col('fault_time').isNotNull()).filter(
                F.date_sub(tmp['fault_time'], 30) <= tmp['dt']).count()
            precision = ntpp / (npp + 1e-20)

            pr_df = fault_tag_df.filter(
                fault_tag_df['fault_time'] >= test1_begin_date).filter(
                    fault_tag_df['fault_time'] < test1_end_date).repartition(
                        1)  # .filter(F.col('model')==1)
            npr = pr_df.count()

            ntpr = pr_df.join(submit, on=['model', 'serial_number'], how='left') \
                .filter(F.col('dt').isNotNull()).filter(F.col('fault_time') >= F.col('dt')).count()
            recall = ntpr / (npr + 1e-20)

            F1_score = 2 * precision * recall / (precision + recall + 1e-10)
            return ({
                'npp': npp,
                'ntpp': ntpp,
                'npr': npr,
                'ntpr': ntpr,
                'precision': precision,
                'recall': recall,
                'F1': F1_score
            })
Example #5
def county_reality_supply():
    # Last week's allocation of this item spec for each county and customer tier
    try:
        print(f"{str(dt.now())} last-week allocation of this item spec by county and tier")
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        co_cust = get_co_cust(spark).select("cust_id", "sale_center_id",
                                            "cust_seg")

        area = get_area(spark)
        # Mapping from com_id to city
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # Mapping from sale_center_id to its counties (as a list)
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")

        # Get last week's actual allocation
        # cust_item_spw = spark.sql(
        #     "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
        #     .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
        #     .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
        #     .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7)) \
        #     .withColumn("last_sun", f.date_add(col("last_mon"), 6)) \
        #     .where((col("begin_date") == col("last_mon")) & (col("end_date") == col("last_sun")))\
        #     .join(co_cust,"cust_id")

        cust_item_spw = spark.sql(
            "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
            .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
            .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
            .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7 * 4)) \
            .withColumn("last_sun", f.date_add(col("last_mon"), 6 + 7 * 3)) \
            .where((col("begin_date") >= col("last_mon")) & (col("end_date") <= col("last_sun")))\
            .join(co_cust,"cust_id")

        # Column name of the value to compute
        colName = "county_gauge_week_volume"
        result = cust_item_spw.groupBy("com_id","sale_center_id","cust_seg", "item_id") \
                                .agg(f.sum("qty_allocco").alias(colName))

        columns = [
            "com_id", "city", "sale_center_id", "county", "gears", "gauge_id",
            "gauge_name", "city", "gears_data_marker", colName
        ]
        result.withColumn("row", f.concat_ws("_", col("sale_center_id"),col("cust_seg"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("4")) \
            .join(plm_item, "item_id") \
            .join(city, "com_id") \
            .join(county,"sale_center_id")\
            .withColumnRenamed("item_id","gauge_id")\
            .withColumnRenamed("item_name","gauge_name")\
            .withColumnRenamed("cust_seg","gears")\
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()
Example #6
def read_buste_data_v3(filepath, sqlContext):
    data_frame = sqlContext.read.csv(filepath,
                                     header=True,
                                     inferSchema=True,
                                     nullValue="-")
    date = "-".join(filepath.split("/")[-1].split("_")[:3])
    data_frame = data_frame.withColumn("date", F.lit(date))
    data_frame = data_frame.withColumn(
        "date", F.unix_timestamp(F.date_sub(F.col("date"), 1), 'yyyy-MM-dd'))
    return data_frame
Example #7
def get_listens_for_rec_generation_window(mapped_df):
    """ Get listens to fetch top artists.

        Args:
            mapped_df (dataframe): Dataframe with all the columns/fields that a typical listen has.
    """
    df = mapped_df.select('*') \
        .where((col('listened_at') >= to_timestamp(date_sub(current_timestamp(),
        config.RECOMMENDATION_GENERATION_WINDOW))) & (col('listened_at') <= current_timestamp()))
    return df
Example #8
def read_hdfs_folder(sqlContext, folderpath):
    data_frame = sqlContext.read.csv(folderpath,
                                     header=True,
                                     inferSchema=True,
                                     nullValue="-")
    data_frame = rename_columns(data_frame, [
        ("cardNum18", "cardNum"),
        ("cardNum19", "userGender"),
    ])
    date = "-".join(folderpath.split("/")[-2].split("_")[:3])
    data_frame = data_frame.withColumn("date", F.lit(date))
    data_frame = data_frame.withColumn("date", F.date_sub(F.col("date"), 1))
    return data_frame
Example #9
def clear_periods_intersections(source: DataFrame,
                                columns_subset: List[str]) -> DataFrame:
    """
    Merge intersecting or contiguous "start_date - end_date" periods within each group
    defined by the subset of columns, collapsing them into continuous periods

    source:
    +---+----------+----------+
    |key|start_date|  end_date|
    +---+----------+----------+
    |  5|2018-07-01|2018-09-01|
    |  5|2018-08-15|2018-09-13|
    |  5|2018-07-20|2018-08-30|
    +---+----------+----------+

    result:
    +---+----------+----------+
    |key|start_date|  end_date|
    +---+----------+----------+
    |  5|2018-07-01|2018-09-13|
    +---+----------+----------+

    :param source: source data frame.
    :param columns_subset: subset of DataFrame columns within which periods must not intersect
    """

    custom_window = Window.partitionBy(*columns_subset).orderBy('start_date')

    result = source \
        .withColumn('lag_end', F.lag('end_date').over(custom_window)) \
        .withColumn('flg',
                    F.when(F.col('lag_end') < F.date_sub(F.col('start_date'), 1), 1)
                    .otherwise(0)) \
        .withColumn('grp', F.sum('flg').over(custom_window)) \
        .groupBy(*columns_subset, 'grp') \
        .agg(F.min('start_date').alias('start_date'),
             F.max('end_date').alias('end_date')) \
        .drop('grp')

    return result
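# Illustrative usage sketch (not from the original source), reproducing the docstring
# example; it assumes an active SparkSession named `spark` and pyspark.sql.functions as F.
source_df = spark.createDataFrame(
    [(5, "2018-07-01", "2018-09-01"),
     (5, "2018-08-15", "2018-09-13"),
     (5, "2018-07-20", "2018-08-30")],
    "key int, start_date string, end_date string",
).withColumn("start_date", F.to_date("start_date")) \
 .withColumn("end_date", F.to_date("end_date"))

clear_periods_intersections(source_df, ["key"]).show()
# expected single row: key=5, start_date=2018-07-01, end_date=2018-09-13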
Example #10
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, current_timestamp, date_sub, date_add, datediff, months_between, to_date, lit
spark = SparkSession.builder.appName("Pyspark example").getOrCreate()
dateDf = spark.range(10).withColumn("Today", current_date()).withColumn(
    "Now", current_timestamp())
dateDf.createOrReplaceTempView("dftable")
dateDf.show()
dateDf.printSchema()

#'date_sub' and 'date_add' can be used to subtract days from and add days to a particular date
dateDf.select(date_sub(col("Today"), 5), date_add(col("Today"), 4)).show(2)

#'datediff' and 'months_between' give the number of days and number of months between two dates
dateDf.withColumn("week_ago",
                  date_sub(col("Today"),
                           7)).select(datediff(col("week_ago"),
                                               col("Today"))).show(1)

dateformat = "yyyy-dd-MM"
dateDF1 = spark.range(1).select(
    to_date(lit("2020-27-08"), dateformat).alias("date"),
    to_date(lit("2020-26-08"), dateformat).alias("date1"))
dateDF1.createOrReplaceTempView("dataTable2")
dateDF1.show()
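# Illustrative follow-up (not in the original snippet): applying 'datediff' and
# 'months_between' to the two dates parsed into dateDF1 above.
dateDF1.select(
    datediff(col("date"), col("date1")).alias("diff_days"),
    months_between(col("date"), col("date1")).alias("diff_months")).show()
# 2020-08-27 vs 2020-08-26 -> 1 day apart, roughly 0.03 months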
Example #11
def clean_columns(prepared_clients, effective_version, start_ds):
    """Clean columns in preparation for aggregation.

    This removes invalid values, tidies up dates, and limits the scope of
    several dimensions.

    :param prepared_clients:    `main_summary` rows that conform to the
                                schema of `prepare_client_rows(...)`
    :param effective_version:   DataFrame mapping dates to the active Firefox
                                version that was distributed at that time
    :param start_ds:            DateString to determine whether a row that is
                                being processed is active during the current
                                week
    :returns:                   DataFrame with cleaned columns and rows
    """

    # Temporary column used for determining the validity of a row
    is_valid = "_is_valid"

    pcd = F.from_unixtime(F.col("profile_creation_date") * SECONDS_PER_DAY)
    client_date = utils.to_datetime('subsession_start_date', "yyyy-MM-dd")
    days_since_creation = F.datediff(client_date, pcd)

    is_funnelcake = F.col('distribution_id').rlike("^mozilla[0-9]+.*$")

    attr_mapping = {
        'distribution_id':
        None,
        'default_search_engine':
        None,
        'locale':
        None,
        'subsession_start':
        client_date,
        'channel': (F.when(
            is_funnelcake,
            F.concat(F.col("normalized_channel"), F.lit("-cck-"),
                     F.col("distribution_id"))).otherwise(
                         F.col(("normalized_channel")))),
        'geo':
        in_top_countries("country"),
        # Bug 1289573: Support values like "mozilla86" and "mozilla86-utility-existing"
        'is_funnelcake': (F.when(is_funnelcake,
                                 F.lit("yes")).otherwise(F.lit("no"))),
        'acquisition_period':
        F.date_format(F.date_sub(F.next_day(pcd, 'Sun'), 7), "yyyy-MM-dd"),
        'sync_usage':
        sync_usage("sync_count_desktop", "sync_count_mobile",
                   "sync_configured"),
        'current_version':
        F.col("app_version"),
        'current_week': (
            # -1 is a placeholder for bad data
            F.when(days_since_creation < 0,
                   F.lit(-1)).otherwise(F.floor(days_since_creation / 7)
                                        ).cast("long")),
        'source':
        F.col('attribution.source'),
        'medium':
        F.col('attribution.medium'),
        'campaign':
        F.col('attribution.campaign'),
        'content':
        F.col('attribution.content'),
        'is_active': (F.when(client_date < utils.to_datetime(F.lit(start_ds)),
                             F.lit("no")).otherwise(F.lit("yes"))),
    }

    usage_hours = F.col('usage_seconds') / SECONDS_PER_HOUR
    metric_mapping = {
        'n_profiles': F.lit(1),
        'total_uri_count': None,
        'unique_domains_count_per_profile': None,
        'usage_hours': usage_hours,
        'sum_squared_usage_hours': F.pow(usage_hours, 2),
    }

    # Set the attributes to null if it's invalid
    select_attr = utils.build_col_expr({
        attr: F.when(F.col(is_valid), expr).otherwise(F.lit(None))
        for attr, expr in utils.preprocess_col_expr(attr_mapping).items()
    })
    select_metrics = utils.build_col_expr(metric_mapping)
    select_expr = select_attr + select_metrics

    cleaned_data = (
        # Compile per-client rows for the current retention period
        prepared_clients
        # Filter out seemingly impossible rows. One very obvious notion
        # is to make sure that a profile is always created before a sub-session.
        # Unlike `sane_date` in previous versions, this is idempotent and only
        # depends on the data.
        .withColumn(
            "profile_creation", F.date_format(pcd, 'yyyy-MM-dd')).withColumn(
                is_valid,
                (F.col("profile_creation").isNotNull() &
                 (F.col("profile_creation") > DEFAULT_DATE) &
                 (pcd <= client_date))).select(
                     # avoid acquisition dates in the future
                     (F.when(
                         F.col(is_valid), F.col("profile_creation")).otherwise(
                             F.lit(None)).alias("profile_creation")),
                     *select_expr)
        # Set default values for the rows
        .fillna({
            'acquisition_period': DEFAULT_DATE,
            'is_funnelcake': "no",
            "current_week": -1,
        }).fillna(0).fillna(0.0).fillna('unknown'))
    result = release.with_effective_version(cleaned_data, effective_version,
                                            "profile_creation")

    return result
Example #12
def df_smartstore_sourcing_one_day_ago(df_smartstore_sourcing):
    yield df_smartstore_sourcing.withColumn(
        DATE_ID_COLUMN_NAME, date_sub(col(DATE_ID_COLUMN_NAME), 1)
    )
Example #13
# 4. Deduplicate:
df.select('name').dropDuplicates().show()

# 5. Split a string:
df.withColumn("splited_name",split(df_webpages["name"],",")[1]).show()


"""操作Date"""
from pyspark.sql.functions import current_date, current_timestamp

dateDF = spark.range(10).withColumn("today",current_date())\
                        .withColumn("now",current_timestamp())

from pyspark.sql.functions import date_add, date_sub
dateDF.select(date_sub(col("today"),5), date_add(col("today"),5))

# Date differences
from pyspark.sql.functions import to_date, datediff, months_between
dateDF.select(to_date(lit("2020-06-01")).alias("start"),\
              to_date(lit("2022-06-01")).alias("end"))\
              .select(months_between(col("start"),col("end"))).show(1)

"""處理遺漏值"""
df.na.drop()
df.na.fill(0)

"""處理多型態"""
# array是否包含某個值 array_contains()

# first() last()
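# Illustrative sketch for the two notes above (not in the original): array_contains on an
# array column, plus first()/last() in an aggregation; assumes an active `spark` session.
from pyspark.sql.functions import array_contains, first, last
tags_df = spark.createDataFrame([(1, ["spark", "sql"]), (2, ["pandas"])], ["id", "tags"])
tags_df.select("id", array_contains("tags", "spark").alias("has_spark")).show()
# first/last are order-dependent; without an explicit sort they are non-deterministic
tags_df.agg(first("id").alias("first_id"), last("id").alias("last_id")).show()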
Example #14
#for debugging
print(corona_in.printSchema())     
corona_in.show(10)

#select only the necessary columns
corona = corona_in.select(to_date('dateRep', format='dd/MM/yyyy').alias('date'), 
                            corona_in.cases,
                            corona_in.deaths,
                            corona_in.countriesAndTerritories.alias('country'),
                            corona_in.continentExp.alias('continent'))

corona0 = corona.filter(corona.continent == "Europe")
corona0.show(50)

#calculate the relative difference to the previous day for the number of cases and deaths
corona1 = corona0.withColumn("prev_date", date_sub(corona.date, 1))

#register dataframe as a temporary view for querying and transforming it
corona1.createOrReplaceTempView("corona_out")
corona1.show(10)

corona_out1 = spark.sql("SELECT cor1.date, cor1.cases, cor1.deaths, cor1.country, cor1.continent, \
                        cor2.cases as cases_prev, cor2.deaths as dead_prev \
                        FROM corona_out as cor1  \
                        LEFT OUTER JOIN  corona_out as cor2 on cor2.date = cor1.prev_date and cor1.country=cor2.country ")

corona_out = corona_out1.withColumn("cases_rel_diff", expr("(cases-cases_prev)/cases_prev"))\
                        .withColumn("deaths_rel_diff", expr("(deaths-dead_prev)/dead_prev"))
#for debugging
corona_out.show(10)
Example #15
def get_item_stats_info_weekly():
    # Last-week sales of each cigarette by city/county, with week-over-week and year-over-year comparisons
    try:
        # cigarette id and cigarette name
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        area = get_area(spark)
        # Mapping from com_id to city
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # Mapping from sale_center_id to its counties (as a list)
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")

        # Values for the marker column
        markers = ["1", "3"]
        # Aggregate either by city or by county
        groups = ["com_id", "sale_center_id"]
        joins = [city, county]
        # Columns other than the computed values
        cols_comm = [["city", "gauge_id", "gauge_name", "ciga_data_marker"],
                     [
                         "county", "sale_center_id", "gauge_id", "gauge_name",
                         "ciga_data_marker"
                     ]]
        # Column names of the values to compute
        cols = [[
            "gauge_city_week_sales", "gauge_city_week_sales_last_year",
            "gauge_city_week2_sales"
        ],
                [
                    "gauge_county_week_sales",
                    "gauge_county_week_sales_last_year",
                    "gauge_county_week2_sales"
                ]]

        co_co_line = get_co_co_line(spark, scope=[1, 2], filter="week") \
            .select("item_id", "com_id", "sale_center_id", "qty_ord", "week_diff")

        co_co_line.cache()

        last_year = spark.sql("select  * from DB2_DB2INST1_CO_CO_LINE") \
            .where(col("com_id").isin(cities)) \
            .withColumn("born_date", f.to_date("born_date", "yyyyMMdd")) \
            .withColumn("last_year_today", f.date_sub(f.current_date(), 365)) \
            .withColumn("week_diff", week_diff("last_year_today", "born_date")) \
            .where(col("week_diff") == 1) \
            .withColumn("qty_ord", col("qty_ord").cast("float")) \
            .select("com_id","sale_center_id", "item_id", "qty_ord")
        last_year.cache()
        for i in range(len(groups)):
            group = groups[i]
            join = joins[i]
            c = cols[i]
            marker = markers[i]
            try:
                # Get last week's order-line data
                last_week = co_co_line.where(col("week_diff") == 1)

                # 1. Order quantity of each cigarette by city/county
                print(f"{str(dt.now())}  last week's order quantity of each cigarette by {group}")
                qty_amt = last_week.groupBy(group, "item_id").agg(
                    f.sum(col("qty_ord")).alias(c[0]))

                # ------------------- week-over-week
                # Get order-line data for the week before last
                last_two_week = co_co_line.where(col("week_diff") == 2)

                # 2. Sales of each cigarette by city/county, previous period
                print(f"{str(dt.now())}  order quantity of each cigarette by {group} for the week before last")
                qty_amt_last = last_two_week.groupBy(group, "item_id").agg(
                    f.sum(col("qty_ord")).alias(c[1]))

                # # 2. Week-over-week change in sales of each cigarette by city/county
                # print(f"{str(dt.now())}  week-over-week sales change of each cigarette by {group}")
                # qty_amt_last.join(qty_amt, [group, "item_id"]) \
                #     .withColumn("qty_ring_rate", period_udf(col("qty_ord"), col("qty_ord_last")))

                # -------------------- year-over-year

                # 3. Order quantity of each cigarette by city/county, same period last year
                print(f"{str(dt.now())}  order quantity of each cigarette by {group} for the same period last year")
                qty_ord_ly = last_year.groupBy(group, "item_id") \
                    .agg(f.sum(col("qty_ord")).alias(c[2]))

                # # 3. Year-over-year change in sales of each cigarette by city/county
                # print(f"{str(dt.now())}  year-over-year sales change of each cigarette by {group}")
                # qty_ord_ly.join(qty_amt, [group, "item_id"]) \
                #     .withColumn("qty_ord_yoy", period_udf(col("qty_ord"), col("qty_ord_ly")))

                all_df = qty_amt.join(qty_amt_last, [group, "item_id"], "outer") \
                    .join(qty_ord_ly, [group, "item_id"], "outer")\
                    .na.fill(0,c)

                columns = cols_comm[i] + c
                all_df.withColumn("row", f.concat_ws("_", col(group), col("item_id"))) \
                    .join(plm_item, "item_id") \
                    .join(join, group) \
                    .withColumnRenamed("item_id", "gauge_id") \
                    .withColumnRenamed("item_name", "gauge_name") \
                    .withColumn("ciga_data_marker", f.lit(marker)) \
                    .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
            except Exception:
                tb.print_exc()
        co_co_line.unpersist()
        last_year.unpersist()
    except Exception:
        tb.print_exc()
Example #16

'''Now we drop the year, month, day, hour, minute, date and time columns, as we will recreate them from the timestamp column we created'''
df_nycflights = df_nycflights. \
                drop('year'). \
                drop('month'). \
                drop('day'). \
                drop('hour'). \
                drop('minute'). \
                drop('date'). \
                drop('time')

df_nycflights.show() 

'''Now we extract the fields back'''
df_nycflights = df_nycflights. \
                withColumn('year',year(df_nycflights.timestamp)). \
                withColumn('month',month(df_nycflights.timestamp)). \
                withColumn('day',dayofmonth(df_nycflights.timestamp)). \
                withColumn('hour',hour(df_nycflights.timestamp)). \
                withColumn('minute',minute(df_nycflights.timestamp))  

df_nycflights.show()

'''Now a few operations on the timestamp'''
df_nycflights = df_nycflights.\
                withColumn('date_sub',date_sub(df_nycflights.timestamp ,10)). \
                withColumn('date_add',date_add(df_nycflights.timestamp ,10)). \
                withColumn('months_between',months_between(df_nycflights.timestamp,df_nycflights.timestamp))

df_nycflights.show()                 
Example #17
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_date, current_date, \
    current_timestamp, date_add, date_sub, datediff, months_between, to_timestamp, hour

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()

    spark.range(5).withColumn('date', to_date(lit('2019-01-01'))).show()

    spark.read.jdbc

    spark.range(5)\
         .select(current_date().alias('date'), current_timestamp().alias('timestamp'))\
         .select(date_add(col('date'), 1), date_sub(col('timestamp'), 1)).show()

    spark.range(5).select(to_date(lit('2019-01-01')).alias('date1'),
                          to_date(lit('2019-01-05')).alias('date2'))\
                  .select(datediff(col('date2'), col('date1')),
                          months_between(col('date2'), col('date1'))).show()

    spark.range(5).withColumn('date', to_date(
        lit('2019-XX-XX'))).show()  # Does not raise an exception

    spark.range(5).withColumn('date_comp1', to_date(lit('2019-01-01')) > to_date(lit('2019-01-02'))) \
                  .withColumn('date_comp2', to_date(lit('2019-01-01')) > to_timestamp(lit('2019-01-02'))) \
        .withColumn('date_comp3', to_date(lit('2019-01-01')) > "2019-01-02") \
                  .withColumn('date_comp3', to_date(lit('2019-01-01')) > "'2019-01-02'").show()

    spark.range(5).select(current_timestamp().alias("timestamp")).select(
        hour(col('timestamp')))
Example #18
# There are dates, which focus exclusively on calendar dates, and timestamps, which include both date and time
#   - dates focus on calendar dates
#   - timestamps focus on both date and time
# Spark will make a best effort to correctly identify column types, including dates and timestamps when we enable inferSchema
# Spark can be a bit particular about what format you have at any given point in time
# It's important to be explicit when parsing or converting to ensure that there are not issues in doing so

from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date

dateDF = spark.range(10).alias('number')\
        .withColumn('today', current_date())\
        .withColumn('now', current_timestamp())
dateDF.createOrReplaceTempView('dateTable')
dateDF.printSchema()

dateDF.select(date_sub(col('today'), 5), date_add(col('today'), 5)).show(1)

dateDF.selectExpr(
        'date_sub(today, 5)',
        'date_add(today, 5)'
        ).show(1)
# select date_sub(today, 5), date_add(today, 5) from dateTable

# Another task is to take a look at the difference between two dates. We can do this with datediff function
dateDF.withColumn('week_ago', date_sub(col('today'), 7))\
        .select(datediff(col('week_ago'), col('today'))).show(1)

dateDF.withColumn('week_ago', date_sub(col('today'), 7))\
        .selectExpr('datediff(week_ago, today)').show(1)

spark.sql('select datediff(week_ago, today) from (select *, date_sub(today, 7) as week_ago from dateTable)')
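# A brief illustrative sketch of the "be explicit when parsing" note above (not part of
# the original snippet): pass an explicit format string to to_date instead of relying on
# the default parser.
from pyspark.sql.functions import lit, to_date
spark.range(1).select(to_date(lit('2017-12-11'), 'yyyy-MM-dd').alias('parsed_date')).show(1)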
Example #19
def process_sdf(sdf_drive, sdf_vehicle):
  sdf_join_drive_vehicle = sdf_drive.alias("drive").join(sdf_vehicle.alias("vehicle"), ["vehicle_id"])
  sdf_join_drive_vehicle_fillna = sdf_join_drive_vehicle.fillna(0)
  sdf_drive_start_of_week = sdf_join_drive_vehicle_fillna.withColumn("week_start_date", \
                                                                     (F.date_sub(F.next_day(
                                                                       F.from_utc_timestamp(F.col("datetime"),
                                                                                            "America/New_York"),
                                                                       'monday'), 7)))

  sdf_Active_horsepower =  sdf_drive_start_of_week.withColumn("Active_horsepower" ,  (F.col("eng_load") / 255) \
                                                              * (F.col("max_torque") * F.col("rpm"))  / 5252)

  # Horsepower utilization – Active horsepower / Max Horsepower
  sdf_Horsepower_utilization = sdf_Active_horsepower.withColumn("Horsepower_utilization", F.col("Active_horsepower") / F.col("max_horsepower"))

  # # Torque Utilization - calculated as Engine load/ 255
  sdf_Torque_Utilization = sdf_Horsepower_utilization.withColumn("Torque_Utilization", F.col("eng_load") / 255)

  # # RPM Utilization – RPM / Maximum horsepower rpm
  sdf_RPM_Utilization = sdf_Torque_Utilization.withColumn("RPM_Utilization", F.col("rpm") / F.col("max_horsepower_rpm") )

  sdf_engine_features = sdf_RPM_Utilization.withColumn("ft_torque_util_60pct_s",
                                                       F.when((F.col("Torque_Utilization") >= 0.6) \
                                                              & (F.col("Torque_Utilization") < 0.7), \
                                                              F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_torque_util_70pct_s", F.when((F.col("Torque_Utilization") >= 0.7) \
                                                 & (F.col("Torque_Utilization") < 0.8), \
                                                 F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_torque_util_80pct_s", F.when((F.col("Torque_Utilization") >= 0.8) \
                                                 & (F.col("Torque_Utilization") < 0.9), \
                                                 F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_torque_util_90pct_s", F.when((F.col("Torque_Utilization") >= 0.9) \
                                                 & (F.col("Torque_Utilization") < 1), \
                                                 F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_horsepower_util_50pct_s", F.when((F.col("Horsepower_utilization") >= 0.5) \
                                                     & (F.col("Horsepower_utilization") < 0.6), \
                                                     F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_horsepower_util_60pct_s", F.when((F.col("Horsepower_utilization") >= 0.6) \
                                                     & (F.col("Horsepower_utilization") < 0.7), \
                                                     F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_horsepower_util_70pct_s", F.when((F.col("Horsepower_utilization") >= 0.7) \
                                                     & (F.col("Horsepower_utilization") < 0.8), \
                                                     F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_horsepower_util_80pct_s", F.when((F.col("Horsepower_utilization") >= 0.8) \
                                                     & (F.col("Horsepower_utilization") < 0.9), \
                                                     F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_rpm_util_50pct_s", F.when((F.col("RPM_Utilization") >= 0.5) \
                                              & (F.col("RPM_Utilization") < 0.6), \
                                              F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_rpm_util_60pct_s", F.when((F.col("RPM_Utilization") >= 0.6) \
                                              & (F.col("RPM_Utilization") < 0.7), \
                                              F.lit(1)).otherwise(F.lit(0)))

  sdf_engine_features_total = sdf_engine_features.select("vehicle_id", "week_start_date", "datetime", \
                                                                 "ft_torque_util_60pct_s", "ft_torque_util_70pct_s",
                                                                 "ft_torque_util_80pct_s", "ft_torque_util_90pct_s", \
                                                                 "ft_horsepower_util_50pct_s",
                                                                 "ft_horsepower_util_60pct_s",
                                                                 "ft_horsepower_util_70pct_s",
                                                                 "ft_horsepower_util_80pct_s", \
                                                                 "ft_rpm_util_50pct_s", "ft_rpm_util_60pct_s")

  sdf_sdf_engine_features_agg = sdf_engine_features_total.groupBy("vehicle_id", "week_start_date") \
    .agg(F.sum("ft_torque_util_60pct_s").alias("ft_torque_util_60pct_s"), \
         F.sum("ft_torque_util_70pct_s").alias("ft_torque_util_70pct_s"), \
         F.sum("ft_torque_util_80pct_s").alias("ft_torque_util_80pct_s"), \
         F.sum("ft_torque_util_90pct_s").alias("ft_torque_util_90pct_s"), \
         F.sum("ft_horsepower_util_50pct_s").alias("ft_horsepower_util_50pct_s"), \
         F.min("ft_horsepower_util_60pct_s").alias("ft_horsepower_util_60pct_s"), \
         F.min("ft_horsepower_util_70pct_s").alias("ft_horsepower_util_70pct_s"), \
         F.min("ft_horsepower_util_80pct_s").alias("ft_horsepower_util_80pct_s"), \
         F.min("ft_rpm_util_50pct_s").alias("ft_rpm_util_50pct_s"), \
         F.min("ft_rpm_util_60pct_s").alias("ft_rpm_util_60pct_s"), )

  sdf_sdf_engine_features_final = sdf_sdf_engine_features_agg.select("vehicle_id", "week_start_date", \
                                                                     "ft_torque_util_60pct_s", "ft_torque_util_70pct_s",
                                                                     "ft_torque_util_80pct_s", "ft_torque_util_90pct_s", \
                                                                     "ft_horsepower_util_50pct_s",
                                                                     "ft_horsepower_util_60pct_s",
                                                                     "ft_horsepower_util_70pct_s",
                                                                     "ft_horsepower_util_80pct_s", \
                                                                     "ft_rpm_util_50pct_s", "ft_rpm_util_60pct_s")

  sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.sort(F.col("vehicle_id"), F.col("week_start_date"))

  sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.withColumn("week_start_date",
                                                                           F.date_format(F.col("week_start_date"),
                                                                                         "yyyy-MM-dd"))

  sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.fillna(0)

  return sdf_sdf_engine_features_final
Example #20
# In[15]:


df.printSchema()


# In[16]:


df.show()


# In[17]:


df.withColumn("date_sub_10",F.date_sub("date",10)).show()


# In[18]:


df.withColumn("date_add_10",F.date_add("date",20)).show()


# In[ ]:


# difference in days between today and date columns


# In[19]:
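

# Illustrative completion of the cell above (not in the original notebook): the difference
# in days between today and the "date" column, assuming the same df and F alias as the
# earlier cells.
df.withColumn("days_from_today", F.datediff(F.current_date(), "date")).show()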
Example #21
retailOri = retail.select(col('CustomerID'),totalQty.alias('OriginalQty'))
# Spark SQL equivalent: select *, POWER((UnitPrice*Quantity), 2.0) + 5 as OriginalQty from dfTable
df = df.select(round(lit('2.5')),bround(lit('2.5'))) #from pyspark.sql.functions import lit,round,bround ans: 3(round),2(bround)

#pearson correlation (-1 to 1): 1 means the two are linearly correlated, -1 means inversely related, 0 means no relation between the two
from pyspark.sql.functions import corr
corrr = retailOri.select(corr('Quantity','UnitPrice'))

#regular expressions
retailexp = retail.select(regexp_replace(col('Description'),'BLUE|ADVENT|WHITE','color'))
retailext = retail.select(regexp_extract(col('Description'),'(BLUE|ADVENT)',1))

#date&Time
from pyspark.sql.functions import current_date,current_timestamp,date_add,date_sub,datediff,months_between,to_date
df = spark.range(5).withColumn('Today',current_date()).withColumn('now',current_timestamp())
dfdiff = df.withColumn('weekAgo',date_sub(col('Today'),7)).withColumn('weekLater',date_add(col('Today'),7)).select('*')
df = df.withColumn('diff',datediff(c1,c2))
dateFormat = 'yyyy-dd-MM'
dff = spark.range(1).withColumn('dates',to_date(lit("2019-20-12"),dateFormat)).select('dates')
dff = spark.range(1).withColumn('dates',to_timestamp(lit("2019-20-12"),dateFormat)).select('dates')

#with nulls
df.na.drop('all')#a row is dropped only if all of its values are null
df.na.drop('any')#a row is dropped if any of its values is null
df.na.drop('all',subset=['c1','c2'])
#fill
fillALlCols = {'StockCode':5,'Description':'It is null'}
df.na.fill(fillALlCols)
#replace
df.replace('[]',['Its null'],'Description')
complexDf = retail.select(struct('InvoiceNo','UnitPrice').alias('Complex'))
Example #22
sc = SparkContext()
spark = SparkSession(sc)

dateDF = spark.range(10)\
    .withColumn("today", current_date())\
    .withColumn("now", current_timestamp())

dateDF.createTempView("dateTable")

dateDF.printSchema()

#1 take days or add days
print("1")
dateDF.select(
    date_sub(col("today"), 5),
    date_add(col("today"), 5))\
.show(1)

#2 date diference
print("2")
dateDF\
    .withColumn("week_ago", date_sub(col("today"), 7))\
    .select(datediff(col("week_ago"), col("today")))\
    .show(1)

#3 months between two dates
print("3")
dateDF\
    .select(
    to_date(lit("2016-01-01")).alias("start"),
Example #23
    def _bin_time_stamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column:
        sql_utils = SparkContext._active_spark_context._jvm.PythonSQLUtils
        origin_scol = F.lit(origin)
        (rule_code, n) = (self._offset.rule_code, self._offset.n
                          )  # type: ignore[attr-defined]
        left_closed, right_closed = (self._closed == "left",
                                     self._closed == "right")
        left_labeled, right_labeled = (self._label == "left",
                                       self._label == "right")

        if rule_code == "A-DEC":
            assert (origin.month == 12 and origin.day == 31
                    and origin.hour == 0 and origin.minute == 0
                    and origin.second == 0)

            diff = F.year(ts_scol) - F.year(origin_scol)
            mod = F.lit(0) if n == 1 else (diff % n)
            edge_cond = (mod == 0) & (F.month(ts_scol)
                                      == 12) & (F.dayofmonth(ts_scol) == 31)

            edge_label = F.year(ts_scol)
            if left_closed and right_labeled:
                edge_label += n
            elif right_closed and left_labeled:
                edge_label -= n

            if left_labeled:
                non_edge_label = F.when(mod == 0,
                                        F.year(ts_scol) -
                                        n).otherwise(F.year(ts_scol) - mod)
            else:
                non_edge_label = F.when(
                    mod == 0,
                    F.year(ts_scol)).otherwise(F.year(ts_scol) - (mod - n))

            return F.to_timestamp(
                F.make_date(
                    F.when(edge_cond, edge_label).otherwise(non_edge_label),
                    F.lit(12), F.lit(31)))

        elif rule_code == "M":
            assert (origin.is_month_end and origin.hour == 0
                    and origin.minute == 0 and origin.second == 0)

            diff = ((F.year(ts_scol) - F.year(origin_scol)) * 12 +
                    F.month(ts_scol) - F.month(origin_scol))
            mod = F.lit(0) if n == 1 else (diff % n)
            edge_cond = (mod == 0) & (F.dayofmonth(ts_scol) == F.dayofmonth(
                F.last_day(ts_scol)))

            truncated_ts_scol = F.date_trunc("MONTH", ts_scol)
            edge_label = truncated_ts_scol
            if left_closed and right_labeled:
                edge_label += sql_utils.makeInterval("MONTH", F.lit(n)._jc)
            elif right_closed and left_labeled:
                edge_label -= sql_utils.makeInterval("MONTH", F.lit(n)._jc)

            if left_labeled:
                non_edge_label = F.when(
                    mod == 0,
                    truncated_ts_scol -
                    sql_utils.makeInterval("MONTH",
                                           F.lit(n)._jc),
                ).otherwise(truncated_ts_scol -
                            sql_utils.makeInterval("MONTH", mod._jc))
            else:
                non_edge_label = F.when(mod == 0, truncated_ts_scol).otherwise(
                    truncated_ts_scol -
                    sql_utils.makeInterval("MONTH", (mod - n)._jc))

            return F.to_timestamp(
                F.last_day(
                    F.when(edge_cond, edge_label).otherwise(non_edge_label)))

        elif rule_code == "D":
            assert origin.hour == 0 and origin.minute == 0 and origin.second == 0

            if n == 1:
                # NOTE: the logic to process '1D' is different from the cases with n>1,
                # since hour/minute/second parts are taken into account to determine edges!
                edge_cond = ((F.hour(ts_scol) == 0) & (F.minute(ts_scol) == 0)
                             & (F.second(ts_scol) == 0))

                if left_closed and left_labeled:
                    return F.date_trunc("DAY", ts_scol)
                elif left_closed and right_labeled:
                    return F.date_trunc("DAY", F.date_add(ts_scol, 1))
                elif right_closed and left_labeled:
                    return F.when(edge_cond,
                                  F.date_trunc("DAY", F.date_sub(
                                      ts_scol, 1))).otherwise(
                                          F.date_trunc("DAY", ts_scol))
                else:
                    return F.when(edge_cond,
                                  F.date_trunc("DAY", ts_scol)).otherwise(
                                      F.date_trunc("DAY",
                                                   F.date_add(ts_scol, 1)))

            else:
                diff = F.datediff(end=ts_scol, start=origin_scol)
                mod = diff % n

                edge_cond = mod == 0

                truncated_ts_scol = F.date_trunc("DAY", ts_scol)
                edge_label = truncated_ts_scol
                if left_closed and right_labeled:
                    edge_label = F.date_add(truncated_ts_scol, n)
                elif right_closed and left_labeled:
                    edge_label = F.date_sub(truncated_ts_scol, n)

                if left_labeled:
                    non_edge_label = F.date_sub(truncated_ts_scol, mod)
                else:
                    non_edge_label = F.date_sub(truncated_ts_scol, mod - n)

                return F.when(edge_cond, edge_label).otherwise(non_edge_label)

        elif rule_code in ["H", "T", "S"]:
            unit_mapping = {"H": "HOUR", "T": "MINUTE", "S": "SECOND"}
            unit_str = unit_mapping[rule_code]

            truncated_ts_scol = F.date_trunc(unit_str, ts_scol)
            diff = sql_utils.timestampDiff(unit_str, origin_scol._jc,
                                           truncated_ts_scol._jc)
            mod = F.lit(0) if n == 1 else (diff % F.lit(n))

            if rule_code == "H":
                assert origin.minute == 0 and origin.second == 0
                edge_cond = (mod == 0) & (F.minute(ts_scol)
                                          == 0) & (F.second(ts_scol) == 0)
            elif rule_code == "T":
                assert origin.second == 0
                edge_cond = (mod == 0) & (F.second(ts_scol) == 0)
            else:
                edge_cond = mod == 0

            edge_label = truncated_ts_scol
            if left_closed and right_labeled:
                edge_label += sql_utils.makeInterval(unit_str, F.lit(n)._jc)
            elif right_closed and left_labeled:
                edge_label -= sql_utils.makeInterval(unit_str, F.lit(n)._jc)

            if left_labeled:
                non_edge_label = F.when(mod == 0, truncated_ts_scol).otherwise(
                    truncated_ts_scol -
                    sql_utils.makeInterval(unit_str, mod._jc))
            else:
                non_edge_label = F.when(
                    mod == 0,
                    truncated_ts_scol +
                    sql_utils.makeInterval(unit_str,
                                           F.lit(n)._jc),
                ).otherwise(truncated_ts_scol -
                            sql_utils.makeInterval(unit_str, (mod - n)._jc))

            return F.when(edge_cond, edge_label).otherwise(non_edge_label)

        else:
            raise ValueError("Got the unexpected unit {}".format(rule_code))
Example #24
    df_merge.src_id.alias('id'), df_merge.src_attr.alias('attr'),
    lit(True).alias('is_current'),
    lit(False).alias('is_deleted'),
    df_merge.src_start_date.alias('start_date'),
    df_merge.src_end_date.alias('end_date'))
# For records that need to be deleted
df_merge_p3 = df_merge.filter(
    df_merge.action == 'DELETE').select(column_names).withColumn(
        'is_current', lit(False)).withColumn('is_deleted', lit(True))
# For records that need to be expired and then inserted
df_merge_p4_1 = df_merge.filter(df_merge.action == 'UPSERT').select(
    df_merge.src_id.alias('id'), df_merge.src_attr.alias('attr'),
    lit(True).alias('is_current'),
    lit(False).alias('is_deleted'),
    df_merge.src_start_date.alias('start_date'),
    df_merge.src_end_date.alias('end_date'))
df_merge_p4_2 = df_merge.filter(df_merge.action == 'UPSERT').withColumn(
    'end_date', date_sub(df_merge.src_start_date, 1)).withColumn(
        'is_current', lit(False)).withColumn('is_deleted',
                                             lit(False)).select(column_names)
# Union all records together
df_merge_final = df_merge_p1.unionAll(df_merge_p2).unionAll(
    df_merge_p3).unionAll(df_merge_p4_1).unionAll(df_merge_p4_2)
df_merge_final.orderBy(['id', 'start_date']).show()
# Finally, you can overwrite the existing data using this new data frame.
# ...

# COMMAND ----------

Example #25
df.select(ltrim(lit(string_with_space)), rtrim(lit(string_with_space)),
          trim(lit(string_with_space))).show()

#regular expressions

#working with dates, timestamps
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp

dateDF = spark.range(10).withColumn("today", current_date()).withColumn(
    "now", current_timestamp())
dateDF.show()

dateDF.select(
    date_add(col("today"), 5).alias("today+5"),
    date_sub(col("today"), 5).alias("today-5")).show()

#convert string to date, default format is 'yyyy-MM-dd'
spark.range(1).select(
    to_date(lit("2019-02-01")).alias("start_date"),
    to_date(lit("2019-03-06")).alias("end_date")).select(
        datediff(col("start_date"), col("end_date"))).show()

cleanDateDF = spark.range(1).withColumn("date1", current_date())
date_format = 'yyyy-MM-dd'
cleanDateDF.select(to_timestamp(col("date1"), date_format)).show()

#working with nulls in data

#coalesce, returns first not null value from a set of columns
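# Illustrative sketch for the coalesce note above (not in the original snippet); it
# assumes an active SparkSession named `spark`.
from pyspark.sql.functions import coalesce, col, lit
null_df = spark.createDataFrame([(None, "b"), ("a", None), (None, None)], "c1 string, c2 string")
null_df.select(coalesce(col("c1"), col("c2"), lit("fallback")).alias("first_non_null")).show()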
Example #26
def udf_get_sunday(date):
    return F.date_sub(F.next_day(date, "sunday"), 7)
  .select("Description").show(3, False)


# COMMAND ----------

from pyspark.sql.functions import current_date, current_timestamp
dateDF = spark.range(10)\
  .withColumn("today", current_date())\
  .withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")


# COMMAND ----------

from pyspark.sql.functions import date_add, date_sub
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)


# COMMAND ----------

from pyspark.sql.functions import datediff, months_between, to_date
dateDF.withColumn("week_ago", date_sub(col("today"), 7))\
  .select(datediff(col("week_ago"), col("today"))).show(1)

dateDF.select(
    to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2017-05-22")).alias("end"))\
  .select(months_between(col("start"), col("end"))).show(1)


# COMMAND ----------
    fileLog("bc data has been read successfully")
    bc = bc.orderBy('customer_ID','meter_reading_date')
# discarding agreements that have only 1 meter_reading_date; also combining the step to find the max reading date per concat_agreement
    bc = bc.withColumn('CONCAT_AGMNT_NO',F.concat(F.col('CUSTOMER_ID'),F.col('ACCOUNT_SEQ'),F.col('AGREEMENT_SEQ')).cast(LongType()))
    grouped = bc.groupBy('CONCAT_AGMNT_NO').agg(count('meter_reading_date').alias('agr_count'),max('meter_reading_date').alias('Billing1_End_Date')).where(F.col('agr_count') > 1)
# join it with bc to get full column details
    df1 = bc.alias('df1')
    df2 = grouped.alias('df2')
    bc_req = df1.join(df2,['CONCAT_AGMNT_NO']).select('df1.*',df2.Billing1_End_Date)
    billingcycle = bc_req
# get max date for every agreement
    billingcycle = bc_req.groupBy('CONCAT_AGMNT_NO','customer_id','account_seq','agreement_seq').agg(max('meter_reading_date').alias('Billing1_End_Date'))
# generate last 13 billing cycles

    billingcycle = billingcycle.withColumn('Billing1_Start_Date',date_sub(F.col('Billing1_End_Date'),29))
    billingcycle = billingcycle.withColumn('Billing2_End_Date',date_sub(F.col('Billing1_Start_Date'),1))
    billingcycle = billingcycle.withColumn('Billing2_Start_Date',date_sub(F.col('Billing2_End_Date'),29))

    billingcycle = billingcycle.withColumn('Billing3_End_Date',date_sub(F.col('Billing2_Start_Date'),1))
    billingcycle = billingcycle.withColumn('Billing3_Start_Date',date_sub(F.col('Billing3_End_Date'),29))
    billingcycle = billingcycle.withColumn('Billing4_End_Date',date_sub(F.col('Billing3_Start_Date'),1))
    billingcycle = billingcycle.withColumn('Billing4_Start_Date',date_sub(F.col('Billing4_End_Date'),29))
    billingcycle = billingcycle.withColumn('Billing5_End_Date',date_sub(F.col('Billing4_Start_Date'),1))
    billingcycle = billingcycle.withColumn('Billing5_Start_Date',date_sub(F.col('Billing5_End_Date'),29))
    billingcycle = billingcycle.withColumn('Billing6_End_Date',date_sub(F.col('Billing5_Start_Date'),1))
    billingcycle = billingcycle.withColumn('Billing6_Start_Date',date_sub(F.col('Billing6_End_Date'),29))
    billingcycle = billingcycle.withColumn('Billing7_End_Date',date_sub(F.col('Billing6_Start_Date'),1))
    billingcycle = billingcycle.withColumn('Billing7_Start_Date',date_sub(F.col('Billing7_End_Date'),29))
    billingcycle = billingcycle.withColumn('Billing8_End_Date',date_sub(F.col('Billing7_Start_Date'),1))
    billingcycle = billingcycle.withColumn('Billing8_Start_Date',date_sub(F.col('Billing8_End_Date'),29))
    df2 = grouped.alias('df2')
    bc_req = df1.join(df2, ['CONCAT_AGMNT_NO']).select('df1.*',
                                                       df2.Billing1_End_Date)
    billingcycle = bc_req
    fileLog(
        "counts of unique aggreements in billing cycle before joining are: " +
        str(billingcycle.select('CONCAT_AGMNT_NO').distinct().count()))

    # get max date for every agreement
    billingcycle = bc_req.groupBy(
        'CONCAT_AGMNT_NO', 'customer_id', 'account_seq', 'agreement_seq').agg(
            max('meter_reading_date').alias('Billing1_End_Date'))
    # generate last 13 billing cycles

    billingcycle = billingcycle.withColumn(
        'Billing1_Start_Date', date_sub(F.col('Billing1_End_Date'), 29))
    billingcycle = billingcycle.withColumn(
        'Billing2_End_Date', date_sub(F.col('Billing1_Start_Date'), 1))
    billingcycle = billingcycle.withColumn(
        'Billing2_Start_Date', date_sub(F.col('Billing2_End_Date'), 29))
    billingcycle = billingcycle.withColumn(
        'Next_Billing_StartDate', date_add(F.col('Billing1_End_Date'), 1))
    billingcycle = billingcycle.withColumn(
        'Next_Billing_EndDate', date_add(F.col('Next_Billing_StartDate'), 29))

    billingcycle = billingcycle.select(
        'CONCAT_AGMNT_NO', 'CUSTOMER_ID', 'ACCOUNT_SEQ', 'AGREEMENT_SEQ',
        'Billing1_Start_Date', 'Billing1_End_Date', 'Billing2_Start_Date',
        'Billing2_End_Date', 'Next_Billing_StartDate', 'Next_Billing_EndDate')
    fileLog("generated bc1, bc2 and bc14 for prediction")
pivot_dt = date(2020, 12, 1)
first_dt =  date(2020, 10, 1)
# TODO create a function that constructs path2conf from the environment
path2conf = "../conf" # uncomment if run as py
# path2conf = "conf"

spark = create_spark_session('dataset_creation', n_executors=16, n_cores=8, executor_memory=32, driver_memory=64, )
partners = dict_from_file(f"{path2conf}/partners.json")
aggr = spark.table("sbx_t_team_mp_cmpn_ds.day_aggr")
aggr = aggr.where(F.col("report_dt") < last_dt)
aggr = aggr.where (F.col("client_city").isin(get_list_of_cities(partner, path2conf=path2conf)))
sales = spark.table("sbx_t_team_mp_cmpn_ds.dm_partner_sales")
sales = sales.where(F.col("partner_name") == partners[partner])
sales = sales.where((F.col("evt_dt") < last_dt) & (F.col("evt_dt") >= first_dt))
sales = sales.withColumn("target", F.lit(1))
sales = sales.withColumn("report_dt", F.date_sub('evt_dt', 3))
# sales = sales.withColumnRenamed("evt_dt", "report_dt")
sales = sales.select("epk_id", "report_dt", "target")

# TODO create a better algorithm for this join
dataset = aggr.join(sales, ['epk_id', "report_dt"], how="left")
print(dataset.select(F.sum("target").alias('amount_of')).show())
dataset = dataset.fillna({"target": 0})
dataset = dataset.where(F.col("report_dt").isNotNull())
train = dataset.where(F.col("report_dt") < pivot_dt).sampleBy("target", fractions={1: 0.05, 0: 0.95})
# sdf2cluster(train, f'{partner}_train')
spark.sql(f"drop table if exists sbx_t_team_mp_cmpn_ds.{partner}_train")
delete_folder_from_ps(f'{partner}_train')
save2ps(train, f'{partner}_train', partition="report_dt")
oot = dataset.where(F.col("report_dt") >= pivot_dt).sampleBy("target", fractions={1: 0.03, 0: 0.97})
# sdf2cluster(oot, f'{partner}_oot')