Example #1
def gen_summary(df, output_prefix=""):
    summary = {}

    string_cols = []
    boolean_cols = []
    numeric_cols = []
    other_cols = []

    for field in df.schema.fields:
        if isinstance(field.dataType, T.StringType):
            string_cols.append(field.name)
        elif isinstance(field.dataType, T.BooleanType):
            boolean_cols.append(field.name)
        elif isnumeric(field.dataType):
            numeric_cols.append(field.name)
        else:
            other_cols.append(field.name)

    counts = cardinalities(df, string_cols)
    uniques = likely_unique(counts)
    categoricals = unique_values(df, likely_categoricals(counts))

    for span in [2, 3, 4, 6, 12]:
        thecube = df.cube(
            "Churn",
            F.ceil(df.tenure / span).alias("%d_month_spans" % span), "gender",
            "Partner", "SeniorCitizen", "Contract", "PaperlessBilling",
            "PaymentMethod",
            F.ceil(F.log2(F.col("MonthlyCharges")) *
                   10).alias("log_charges")).count()
        therollup = df.rollup(
            "Churn",
            F.ceil(df.tenure / span).alias("%d_month_spans" % span),
            "SeniorCitizen", "Contract", "PaperlessBilling", "PaymentMethod",
            F.ceil(F.log2(F.col("MonthlyCharges")) *
                   10).alias("log_charges")).agg(
                       F.sum(F.col("TotalCharges")).alias("sum_charges"))
        thecube.write.mode("overwrite").parquet("%scube-%d.parquet" %
                                                (output_prefix, span))
        therollup.write.mode("overwrite").parquet("%srollup-%d.parquet" %
                                                  (output_prefix, span))

    encoding_struct = {
        "categorical": categoricals,
        "numeric": numeric_cols + boolean_cols,
        "unique": uniques
    }

    summary["schema"] = df.schema.jsonValue()
    summary["ecdfs"] = approx_ecdf(df, numeric_cols)
    summary["true_percentage"] = percent_true(df, boolean_cols)
    summary["encoding"] = encoding_struct
    summary["distinct_customers"] = df.select(df.customerID).distinct().count()

    return summary
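
A rough, self-contained sketch (tiny made-up data) of the cube/rollup aggregation pattern used above; column names mirror the telco fields in the example:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [("Yes", 5, 70.0), ("No", 14, 20.5), ("No", 27, 99.9)],
    ["Churn", "tenure", "MonthlyCharges"])

span = 6
# cube() counts every combination of the grouping columns (including subtotals),
# while rollup() produces hierarchical subtotals from left to right.
cube_counts = df.cube("Churn", F.ceil(df.tenure / span).alias("tenure_span")).count()
rollup_sums = df.rollup("Churn", F.ceil(df.tenure / span).alias("tenure_span")) \
    .agg(F.sum("MonthlyCharges").alias("sum_charges"))
cube_counts.show()
rollup_sums.show()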
Example #2
def casting_data(df):
    """
    Cast the measurement columns to double and the timestamp columns to timestamp,
    then derive unix-timestamp-based interval and key columns.
    """
    df = df.withColumn("X", df["X"].cast("double"))
    df = df.withColumn("Y", df["Y"].cast("double"))
    df = df.withColumn("Z", df["Z"].cast("double"))
    df = df.withColumn("TremorGA", df["TremorGA"].cast("double"))
    df = df.withColumn("BradykinesiaGA", df["BradykinesiaGA"].cast("double"))
    df = df.withColumn("DyskinesiaGA", df["DyskinesiaGA"].cast("double"))
    df = df.withColumn("TSStart", df["TSStart"].cast("timestamp"))
    df = df.withColumn("TSEnd", df["TSEnd"].cast("timestamp"))
    df = df.withColumn("interval_start", ((ceil(unix_timestamp(df["TSStart"]).cast("long")))%10**8)) 
    df = df.withColumn("interval_end", ((ceil(unix_timestamp(df["TSEnd"]).cast("long")))%10**8)) 
    df = df.withColumn("temp", utils_function_spark.find_milisec_udf('TS')) 
    df = df.withColumn("interval", (((unix_timestamp(df["TS"]).cast("long"))))) 
    df = df.withColumn("interval", utils_function_spark.merge_integers_udf('interval', 'temp'))
    df = df.withColumn("key", utils_function_spark.give_my_key_udf("interval_start", "interval_end", 'SubjectId') ) 
    df = df.withColumn("key", df["key"].cast("double"))
    
    return df
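
The chain of withColumn casts above can also be written as a loop over column lists; a sketch of that equivalent form (casting_data_compact is a hypothetical name, and only the double/timestamp casts are shown):

double_cols = ["X", "Y", "Z", "TremorGA", "BradykinesiaGA", "DyskinesiaGA"]
timestamp_cols = ["TSStart", "TSEnd"]

def casting_data_compact(df):
    # same double/timestamp casts as above, expressed as loops over column lists
    for c in double_cols:
        df = df.withColumn(c, df[c].cast("double"))
    for c in timestamp_cols:
        df = df.withColumn(c, df[c].cast("timestamp"))
    return df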
Example #3
    def generate_reports(self, from_time, to_time):
        ecg_data = self.get_monitoring_data(from_time, to_time)

        findspark.init()
        spark = SparkSession.builder.appName("ECGLearning").master(
            "local[*]").getOrCreate()
        spark.conf.set('spark.sql.session.timeZone', 'Asia/Kolkata')
        os.makedirs(os.path.join(self.write_path, 'public'), exist_ok=True)

        # Create data frame
        ecg_data_rdd = spark.sparkContext.parallelize(ecg_data)
        schema = StructType([
            StructField('time', IntegerType(), True),
            StructField('tps', StringType(), True)
        ])
        tps_df = spark.createDataFrame(ecg_data_rdd, schema)
        tps_df = tps_df.withColumn("tps", tps_df["tps"].cast("float"))
        tps_df = tps_df.withColumn("tps", F.ceil(tps_df["tps"]))
        tps_df = tps_df.withColumn(
            "time", F.from_unixtime(tps_df["time"], "yyyy/MM/dd HH:mm:ss"))

        # Downloading the current file from blob container
        get_data_from_blob(
            Path(self.write_path).joinpath('public', self.csv_file_name))
        current_blob_df = spark.read.csv(os.path.join(self.write_path,
                                                      'public',
                                                      self.csv_file_name),
                                         header=True)
        current_blob_df = current_blob_df.withColumn(
            "tps", current_blob_df["tps"].cast("int"))
        current_blob_df = current_blob_df.union(tps_df)
        current_blob_df = current_blob_df.dropDuplicates(["time"])
        current_blob_df = current_blob_df.sort("time")

        # remove the first day's data from the 7-day window
        current_blob_df = self.remove_last_day(current_blob_df)

        os.makedirs(os.path.join(self.write_path, 'public'), exist_ok=True)
        current_blob_df.toPandas().to_csv(os.path.join(self.write_path,
                                                       'public',
                                                       self.csv_file_name),
                                          index=False)
        create_json(
            os.path.join(self.write_path, 'public', self.csv_file_name), True)

        # Uploading updated data to Azure blob container
        write_data_to_blob(self.write_path,
                           os.path.join('public', self.csv_file_name))
        write_data_to_blob(self.write_path,
                           os.path.join('public', self.json_file_name))

        spark.stop()
Example #4
def round_up_cents(df: DataFrame,
                   column: str,
                   precision: int = 2) -> DataFrame:
    """
    Rounds up a single column to a given precision and returns the resulting DataFrame

        Parameters:
            df (DataFrame): A pyspark DataFrame
            column (str): The column that the transformation should be applied to
            precision (int): number of digits after the decimal point to round up to (default: 2)
    """
    return df.withColumn(column,
                         ceil(df[column] * 10**precision) / 10**precision)
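
A brief usage sketch with made-up amounts (ceil comes from pyspark.sql.functions, as the function above assumes):

from pyspark.sql import SparkSession
from pyspark.sql.functions import ceil

spark = SparkSession.builder.master("local[*]").getOrCreate()
prices = spark.createDataFrame([(1.231,), (2.5,), (3.0,)], ["amount"])
round_up_cents(prices, "amount").show()
# 1.231 -> 1.24, 2.5 -> 2.5, 3.0 -> 3.0 at the default precision of 2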
Example #5
def add_index_eventTs(date_1, df, index_hrs_day):
    ## date to timestamp at midnight; timegm treats the time tuple as UTC
    date_1_ts = calendar.timegm(date_1.timetuple())
    ## time difference from midnight 00:00:00, in seconds
    df = df.withColumn("time_diff_mid", df.eventTs - F.lit(date_1_ts))
    # adjustment for data-dump inaccuracies:
    # filter out pings whose offset is not within 1-86400 seconds (between() includes both bounds)
    df = df.filter(F.col("time_diff_mid").between(1, 86400))
    # df = df.repartition(200)  # kept off to reduce shuffles
    ## bucket size, e.g. 4 hours -> 3600 * 4 seconds
    index_sec = 3600 * index_hrs_day
    # create the index column, e.g. 1 to 6 for 4-hour buckets
    df = df.withColumn("index", F.ceil(df.time_diff_mid / F.lit(index_sec)))
    df = df.drop("time_diff_mid")
    return df
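
A short usage sketch with hypothetical ping timestamps, showing how 4-hour buckets come out of the ceil division:

from datetime import date
import calendar
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
day = date(2021, 1, 1)
midnight_ts = calendar.timegm(day.timetuple())
# two pings: one hour after midnight and thirteen hours after midnight
pings = spark.createDataFrame(
    [(midnight_ts + 3600,), (midnight_ts + 13 * 3600,)], ["eventTs"])
# with index_hrs_day=4 the first ping lands in index 1, the second in index 4
add_index_eventTs(day, pings, index_hrs_day=4).show()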
Example #6
def grouped_map_pandas_udf(spark):
    @pandas_udf(
        returnType="id long, v double", functionType=PandasUDFType.GROUPED_MAP
    )  # functionType: an enum value in pyspark.sql.functions.PandasUDFType, Default SCALAR
    def subtract_mean(pdf):
        v = pdf.v  # pdf is a pandas.DataFrame
        return pdf.assign(v=v - v.mean())  # assign() adds a new column or overwrites an existing one

    @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP)  # id,v是自定义的列名
    def mean_udf(key, pdf):
        # key is a tuple of one numpy.int64, which is the value of 'id' for the current group
        return pd.DataFrame([key + (pdf['v'].mean(), )])

    @pandas_udf("id long, `ceil(v / 2)` long, v double",
                PandasUDFType.GROUPED_MAP)
    def sum_udf(key, pdf):
        # key is a tuple of two numpy.int64s, which is the values of 'id' and 'ceil(df.v / 2)' for the current group
        return pd.DataFrame([key + (pdf['v'].sum(), )])

    df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0),
                                (2, 10.0)], ("id", "v"))
    df.groupBy("id").apply(subtract_mean).show()
    # +---+----+
    # | id|   v|
    # +---+----+
    # |  1|-0.5|
    # |  1| 0.5|
    # |  2|-3.0|
    # |  2|-1.0|
    # |  2| 4.0|
    # +---+----+

    df.groupBy('id').apply(mean_udf).show()
    # +---+---+
    # | id|  v|
    # +---+---+
    # |  1|1.5|
    # |  2|6.0|
    # +---+---+

    df.groupBy('id', ceil(df['v'] / 2)).apply(sum_udf).show()  # ceil returns the smallest integer >= the given expression
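
On Spark 3.0+ the same grouped-map pattern is typically written with applyInPandas rather than a GROUPED_MAP pandas_udf; a rough equivalent of subtract_mean (assuming an existing spark session):

import pandas as pd

def subtract_mean_fn(pdf: pd.DataFrame) -> pd.DataFrame:
    # plain Python function; the output schema is passed to applyInPandas
    return pdf.assign(v=pdf.v - pdf.v.mean())

df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v"))
df.groupBy("id").applyInPandas(subtract_mean_fn, schema="id long, v double").show()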
Example #7
def get_filtered_by_week(data: DataFrame) -> DataFrame:
    """
    Method transforms periods "start_date - end_date" to year and week number of year

    source:
    +---+----------+----------+
    |key|start_date|  end_date|
    +---+----------+----------+
    |  5|2018-01-01|2018-01-09|
    +---+----------+----------+

    result:
    +---+----------+----------+
    |key|      year|  week_num|
    +---+----------+----------+
    |  5|      2018|         1|
    |  5|      2018|         2|
    +---+----------+----------+
    """

    max_week_number = 53
    transformed_data = data \
        .withColumn('start_week', F.weekofyear('start_date')) \
        .withColumn('weeks_diff', F.ceil(F.datediff(F.col('end_date'),
                                                    F.col('start_date')) / 7)) \
        .withColumn("year", F.year("start_date")) \
        .withColumn("repeat", F.expr("split(repeat(',', weeks_diff), ',')")) \
        .select("*", F.posexplode("repeat").alias("week_add", "val")) \
        .withColumn('total_week_num', F.col('start_week') + F.col('week_add')) \
        .withColumn('add_year', (F.col('total_week_num') / max_week_number).cast(IntegerType())) \
        .withColumn('total_week_num', F.col('total_week_num') - (max_week_number * F.col('add_year'))) \
        .withColumn('week_num',
                    F.when(F.col('total_week_num') == 0, 1)
                    .otherwise(F.col('total_week_num'))) \
        .withColumn('year', F.col('year') + F.col('add_year')) \
        .drop('start_date', 'end_date', 'week_add', 'repeat',
              'val', 'date', 'add_year', 'weeks_diff', 'total_week_num') \
        .dropDuplicates()

    return transformed_data
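
A short usage sketch that reproduces the docstring's source row (assuming the same imports as the function above):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
source = spark.createDataFrame(
    [(5, "2018-01-01", "2018-01-09")],
    ["key", "start_date", "end_date"])
get_filtered_by_week(source).show()
# expected rows (per the docstring): (5, 2018, 1) and (5, 2018, 2)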
Example #8
 def _estimate_workload(sketch_l, sketch_r):
     for i in range(len(hash_udfs)):
         sketch_l = sketch_l.withColumnRenamed(str(i), 'l' + str(i))
         sketch_r = sketch_r.withColumnRenamed(str(i), 'r' + str(i))
     sketch = sketch_l.join(sketch_r, 'hash_val')
     sketch.cache()
     sum_cols = [
         F.sum((F.col('l' + str(i)) * F.col('r' + str(i))))
         for i in range(len(hash_udfs))
     ]
     total_load = sketch.select(sum_cols).first()
     workloads = [l for l in enumerate(total_load)]
     workloads.sort(key=lambda x: x[1])
     median_idx = workloads[int(len(hash_udfs) / 2)][0]
     median_load_total = workloads[int(len(hash_udfs) / 2)][1]
     avg_load = int(median_load_total / n_workers)
     median_load = sketch.select('hash_val',
                                 (F.col('l' + str(median_idx)) * F.col('r' + str(median_idx))).alias('load')) \
         .withColumn('avg_load', F.lit(avg_load)) \
         .withColumn('n_part', F.ceil(F.col('load') / F.col('avg_load'))) \
         .select('hash_val', 'n_part')
     return hash_udfs[median_idx], median_load
Example #9
def run_dm_dc_order(output_str, info_str, stock_date, run_date, start_date,
                    end_date, log_file, sqlc):
    print_output(
        f"Load DM items and DC for DM that starts between {start_date} and {end_date}",
        log_file)

    dm_item_dc_sql = \
        """
        SELECT distinct ndt.dm_theme_id,
            ndt.theme_start_date,
            ndt.theme_end_date,
            del.npp,
            del.ppp,
            del.ppp_start_date,
            del.ppp_end_date,
            del.dept_code,
            dcid.holding_code,
            dcid.risk_item_unilever,
            dcid.primary_ds_supplier as ds_supplier_code,
            cast(dcid.qty_per_unit as int) as pcb,
            dcid.rotation,
            dcid.qty_per_unit,
            icis.item_id,
            icis.sub_id,
            icis.item_code,
            icis.sub_code,
            icis.date_key AS run_date,
            fdo.first_order_date AS past_result
        FROM vartefact.forecast_nsa_dm_extract_log del
        JOIN ods.nsa_dm_theme ndt ON del.dm_theme_id = ndt.dm_theme_id
        JOIN ods.p4md_stogld ps ON del.city_code = ps.stocity
        JOIN vartefact.forecast_item_code_id_stock icis ON icis.date_key = '{0}'
            AND del.item_code = CONCAT (
                icis.dept_code,
                icis.item_code
                )
            AND del.sub_code = icis.sub_code
            AND del.dept_code = icis.dept_code
        JOIN vartefact.forecast_dc_item_details dcid ON dcid.item_code =icis.item_code
            AND dcid.sub_code = icis.sub_code
            AND dcid.dept_code = icis.dept_code
            AND dcid.rotation != 'X'
            AND dcid.dc_status != 'Stop'
            AND dcid.item_type not in ('New','Company Purchase','Seasonal')
        JOIN vartefact.forecast_store_item_details id ON ps.stostocd = id.store_code
            AND dcid.dept_code = id.dept_code
            AND dcid.item_code = id.item_code
            AND dcid.sub_code = id.sub_code
        LEFT JOIN vartefact.forecast_dm_dc_orders fdo ON ndt.dm_theme_id = fdo.dm_theme_id
            AND icis.dept_code = fdo.dept_code
            AND icis.item_code = fdo.item_code
            AND icis.sub_code = fdo.sub_code
        WHERE del.extract_order >= 40
            AND del.date_key = '{1}'
            AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') >= to_timestamp('{2}', 'yyyyMMdd')
            AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') < to_timestamp('{3}', 'yyyyMMdd')
        """.replace("\n", " ")

    dm_item_dc_sql = dm_item_dc_sql.format(stock_date.strftime("%Y%m%d"),
                                           run_date.strftime("%Y%m%d"),
                                           start_date.strftime("%Y%m%d"),
                                           end_date.strftime("%Y%m%d"))

    dm_item_dc_df = sqlc.sql(dm_item_dc_sql)

    # # Exclude the DM that already have orders

    print_output("Exclude the DM that already have orders", log_file)

    dm_item_dc_df = dm_item_dc_df.filter("past_result is null")

    output_line = f"After filtering already calculated DM {dm_item_dc_df.count()}"

    print_output(output_line, log_file)
    output_str = output_str + output_line + ","

    # # Only consider the nearest DM

    first_dc_dm = dm_item_dc_df. \
        groupBy(['item_id', 'sub_id']). \
        agg(F.min("theme_start_date").alias("theme_start_date"))

    dm_item_dc_df = dm_item_dc_df.join(
        first_dc_dm, ['item_id', 'sub_id', 'theme_start_date'])

    dm_item_dc_cnt = dm_item_dc_df.count()

    print_output(f"After getting only first DM in DC {dm_item_dc_cnt}",
                 log_file)
    output_str = output_str + f"After getting only first DM in DC {dm_item_dc_cnt}," + ","

    if dm_item_dc_cnt == 0:
        run_date_str = run_date.strftime("%Y%m%d")
        print_output(
            f"skip date {run_date_str} cause no active order opportunity for today",
            log_file)
        info_str = info_str + f"Job Finish:{get_current_time()},"
        info_str = info_str + f"skip date {run_date_str} cause no active order opportunity for today"

        return

    output_line = f"Number of item that will have DM order in DC {dm_item_dc_df.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line + ","

    dm_item_dc_df.cache()

    dm_item_dc_df.createOrReplaceTempView("dm_item_dc")

    # +
    dc_order_sql = \
        """
        SELECT distinct dis.item_id,
            dis.sub_id,
            ord.date_key AS first_order_date,
            dev.date_key AS first_delivery_date
        FROM dm_item_dc dis
        JOIN vartefact.forecast_dc_order_delivery_mapping dodm
            ON dis.holding_code = dodm.con_holding
            AND dis.risk_item_unilever = dodm.risk_item_unilever
        JOIN vartefact.forecast_calendar ord
            ON ord.date_key = dodm.order_date
        JOIN vartefact.forecast_calendar dev
            ON dev.weekday_short = dodm.delivery_weekday and dev.week_index = ord.week_index + dodm.week_shift
        WHERE to_timestamp(ord.date_key, 'yyyyMMdd') >= to_timestamp(dis.ppp_start_date, 'yyyy-MM-dd')
            AND dev.date_key <= '{0}'
            AND dis.rotation != 'X'
        """.replace("\n", " ")

    dc_order_sql = dc_order_sql.format(end_date.strftime("%Y%m%d"))

    # +
    dc_order_deliver_df = sqlc.sql(dc_order_sql)

    dc_first_order_df = dc_order_deliver_df.groupBy(['item_id', 'sub_id']). \
        agg(F.min("first_order_date").alias("first_order_date"))

    dc_first_order_deliver_df = dc_order_deliver_df \
        .select(['item_id', 'sub_id', 'first_order_date', 'first_delivery_date']) \
        .join(dc_first_order_df, ['item_id', 'sub_id', 'first_order_date'])
    # -

    dm_item_dc_order_df = dm_item_dc_df \
        .join(dc_first_order_deliver_df, \
              ['item_id', 'sub_id'])

    dm_item_dc_order_df.createOrReplaceTempView("dm_item_dc_order")

    dm_store_to_dc_sql = \
        """
        select 
          dm.item_id,
          dm.sub_id,
          dm.holding_code,
          dm.theme_start_date,
          dm.theme_end_date,
          dm.npp,
          dm.ppp,
          dm.ppp_start_date,
          dm.ppp_end_date,
          dm.dept_code,
          dm.item_code,
          dm.sub_code,
          dm.pcb,
          dm.ds_supplier_code,
          dm.rotation,
          dm.run_date,
          dm.first_order_date,
          dm.first_delivery_date,
          sum(sod.regular_sales_before_dm) as regular_sales_before_dm,
          sum(sod.four_weeks_after_dm) as four_weeks_after_dm,
          sum(sod.dm_sales) as dm_sales,
          sum(sod.order_qty) as dm_order_qty_without_pcb,
          dm.dm_theme_id
        FROM 
            vartefact.forecast_dm_orders sod
        JOIN dm_item_dc_order dm
            on sod.item_id = dm.item_id
            and sod.sub_id = dm.sub_id
            and sod.dm_theme_id = dm.dm_theme_id
        GROUP BY
          dm.dm_theme_id,
          dm.item_id,
          dm.sub_id,
          dm.holding_code,
          dm.theme_start_date,
          dm.theme_end_date,
          dm.npp,
          dm.ppp,
          dm.ppp_start_date,
          dm.ppp_end_date,
          dm.dept_code,
          dm.item_code,
          dm.sub_code,
          dm.pcb,
          dm.ds_supplier_code,
          dm.rotation,
          dm.run_date,
          dm.first_order_date,
          dm.first_delivery_date
        """.replace("\n", " ")

    dm_dc_order = sqlc.sql(dm_store_to_dc_sql)

    dm_dc_pcb = dm_dc_order \
        .withColumn("dm_order_qty",
                    F.when(dm_dc_order.dm_order_qty_without_pcb > 0.0,
                           F.ceil(dm_dc_order.dm_order_qty_without_pcb / dm_dc_order.pcb) * dm_dc_order.pcb)
                    .otherwise(int(0)))

    dm_dc_pcb.createOrReplaceTempView("dm_dc_final")

    output_line = f"Number of DM DC orders {dm_dc_pcb.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line

    print_output("Write DC order to datalake", log_file)

    dm_dc_sql = \
        """
        INSERT INTO vartefact.forecast_dm_dc_orders
        PARTITION (dm_theme_id)
        SELECT 
          item_id,
          sub_id,
          holding_code,
          theme_start_date,
          theme_end_date,
          npp,
          ppp,
          ppp_start_date,
          ppp_end_date,
          dept_code,
          item_code,
          sub_code,
          pcb,
          ds_supplier_code,
          rotation,
          run_date,
          first_order_date,
          first_delivery_date,
          regular_sales_before_dm,
          four_weeks_after_dm,
          dm_sales,
          dm_order_qty,
          dm_order_qty_without_pcb,
          dm_theme_id
        FROM dm_dc_final
        """.replace("\n", " ")

    # +
    sqlc.sql(dm_dc_sql)

    sqlc.sql("refresh table vartefact.forecast_dm_dc_orders")

    info_str = info_str + f"Job Finish:{get_current_time()}"
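
The order-quantity rounding above multiplies F.ceil(qty / pcb) back by pcb to round a raw quantity up to the nearest multiple of the pack size; a tiny stand-alone illustration with made-up numbers:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
orders = spark.createDataFrame([(7.0, 6), (12.0, 6), (0.0, 6)], ["qty_without_pcb", "pcb"])
orders.withColumn(
    "qty",
    F.when(orders.qty_without_pcb > 0.0,
           F.ceil(orders.qty_without_pcb / orders.pcb) * orders.pcb)
    .otherwise(0)).show()
# 7 -> 12, 12 -> 12, 0 -> 0 (quantities rounded up to a multiple of pcb)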
Example #10
def run_dm_store_order(output_str, info_str, stock_date, run_date, start_date,
                       end_date, log_file, sqlc):

    print_output(
        f"Load DM items and stores for DM that starts between {start_date} and {end_date}",
        log_file)

    dm_item_store_sql = \
        """
        SELECT distinct ndt.dm_theme_id,
            ndt.theme_start_date,
            ndt.theme_end_date,
            del.npp,
            del.ppp,
            del.ppp_start_date,
            del.ppp_end_date,
            del.city_code,
            id.store_code,
            del.dept_code,
            id.con_holding,
            id.risk_item_unilever,
            cast(id.qty_per_unit as int) as pcb,
            id.dc_supplier_code,
            id.ds_supplier_code,
            id.rotation,
            icis.item_id,
            icis.sub_id,
            icis.item_code,
            icis.sub_code,
            icis.date_key AS run_date,
            fdo.first_order_date AS past_result
        FROM vartefact.forecast_nsa_dm_extract_log del
        JOIN ods.nsa_dm_theme ndt ON del.dm_theme_id = ndt.dm_theme_id
        JOIN ods.p4md_stogld ps ON del.city_code = ps.stocity
        JOIN vartefact.forecast_store_item_details id ON ps.stostocd = id.store_code
            AND del.item_code = CONCAT (
                id.dept_code,
                id.item_code
                )
            AND del.sub_code = id.sub_code
            AND del.dept_code = id.dept_code
            AND id.store_status != 'Stop'
            AND id.item_type not in ('New','Company Purchase','Seasonal')
        JOIN vartefact.forecast_item_code_id_stock icis ON icis.date_key = '{0}'
            AND id.item_code = icis.item_code
            AND id.sub_code = icis.sub_code
            AND id.dept_code = icis.dept_code
            AND id.store_code = icis.store_code
        LEFT JOIN vartefact.forecast_dm_orders fdo ON ndt.dm_theme_id = fdo.dm_theme_id
            AND icis.dept_code = fdo.dept_code
            AND icis.item_code = fdo.item_code
            AND icis.sub_code = fdo.sub_code
            AND icis.store_code = fdo.store_code
        WHERE del.extract_order >= 40
            AND del.date_key = '{1}'
            AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') >= to_timestamp('{2}', 'yyyyMMdd')
            AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') < to_timestamp('{3}', 'yyyyMMdd')
        """.replace("\n", " ")

    dm_item_store_sql = dm_item_store_sql.format(stock_date.strftime("%Y%m%d"),
                                                 run_date.strftime("%Y%m%d"),
                                                 start_date.strftime("%Y%m%d"),
                                                 end_date.strftime("%Y%m%d"))

    # # Exclude the DM that already have orders

    dm_item_store_df = sqlc.sql(dm_item_store_sql)

    print_output(
        f"Number of DM item stores in date range {dm_item_store_df.count()}",
        log_file)

    print_output("Exclude the DM that already have orders", log_file)

    dm_item_store_df = dm_item_store_df.filter("past_result is null")

    output_line = f"After filtering already calculated DM {dm_item_store_df.count()}"

    print_output(output_line, log_file)
    output_str = output_str + output_line + ","

    # # Only consider the nearest DM

    first_dm = dm_item_store_df. \
        groupBy(['item_id', 'sub_id', 'store_code']). \
        agg(F.min("theme_start_date").alias("theme_start_date"))

    dm_item_store_df = dm_item_store_df.join(
        first_dm, ['item_id', 'sub_id', 'store_code', 'theme_start_date'])

    dm_item_store_cnt = dm_item_store_df.count()

    print_output(f"After getting only first DM {dm_item_store_cnt}", log_file)
    output_str = output_str + f"After getting only first DM {dm_item_store_cnt}," + ","

    if dm_item_store_cnt == 0:
        run_date_str = run_date.strftime("%Y%m%d")
        print_output(
            f"skip date {run_date_str} cause no active order opportunity for today",
            log_file)
        info_str = info_str + f"Job Finish:{get_current_time()},"
        info_str = info_str + f"skip date {run_date_str} cause no active order opportunity for today"

        return

    dm_item_store_df.write.mode("overwrite").format("parquet").saveAsTable(
        "vartefact.tmp_dm_item_store")

    dm_item_store_df.createOrReplaceTempView("dm_item_store")

    # # The first order day within PPP period
    print_output("Get first order day within PPP period", log_file)
    onstock_order_sql = \
        """
        SELECT dis.item_id,
            dis.sub_id,
            dis.store_code,
            ord.date_key AS first_order_date,
            dev.date_key AS first_delivery_date
        FROM dm_item_store dis
        JOIN vartefact.forecast_onstock_order_delivery_mapping mp ON dis.dept_code = mp.dept_code
            AND dis.rotation = mp.rotation
            AND dis.store_code = mp.store_code
        JOIN vartefact.forecast_calendar ord ON ord.iso_weekday = mp.order_iso_weekday
        JOIN vartefact.forecast_calendar dev ON dev.iso_weekday = mp.delivery_iso_weekday
            AND dev.week_index = ord.week_index + mp.week_shift
        WHERE to_timestamp(ord.date_key, 'yyyyMMdd') >= to_timestamp(dis.ppp_start_date, 'yyyy-MM-dd')
            AND to_timestamp(dev.date_key, 'yyyyMMdd') >= date_add(to_timestamp(dis.theme_start_date, 'yyyy-MM-dd'), -7)
            AND dev.date_key <= '{0}'
        """.replace("\n", " ")

    onstock_order_sql = onstock_order_sql.format(end_date.strftime("%Y%m%d"))

    onstock_order_deliver_df = sqlc.sql(onstock_order_sql)

    xdock_order_sql = \
        """
        SELECT dis.item_id,
            dis.sub_id,
            dis.store_code,
            ord.date_key AS first_order_date,
            date_format(
                date_add(
                    to_timestamp(dodm.delivery_date, 'yyyyMMdd'), xo.dc_to_store_time
                    ),
                'yyyyMMdd'
            ) AS first_delivery_date
        FROM dm_item_store dis
        JOIN vartefact.forecast_xdock_order_mapping xo ON dis.item_code = xo.item_code
            AND dis.sub_code = xo.sub_code
            AND dis.dept_code = xo.dept_code
            AND dis.store_code = xo.store_code
        JOIN vartefact.forecast_calendar ord ON ord.iso_weekday = xo.order_iso_weekday
        JOIN vartefact.forecast_dc_order_delivery_mapping dodm ON dodm.con_holding = dis.con_holding
            AND dodm.order_date = ord.date_key
            AND dis.risk_item_unilever = dodm.risk_item_unilever
        WHERE to_timestamp(ord.date_key, 'yyyyMMdd') >= to_timestamp(dis.ppp_start_date, 'yyyy-MM-dd')
            AND date_add(to_timestamp(dodm.delivery_date, 'yyyyMMdd'), xo.dc_to_store_time)  <= to_timestamp('{0}', 'yyyyMMdd')
        """.replace("\n", " ")

    xdock_order_sql = xdock_order_sql.format(end_date.strftime("%Y%m%d"))

    xdock_order_deliver_df = sqlc.sql(xdock_order_sql)

    order_deliver_df = onstock_order_deliver_df.union(xdock_order_deliver_df)

    order_deliver_df.cache()

    first_order_df = order_deliver_df.groupBy(['item_id', 'sub_id', 'store_code']). \
        agg(F.min("first_order_date").alias("first_order_date"))

    first_order_df.cache()

    first_order_deliver_df = order_deliver_df \
        .join(first_order_df, ['item_id', 'sub_id', 'store_code', 'first_order_date'])
    # -

    dm_item_store_order_df = dm_item_store_df \
        .join(first_order_deliver_df, ['item_id', 'sub_id', 'store_code'])

    dm_item_store_order_df.createOrReplaceTempView("dm_item_store_order")

    output_line = f"Number of item stores that will have DM {dm_item_store_order_df.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line + ","

    # # Get DM sales prediction

    dm_sales_predict_sql = \
        """
        select 
          dm.*,
          cast(coalesce(pred.sales_prediction, '0', pred.sales_prediction) as double) as dm_sales,
          coalesce(pred.sales_prediction, 'no', 'yes') as having_dm_prediction
       from 
            dm_item_store_order dm
        left join vartefact.forecast_weekly_dm_view pred
            on cast(pred.item_id as int) = dm.item_id
            and cast(pred.sub_id as int) = dm.sub_id
            and cast(pred.current_dm_theme_id as int) = dm.dm_theme_id
            and pred.store_code = dm.store_code
        """.replace("\n", " ")

    dm_prediction = sqlc.sql(dm_sales_predict_sql)

    dm_prediction.createOrReplaceTempView("dm_prediction")

    dm_prediction.filter("having_dm_prediction = 'no' ") \
        .write.mode("overwrite").format("parquet") \
        .saveAsTable("vartefact.forecast_no_dm_prediction")

    output_line = f"Number of DM sales prediction {dm_prediction.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line + ","

    print_output("Regular sales before DM", log_file)

    # # Regular sales from first order day to DM start day
    dm_regular_sales_sql = \
        """
        SELECT dp.item_id,
            dp.sub_id,
            dp.store_code,
            dp.dm_theme_id,
            case when
              fcst.daily_sales_prediction_original < 0.2 and dp.rotation != 'A'
            then 0
            when
              fcst.daily_sales_prediction_original < 0
            then 0
            else fcst.daily_sales_prediction_original 
            end AS sales_prediction
        FROM vartefact.t_forecast_daily_sales_prediction fcst
        JOIN dm_prediction dp ON fcst.item_id = dp.item_id
            AND fcst.sub_id = dp.sub_id
            AND fcst.store_code = dp.store_code
            AND fcst.date_key > dp.first_delivery_date
            AND to_timestamp(fcst.date_key, 'yyyyMMdd') < to_timestamp(dp.theme_start_date, 'yyyy-MM-dd')
        """.replace("\n", " ")

    dm_regular_sales = sqlc.sql(dm_regular_sales_sql)
    # -

    agg_dm_regular_sales = dm_regular_sales.groupBy(['item_id', 'sub_id', 'store_code', 'dm_theme_id']). \
        agg(F.sum("sales_prediction").alias("regular_sales_before_dm"))

    dm_with_regular = dm_prediction.join(
        agg_dm_regular_sales,
        ['item_id', 'sub_id', 'store_code', 'dm_theme_id'], "left")

    # # For ppp <= 90% npp, get 4 weeks after sales for ROTATION A items
    print_output("DM PPP logic", log_file)

    after_fourweek_sql = \
        """
        SELECT dp.item_id,
            dp.sub_id,
            dp.store_code,
            dp.dm_theme_id,
            case when
              fcst.daily_sales_prediction_original < 0.2 and dp.rotation != 'A'
            then 0
            when
              fcst.daily_sales_prediction_original < 0
            then 0
            else fcst.daily_sales_prediction_original 
            end AS sales_prediction
        FROM dm_prediction dp
        JOIN vartefact.t_forecast_daily_sales_prediction fcst ON fcst.item_id = dp.item_id
            AND fcst.sub_id = dp.sub_id
            AND fcst.store_code = dp.store_code
            AND to_timestamp(fcst.date_key, 'yyyyMMdd') > to_timestamp(dp.theme_end_date, 'yyyy-MM-dd')
            AND to_timestamp(fcst.date_key, 'yyyyMMdd') < date_add(to_timestamp(dp.theme_end_date, 'yyyy-MM-dd'), 28)
        WHERE dp.rotation = 'A'
            AND dp.ppp <= dp.npp * 0.9
        """.replace("\n", " ")

    after_fourweek_sales = sqlc.sql(
        after_fourweek_sql.format(run_date.strftime("%Y%m%d")))

    agg_after_fourweek_sales = after_fourweek_sales.groupBy(['item_id', 'sub_id', 'store_code', 'dm_theme_id']). \
        agg(F.sum("sales_prediction").alias("four_weeks_after_dm"))

    output_line = f"Number of DM having PPP {agg_after_fourweek_sales.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line + ","

    dm_with_fourweek = dm_with_regular.join(
        agg_after_fourweek_sales,
        ['item_id', 'sub_id', 'store_code', 'dm_theme_id'], "left")

    # # Fill NA

    dm_with_fourweek = dm_with_fourweek.na.fill(0)
    dm_with_fourweek.cache()

    output_line = f"Number of DM store orders {dm_with_fourweek.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line

    # # Final calculation

    print_output("Calculate order quantity", log_file)
    dm_final = dm_with_fourweek.withColumn(
        "dm_order_qty_without_pcb", dm_with_fourweek.regular_sales_before_dm +
        dm_with_fourweek.four_weeks_after_dm + dm_with_fourweek.dm_sales)

    dm_final = dm_final \
        .withColumn("first_dm_order_qty_without_pcb",
                    F.when(dm_final.rotation != 'X', 0.75 * dm_final.dm_order_qty_without_pcb)
                    .otherwise(dm_final.dm_order_qty_without_pcb))

    dm_final = dm_final \
        .withColumn("first_dm_order_qty",
                    F.when(dm_final.first_dm_order_qty_without_pcb > 0.0,
                           F.ceil(dm_final.first_dm_order_qty_without_pcb / dm_final.pcb) * dm_final.pcb)
                    .otherwise(int(0)))

    dm_final_pcb = dm_final \
        .withColumn("dm_order_qty",
                    F.when(dm_final.dm_order_qty_without_pcb > 0.0,
                           F.ceil(dm_final.dm_order_qty_without_pcb / dm_final.pcb) * dm_final.pcb)
                    .otherwise(int(0)))

    dm_final_pcb.createOrReplaceTempView("dm_final_pcb")

    print_output("Write store order to datalake", log_file)
    dm_sql = \
        """
        INSERT INTO vartefact.forecast_dm_orders
        PARTITION (dm_theme_id)
        SELECT 
            item_id,
            sub_id,
            store_code,
            con_holding,
            theme_start_date,
            theme_end_date,
            npp,
            ppp,
            ppp_start_date,
            ppp_end_date,
            city_code,
            dept_code,
            item_code,
            sub_code,
            pcb,
            dc_supplier_code,
            ds_supplier_code,
            rotation,
            run_date,
            first_order_date,
            first_delivery_date,
            regular_sales_before_dm,
            four_weeks_after_dm,
            dm_sales,
            dm_order_qty,
            first_dm_order_qty,
            dm_order_qty_without_pcb,
            dm_theme_id
        FROM dm_final_pcb
        """.replace("\n", " ")

    if dm_item_store_cnt > 0:
        sqlc.sql(dm_sql)

    sqlc.sql("refresh table vartefact.forecast_dm_orders")

    print_output("Finish writing store order to datalake", log_file)
Example #11
File: P4_spark.py  Project: Rymond3/CLO
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, avg, ceil
import string

spark = SparkSession.builder.master("local[2]").appName(
    "Ratings").getOrCreate()

df = spark.read.csv("ratings.csv")

dfsel = df.select(col("_c1").alias("id"), col("_c2").alias("rating"))

df_avg = dfsel.groupBy("id").agg(avg(col("rating")))

df_final = df_avg.select(
    ceil(col("avg(rating)")).alias("RatingRange"),
    col("id")).sort("RatingRange", ascending=True).rdd

rdd = df_final.map(lambda x: ("Range " + str(x["RatingRange"]), x["id"]))

rdd.saveAsTextFile("output4.txt")
Example #12
def compile_ceil(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    return F.ceil(src_column)
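
For reference, the F.ceil call this rule compiles down to maps each value to the smallest integer greater than or equal to it; a minimal direct usage sketch:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.createDataFrame([(1.2,), (2.0,), (-0.5,)], ["v"]) \
    .select(F.ceil("v").alias("ceil_v")).show()
# 1.2 -> 2, 2.0 -> 2, -0.5 -> 0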
Example #13
    def process_data(self, study_dt):
        ##############################################################################
        # DECLARE VARIABLES
        ##############################################################################

        dt_range = self.study_dates(study_dt)
        dt = dt_range
        s1_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata'
        s1_initial_bucket_depth = 'cuebiq/daily-feed/US/'
        s1_bucket_output = 'cuebiq/daily-feed-reduced/US/'
        s2_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata'
        s2_initial_bucket_depth = 'cuebiq/daily-feed-reduced/US/'
        s2_bucket_output = 'cuebiq/processed-data/US/micro-clusters/'
        anchor_dist = 430
        time_thresh = 28800
        part_num = 9

        gps_schema = StructType([
            StructField("utc_timestamp", IntegerType(), True),
            StructField("device_id", StringType(), True),
            StructField("os", IntegerType(), True),
            StructField("latitude", FloatType(), True),
            StructField("longitude", FloatType(), True),
            StructField("accuracy", IntegerType(), True),
            StructField("tz_offset", IntegerType(), True)
        ])

        s2_gps_schema = StructType([
            StructField("utc_timestamp", IntegerType(), True),
            StructField("device_id", StringType(), True),
            StructField("os", IntegerType(), True),
            StructField("latitude", FloatType(), True),
            StructField("longitude", FloatType(), True),
            StructField("accuracy", IntegerType(), True),
            StructField("tz_offset", IntegerType(), True),
            StructField("row_number", IntegerType(), True)
        ])

        ##############################################################################
        # WINDOWS
        ##############################################################################
        w = Window().partitionBy('device_id').orderBy('utc_timestamp')
        l = Window().partitionBy('device_id',
                                 'lin_grp').orderBy('utc_timestamp')
        w2 = Window().partitionBy('device_id').orderBy('row_number')

        ##############################################################################
        # BEGIN DAILY ITERATION
        ##############################################################################

        print("Reading in files for {}".format(str(dt['study_dt'])[:10]))
        print("s3://{}/{}[{}|{}|{}]/*.gz".format(s1_bucket_name,
                                                 s1_initial_bucket_depth,
                                                 dt['s3_before'],
                                                 dt['s3_study_dt'],
                                                 dt['s3_after']))
        print("")

        #################################################################################################
        # START STEP 1
        #################################################################################################
        df1 = dataFrameReader \
            .options(header = 'false', delimiter = '\t', codec = 'gzip') \
            .schema(gps_schema) \
            .format("csv") \
            .load("/opt/spark/sample_data/daily-feed/US/2020729*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth +  dt['s3_before'] +"/*.gz") # the day before

        df2 = dataFrameReader \
            .options(header = 'false', delimiter = '\t', codec = 'gzip') \
            .schema(gps_schema) \
            .format("csv") \
            .load("/opt/spark/sample_data/daily-feed/US/2020730*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth +  dt['s3_study_dt'] +"/*.gz") # actual study date

        df3 = dataFrameReader \
            .options(header = 'false', delimiter = '\t', codec = 'gzip') \
            .schema(gps_schema) \
            .format("csv") \
            .load("/opt/spark/sample_data/daily-feed/US/2020731*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth +  dt['s3_after'] +"/*.gz") # the day after

        # Union data from three inputs into 1 dataframe
        df = df1.union(df2).union(df3) \
            .repartition(part_num, 'device_id')

        del df1
        del df2
        del df3

        ##############################################################################
        # FILTER INITIAL JUNK RECORDS
        # Removes duplicated records (based on time and id), poor accuracy, bad coordinates, and timestamps outside of study range
        ##############################################################################
        df = df.na.drop(subset=['latitude','longitude','tz_offset','accuracy']) \
                    .filter(((df['accuracy'] >= 5) & (df['accuracy'] <= 65)) \
                            & ((~(df['latitude'] == 0)) | ~(df['longitude'] == 0)) \
                            & (df['utc_timestamp'] + df['tz_offset']) \
                                    .between(dt['utc_study_dt'], dt['utc_after'])) \
                    .dropDuplicates(['utc_timestamp','device_id'])

        ##############################################################################
        # EXCESSIVE SPEED REMOVAL
        ##############################################################################
        df = df.withColumn('dist_to',distance(df['latitude'], df['longitude'], lead(df['latitude'],1).over(w), \
                            lead(df['longitude'],1).over(w))) \
            .withColumn('sec_to', (lead(df['utc_timestamp'], 1).over(w) - df['utc_timestamp'])) \
            .withColumn('speed_to', rate_of_speed(col('dist_to'), col('sec_to'),'hour')) \
            .withColumn('dist_from', lag(col('dist_to'), 1).over(w)) \
            .withColumn('sec_from', lag(col('sec_to'), 1).over(w)) \
            .withColumn('speed_from', lag(col('speed_to'), 1).over(w)) \
            .filter(((col('dist_to').isNull()) | (col('dist_from').isNull())) \
                        | ((((col('speed_from') + col('speed_to')) / 2) <= 90) | ((col('dist_to') >= 150) | (col('dist_from') >= 150))) \
                        & ((col('speed_from') < 600) & (col('speed_to') < 600)) \
                        & ((col('speed_from') < 20) | (col('speed_to') < 20))) \
            .select('utc_timestamp', 'device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset')

        ##############################################################################
        # LINEAR TRAVEL PING REMOVAL
        # Break pings out into groups of 4 to measure the linear distance
        ##############################################################################
        #Assign a record number and linear grouping and lead distance
        df = df.withColumn('RecordNum',row_number().over(w)) \
            .withColumn('lin_grp', py.ceil(row_number().over(w) / 4)) \
            .withColumn('dist_to', distance(df['latitude'], df['longitude'], \
                lead(df['latitude'],1).over(l), lead(df['longitude'],1).over(l),'meters'))

        # Create aggregated table for linear groupings
        expr = [py.min(col('utc_timestamp')).alias('min_utc_timestamp'),py.max(col('utc_timestamp')).alias('max_utc_timestamp'), \
            py.count(col('utc_timestamp')).alias('cnt'),py.sum(col('dist_to')).alias('sum_dist'),py.min(col('dist_to')).alias('min_dist')]

        dfl_grp = df.groupBy('device_id', 'lin_grp').agg(*expr)

        dfl_grp.createOrReplaceTempView('dfl_grp')
        df.createOrReplaceTempView('dfl')

        # Grab just the first and last records in each linear grouping and append aggregated info
        dfls = spark.sql(
            "SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \
                    a.lin_grp, b.sum_dist, b.min_dist, b.cnt \
                    FROM dfl as a INNER JOIN dfl_grp as b \
                    ON a.device_id = b.device_id \
                    AND a.lin_grp = b.lin_grp \
                    AND a.utc_timestamp = b.min_utc_timestamp \
                    UNION ALL \
                    SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \
                    a.lin_grp, b.sum_dist, b.min_dist, b.cnt \
                    FROM dfl as a INNER JOIN dfl_grp as b \
                    ON a.device_id = b.device_id \
                    AND a.lin_grp = b.lin_grp \
                    AND a.utc_timestamp = b.max_utc_timestamp")

        # Measure the distance between first and last in each linear grouping and compare to sum distance of all points
        # Only keep groups that meet criteria for being straight-line
        df_j = dfls.withColumn('strt_dist', distance(dfls['latitude'],dfls['longitude'], \
                    lead(dfls['latitude'],1).over(l), \
                    lead(dfls['longitude'],1).over(l), 'meters')) \
                .withColumn('lin',col('strt_dist') / dfls['sum_dist']) \
                .na.drop(subset=['strt_dist']) \
                .filter((dfls['min_dist'] > 0)  \
                    & (col('strt_dist').between(150, 2000)) \
                    & (dfls['cnt'] == 4) \
                    & (col('lin') >= .99825)) \
                .select('device_id','lin_grp', 'lin')

        # Outer join main dataframe to linears groups to filter non-linear pings
        df = df.join(df_j, ['device_id','lin_grp'], how='left_outer') \
            .filter(col('lin').isNull()) \
            .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset')

        del dfl_grp
        del dfls
        del df_j

        #######################################
        # CHAIN
        # Calculating the dynamic chain threshold to find proximate ping relationships
        #######################################
        df = df.withColumn('chain_dist', ((((df['accuracy'] + lead(df['accuracy'],1).over(w)) - 10) * (230 / 120) + 200))) \
            .withColumn('chain', when((distance(df['latitude'], df['longitude'], \
                            lead(df['latitude'],1).over(w), lead(df['longitude'], 1).over(w),'feet')) <= col('chain_dist'), 1)
                            .when((distance(df['latitude'], df['longitude'], \
                            lag(df['latitude'],1).over(w), lag(df['longitude'], 1).over(w),'feet')) <= lag(col('chain_dist'), 1).over(w), 1)) \
            .filter(col('chain') == 1) \
            .withColumn('row_number', row_number().over(w)) \
            .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset','row_number') \
            .persist()

        df \
            .repartition(100,'device_id').sortWithinPartitions('device_id','row_number') \
            .write \
            .csv(path="/opt/spark/sample_data/daily-feed-reduced/"+dt['s3_study_dt'], mode="append", compression="gzip", sep=",")
        #.csv(path="s3://" + s1_bucket_name + '/' + s1_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")

        ##############################################################################################
        # START STEP 2
        ##############################################################################################

        print('Begin micro-clustering')

        # INITIALIZE ANCHOR TABLE - Create initial anchor start points based on row number = 1 and distance threshold
        self.df_dist = df.withColumn('tz_timestamp', df['utc_timestamp'] + df['tz_offset']) \
                        .withColumn('anchor', when(df['row_number'] == 1, col('tz_timestamp')) \
                                .when(distance(df['latitude'], df['longitude'], \
                                                lag(df['latitude'],1).over(w2),lag(df['longitude'],1).over(w2),'feet') \
                                            >= anchor_dist, col('tz_timestamp')) \
                                .when(col('tz_timestamp') - lag(col('tz_timestamp'),1).over(w2) >= time_thresh, col('tz_timestamp'))) \
                        .select('tz_timestamp','device_id','os','latitude','longitude','accuracy','row_number','anchor') \
                        .repartition(part_num, 'device_id') \
                        .persist()

        print('df_dist starting count = {}'.format(
            self.df_dist.count()))  # Materialize table for caching

        df.unpersist()
        del df

        #####################################################################################################
        # ITERATE THROUGH DATAFRAME ANCHOR PROCESS - iterations are broken out to speed up checkpointing
        # Checkpointing is used to chop off the physical plans of the dataframes that grow with each iteration
        ######################################################################################################
        df_anchor1 = self.anchor_func(3, 3)
        df_anchor2 = self.anchor_func(5, 5)
        df_anchor3 = self.anchor_func(12, 6)
        df_anchor4 = self.anchor_func(20, 5)
        df_anchor5 = self.anchor_func(30, 5)
        df_anchor6 = self.anchor_func(50, 5)
        df_anchor7 = self.anchor_func(80, 5, 1000000)
        df_anchor8 = self.anchor_func(1000, 5, 1000000)

        ##################################################################################################
        # Collect remaining pings to driver for Python analysis
        print('collect remaining pings')
        anchor_list = self.df_dist.rdd.map(lambda row: {'timestamp':row[0], 'device_id':row[1], 'latitude':row[3], \
                                                'longitude':row[4], 'anchor':row[7]}).collect()

        # Sort elements in list by device_id and timestamp
        anchor_list.sort(key=operator.itemgetter('device_id', 'timestamp'))

        # Python analysis on driver of final remaining pings
        print('iterate through remaining pings on driver')
        anchor_dr = []

        for r in anchor_list:
            if r['anchor'] is not None:
                anchor_dr.append(r)

            else:
                if anchor_dr[-1]['device_id'] == r['device_id']:
                    # plain Python comparison on the driver, so combine the two conditions with "and"
                    if distance_dr(r['latitude'], r['longitude'], \
                                anchor_dr[-1]['latitude'], \
                                anchor_dr[-1]['longitude'], 'feet') <= anchor_dist \
                                and r['timestamp'] - anchor_dr[-1]['timestamp'] < time_thresh:
                        anchor_dr.append({'timestamp':r['timestamp'], 'device_id':r['device_id'], \
                                        'latitude':anchor_dr[-1]['latitude'], 'longitude':anchor_dr[-1]['longitude'], \
                                        'anchor':anchor_dr[-1]['anchor']})

                    else:
                        r['anchor'] = r['timestamp']
                        anchor_dr.append(r)

        # Condense result table for dataframe distribution
        print('generate driver anchor table')
        new_anchor = []
        for r in anchor_dr:
            new_anchor.append([r['timestamp'], r['device_id'], r['anchor']])

        # Bring driver results back into a distributed dataframe and join results
        print('disperse driver anchor table back to cluster')
        new_anchor_schema = StructType([
            StructField('tz_timestamp', IntegerType(), True),
            StructField('device_id', StringType(), True),
            StructField('anchor', IntegerType(), True)
        ])

        df_anchor_dr = spark.createDataFrame(new_anchor,new_anchor_schema) \
                        .repartition(part_num, 'device_id')

        # Join remaining anchors to main analysis table
        self.df_dist = self.df_dist.select('tz_timestamp','device_id','os','latitude','longitude', \
                                'accuracy','row_number') \
                            .join(df_anchor_dr, ['tz_timestamp', 'device_id'])

        # Union all anchor tables together and sort
        print('finalizing anchor results into central table')
        df_anchors_fnl = df_anchor1.union(df_anchor2).union(df_anchor3).union(df_anchor4).union(df_anchor5) \
                            .union(df_anchor6).union(df_anchor7).union(df_anchor8).union(self.df_dist) \
                            .repartition(part_num,'device_id') \
                            .persist()

        self.df_dist.unpersist()

        #######################################################################################
        # Calculate centroids
        #######################################################################################
        print('start calculating centroids')
        # Get max accuracy value for each micro-cluster and filter clusters with fewer than 2 pings
        df_anchor_grp = df_anchors_fnl.groupBy('device_id','anchor').agg(*[py.max(col('accuracy')).alias('max_accuracy'), \
                                                                        py.count(col('tz_timestamp')).alias('cnt')]) \
                                    .withColumn('max_acc_1', col('max_accuracy') + 1) \
                                    .filter(col('cnt') > 1) \
                                    .select('device_id','anchor','max_acc_1','cnt')

        # Calculate the nominator for each micro-cluster
        df_anchors_fnl = df_anchors_fnl.join(df_anchor_grp, ['device_id','anchor']) \
                                        .withColumn('nom',col('max_acc_1') - col('accuracy'))

        df_denom = df_anchors_fnl.groupBy(
            'device_id', 'anchor').agg(*[py.sum(col('nom')).alias('denom')])


        df_anchors_fnl = df_anchors_fnl.join(df_denom, ['device_id','anchor']) \
                            .withColumn('weight', df_anchors_fnl['nom'] / df_denom['denom']) \
                            .withColumn('lat', df_anchors_fnl['latitude'] * col('weight')) \
                            .withColumn('lon', df_anchors_fnl['longitude'] * col('weight'))


        expr = [py.sum(col('lat')).alias('new_latitude'), py.sum(col('lon')).alias('new_longitude'), \
                    py.avg(col('latitude')).alias('avg_latitude'), py.avg(col('longitude')).alias('avg_longitude'), \
                    py.count(col('tz_timestamp')).alias('cluster_png_cnt'), py.first(col('os')).alias('os'), \
                    py.min(col('tz_timestamp')).alias('start_timestamp'), py.max(col('tz_timestamp')).alias('end_timestamp'), \
                    py.avg(col('accuracy')).alias('avg_accuracy')]

        df_micro = df_anchors_fnl.groupBy('device_id','anchor').agg(*expr) \
                                .withColumn('fnl_lat', (col('new_latitude') * (3/4)) + (col('avg_latitude') * (1/4))) \
                                .withColumn('fnl_lon', (col('new_longitude') * (3/4)) + (col('avg_longitude') * (1/4))) \
                                .withColumn('geohash9', geohash_udf_9(col('fnl_lat'), col('fnl_lon'))) \
                                .withColumn('dwell_seconds', col('end_timestamp') - col('start_timestamp')) \
                                .withColumn('start_tm', py.from_unixtime(col('start_timestamp'))) \
                                .withColumn('end_tm', py.from_unixtime(col('end_timestamp'))) \
                                .filter(col('dwell_seconds') > 1) \
                                .select('device_id','os','start_tm','end_tm', \
                                        'dwell_seconds','cluster_png_cnt', col('fnl_lat').alias('latitude'), \
                                        col('fnl_lon').alias('longitude'), 'geohash9', 'avg_accuracy')


        df_micro \
                .repartition(100,'device_id').sortWithinPartitions('device_id','start_tm') \
                .write \
                .csv(path="/opt/spark/sample_data/processed-data/" + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")
        #.csv(path="s3://" + s2_bucket_name + '/' + s2_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")

        df_anchors_fnl.unpersist()

        return
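
The lin_grp expression above (py.ceil(row_number().over(w) / 4)) buckets consecutive pings into groups of four; a compact stand-alone sketch of just that step with made-up data:

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[*]").getOrCreate()
w = Window.partitionBy("device_id").orderBy("utc_timestamp")
pings = spark.createDataFrame(
    [("a", t) for t in range(1, 10)], ["device_id", "utc_timestamp"])
pings.withColumn("lin_grp", F.ceil(F.row_number().over(w) / 4)).show()
# rows 1-4 -> group 1, rows 5-8 -> group 2, row 9 -> group 3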
Example #14
grid = ParamGridBuilder()\
						 .addGrid(gbt.maxIter, [20, 50, 100])\
						 .addGrid(gbt.stepSize, [0.1, 0.2])\
						 .build()
cv = CrossValidator(estimator = pipeline, estimatorParamMaps = grid, evaluator = evaluator, numFolds = 5)

model = cv.fit(addingColTraining)
bestFitness = max(model.avgMetrics)
print('best fitness = ', bestFitness)
bestModel = model.bestModel
#bestModel.save('trainning_model_version3')
#model = PipelineModel.load('trainning_model2')
print("type model = ",bestModel)
print(bestModel.stages[2].explainParam('maxDepth'))
filePath = 'test.csv'
customSchema = StructType([StructField('PassengerId', IntegerType(), False),
						   StructField('PClass', StringType(), True),
						   StructField('Name', StringType(), False),
						   StructField('Sex', StringType(), True),
						   StructField('Age', FloatType(), True),
						   StructField('SibSb', StringType(), True),
						   StructField('Parch', StringType(), True),
						   StructField('Ticket', StringType(), True),
						   StructField('Fare', FloatType(), True),
						   StructField('Cabin', StringType(), True),
						   StructField('Embarked', StringType(), True)])
rawTesting = spark.read.csv(filePath, header = True, schema = customSchema)
selectedTesting = rawTesting.select('PassengerId', 'PClass', 'Sex', 'Age', 'Fare')
addingColTesting = selectedTesting.withColumn('Missing_Age', selectedTesting['Age'].isNull()).withColumn('Missing_Fare', selectedTesting['Fare'].isNull())

result = model.transform(addingColTesting).select('PassengerId', ceil(col('prediction')).alias('Survived'))
result.write.csv('output_version_rf.csv', header = True, mode = 'overwrite')
Example #15
0
def get_faces(annotate_host_probability=True, annotate_in_commerical=True):
    global _faces_cached
    if annotate_host_probability and annotate_in_commerical:
        if _faces_cached is not None:
            return _faces_cached

    faces = spark.load('query_face').alias('faces')

    videos = get_videos()
    frames = get_frames()
    haircolors = get_hair_colors()
    hairlengths = get_hair_lengths()
    clothing = get_clothing()
    faces = faces.join(
        frames, faces.frame_id == frames.id
    ).join(
        videos, frames.video_id == videos.id
    ).where(
        (videos.corrupted == False) & (videos.duplicate == False)
    ).join(
        haircolors.where(haircolors.labeler_id == Labeler.objects.get(name='haotian-hairstyle').id), 
        faces.id == haircolors.face_id, 'left_outer'
    ).join(
        hairlengths.where(hairlengths.labeler_id == Labeler.objects.get(name='haotian-hairstyle').id), 
        faces.id == hairlengths.face_id, 'left_outer'
    ).join(
        clothing.where(clothing.labeler_id == Labeler.objects.get(name='haotian-clothing').id), 
        faces.id == clothing.face_id, 'left_outer'
    ).select(
        'faces.*',
        videos.show_id,
        videos.canonical_show_id,
        videos.channel_id,
        videos.time,
        videos.fps,
        videos.week_day,
        videos.threeyears_dataset,
        frames.video_id,
        frames.number,
        haircolors.color_id.alias('haircolor_id'),
        clothing.clothing_id.alias('clothing_id'),
        hairlengths.length_id.alias('hairlength_id')
    ).where(
        ((videos.threeyears_dataset == True) & (frames.number % func.floor(videos.fps * 3) == 0)) | \
        ((videos.threeyears_dataset == False) & (frames.number % func.ceil(videos.fps * 3) == 0))
    )

    faces = faces.withColumn('height', faces.bbox_y2 - faces.bbox_y1)
    faces = faces.withColumn('width', faces.bbox_x2 - faces.bbox_x1)
    faces = faces.withColumn('area', faces.height * faces.width)
    faces = faces.withColumn('duration', func.lit(3))
    faces = faces.withColumn('min_frame', faces.number)
    faces = faces.withColumn('max_frame', faces.number + func.floor(faces.fps * 3) - 1)

    faces = _annotate_hour(faces)
    
    if annotate_in_commerical:
        faces = _annotate_in_commercial(faces)

    if annotate_host_probability:
        host_probs = get_host_probs()
        faces = faces.join(
            host_probs, faces.id == host_probs.face_id, 'left_outer'
        ).select(*faces.columns, host_probs.host_probability)
        faces = faces.na.fill({'host_probability': 0.})
       
    if annotate_host_probability and annotate_in_commerical:
        _faces_cached = faces
    
    return faces
Example #16
0
kernal = data_kernal_group.join(data_kernal_mean,data_kernal_group.date==data_kernal_mean.date).drop(data_kernal_mean.date)
kernal = kernal.withColumn('dates', F.date_format('date', 'yyyy-MM-dd')).withColumn('hours', F.date_format('date', 'HH'))


Aggregate time into one-minute intervals and join all the features


# assign time scale in order to aggregate data into it 
#time_interval = 60
#start_timestep = 1435708800 - 7200 # 2015-07-01 00:00:00 2 hours difference 
#data = (data
#        .withColumn('timestep', F.ceil((F.unix_timestamp('dt')-sc._jsc.startTime())/time_interval))
#        .drop('radar_id')
#        )
# assign each location to the cells index
track_grid_x = track_grid.withColumn('x_categories', F.ceil((F.col('position_x') - min_lon)/interval_lon))
data = track_grid_x.withColumn('y_categories', F.ceil((F.col('position_y') - min_lat)/interval_lat))
data = data.fillna(0).drop('radar_id')
data = data.withColumn("location_index",F.concat(data.y_categories,data.x_categories)).drop('x_categories').drop('y_categories')
# join the attribute features
data_count=data.groupBy('location_index', 'dt').count()
attribute=data.groupBy('location_index', 'dt').mean('position_x','position_y',
                                                     'velocity','airspeed',
                                                     'heading','heading_vertical',
                                                     'peak_mass','mass','mass_correction')
cond = [data_count.location_index == attribute.location_index, 
        data_count.dt == attribute.dt]
data_grid = (attribute.join(data_count, cond, 'inner')
                 .drop(attribute.dt)
                 .drop(attribute.location_index)
             )
Example #17
0
    def linear_filter(self):
        print(
            "\n_______________________________________________\nLINEAR MOVEMENT FILTER\n\n"
        )

        init_cnt = self.df.count()

        # Create various partitions and sortings for downstream window functions
        w = Window().partitionBy('device_id',
                                 'study_dt').orderBy('utc_timestamp')
        l = Window().partitionBy('device_id', 'study_dt',
                                 'lin_grp').orderBy('utc_timestamp')

        # Number of pings to analyze in a group to determine linearity
        lgrp = 4

        self.df = self.df.withColumn('RecordNum',row_number().over(w)) \
            .withColumn('lin_grp', py.ceil(row_number().over(w) / lgrp)) \
            .withColumn('dist_to', distance(self.df['latitude'], self.df['longitude'], \
                  lead(self.df['latitude'],1).over(l), lead(self.df['longitude'],1).over(l),'meters')) \
            .withColumn('sequence', row_number().over(l))

        # Create aggregated table for linear groupings
        expr = [py.min(col('utc_timestamp')).alias('min_utc_timestamp'), \
                py.max(col('utc_timestamp')).alias('max_utc_timestamp'), \
                py.count(col('utc_timestamp')).alias('cnt'), \
                py.sum(col('dist_to')).alias('sum_dist'), \
                py.min(col('dist_to')).alias('min_dist')]

        # Measure the distance between the first and last ping in each linear grouping and compare it to the summed distance of all points
        df_grp = self.df.groupBy('device_id', 'study_dt', 'lin_grp').agg(*expr)
        df_l = self.df.filter(self.df['sequence'].isin([1, lgrp])).join(
            df_grp, ['device_id', 'study_dt', 'lin_grp'])

        # Only keep groups that meet criteria for being straight-line
        df_j = df_l.withColumn('strt_dist', distance(df_l['latitude'],df_l['longitude'], \
                    lead(df_l['latitude'],1).over(l), \
                    lead(df_l['longitude'],1).over(l), 'meters')) \
                .withColumn('lin', col('strt_dist') / df_l['sum_dist']) \
                .na.drop(subset=['strt_dist']) \
                .filter((df_l['min_dist'] > 0)  \
                    & (col('strt_dist').between(150, 2000)) \
                    & (df_l['cnt'] == 4) \
                    & (col('lin') >= .99825)) \
                .select('device_id','lin_grp', 'lin')
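        # Illustrative straightness check: if the lgrp = 4 pings in a group span 400 m
        # end-to-end (strt_dist) while the summed hop distances (sum_dist) are 400.5 m,
        # lin = 400 / 400.5 ~= 0.9988 >= 0.99825, so the group counts as straight-line
        # movement and its pings are dropped by the join below.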

        # Outer join main dataframe to the linear groups and keep only non-linear pings (linear groups are dropped)
        self.df = self.df.join(df_j, ['device_id','lin_grp'], how='left_outer') \
            .filter(col('lin').isNull()) \
            .drop('lin_grp', 'RecordNum', 'dist_to', 'sequence', 'lin')

        #lin_cnt = self.df.cache().count()
        lin_cnt = self.df.count()

        tbl_data = [['Initial count', init_cnt, 0, 0, 'Count of pings before applying linear movement filter'], \
           ['Final count', lin_cnt, init_cnt - lin_cnt, ((init_cnt - lin_cnt) / float(init_cnt)) * 100, \
            'Count of pings after applying linear movement filter']]

        # Display filter table
        print(tabulate(tbl_data, floatfmt=".2f", headers=['Phase', 'Ping Count', 'Removed Pings', \
                                                          'Percent Reduction', 'Description']))
Example #18
0
for key in function_dict.keys():
    f = F.udf(function_dict[key][0], check_type(function_dict[key][0]))
    df = df.withColumn('%s' % key,
                       f(*[F.col(x) for x in function_dict[key][1]]))

#df.show()
print(df.dtypes)

# statistics

agg_interval = 900000000  # microseconds, so 15 mins
#agg_interval = 604800000000 # 1 week in mus
ts_col = F.col('timestamp')
columns = df.columns[1:]
df = df.withColumn('floor', (F.floor(ts_col/agg_interval) * agg_interval))\
    .withColumn('ceiling', (F.ceil(ts_col/agg_interval) * agg_interval)).orderBy(F.col('floor'))
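# e.g. with agg_interval = 900000000 us (15 min), a timestamp of 1000000000 us gets
# floor = 900000000 and ceiling = 1800000000, the bounds of the window it falls in.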
#df.show()
#print(df.dtypes)
column = 'add100_sa'

print(columns)
# mean, median, std,
agg_df = df.groupBy('floor').agg(F.sum(column), F.min(column),
                                 F.max(column))  #, F.count(column),
#F.kurtosis(column), F.mean(column), F.skewness(column),
#F.stddev(column), F.variance(column))
# Count NaN values per aggregated column (candidates for dropping)
dropcols = agg_df.select(
    [F.count(F.when(F.isnan(F.col(c)), c)).alias(c) for c in agg_df.columns])
dropcols.show()

agg_df = agg_df.drop('ceiling').dropna(how='all').drop_duplicates()
from pyspark import SparkConf, SparkContext
import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

conf = SparkConf().setMaster('local[1]').setAppName('movie')
sc = SparkContext(conf = conf)
spark = SparkSession(sc)

text_file = spark.read.csv(
    "ratings.csv", 
    header=True
)

avg_df = text_file.groupBy("movieId").agg(sf.avg("rating").alias("avg_rating"))
range_df = avg_df.groupBy(sf.ceil("avg_rating").alias("Range")).agg(sf.collect_list("movieId").alias("list_of_movieId"))
changedTypedf = range_df.withColumn("list_of_movieId", range_df["list_of_movieId"].cast("string"))
changedTypedf.repartition(1).write.option("header",True).csv("output_q24")
Example #20
0
def create_covid_time_series(spark, input_df, column_offset, total_column_name, delta_column_name, include_state):
    '''
    Create and return a time series of Covid-19 data by county

    Parameters:
    spark (SparkSession): Spark session used to create the output dataframe
    input_df (DataFrame): Source Covid-19 data, either from a previous cleaning step, or loaded from disk
    column_offset (Integer): Index of the first date column in input_df; columns from this index onward hold the daily values
    total_column_name (String): Column name for the total value (case or death) in each row
    delta_column_name (String): Column name for the delta value (case or death) in each row compared to the previous day
    include_state (Boolean): Do we want to include the state column in this dataframe?

    Returns:
    output_df (DataFrame): Spark dataframe containing a time series of Covid-19 data by county
    '''
    
    print(f"Started creating Covid-19 time series data for '{total_column_name}' and '{delta_column_name}'")
    
    unix_time = pd.Timestamp("1970-01-01")
    second = pd.Timedelta('1s')
    
    date_list = [(pd.to_datetime(c) - unix_time) // second for c in input_df.columns[column_offset:]]

    time_data_columns = input_df.columns[column_offset:]
    time_data_columns.insert(0, 'fips')
        
    if include_state:
        time_data_columns.insert(1, 'state')

    time_series = []

    def extract_county_data_including_state(row):
        fips = time_data_columns[0]
        state = time_data_columns[1]
        for i in range(2, len(time_data_columns)):
            time_series.append((row[fips], row[state], date_list[i - 2], row[time_data_columns[i]]))

    def extract_county_data_excluding_state(row):
        fips = time_data_columns[0]
        for i in range(1, len(time_data_columns)):
            time_series.append((row[fips], date_list[i - 1], row[time_data_columns[i]]))

    
    if include_state:
        for row in input_df.collect():
            extract_county_data_including_state(row)
    else:
        for row in input_df.collect():
            extract_county_data_excluding_state(row)

    time_series_columns = ["fips", "timestamp", total_column_name]
    
    if include_state:
        time_series_columns.insert(1, 'state')

    output_df = spark.createDataFrame(time_series, time_series_columns)

    windowSpec = Window \
        .partitionBy(output_df['fips']) \
        .orderBy(output_df['timestamp'].asc())

    output_df = output_df.withColumn('lag', F.lag(output_df[total_column_name], 1).over(windowSpec))
    output_df = output_df.withColumn('lead', F.lead(output_df[total_column_name], 1).over(windowSpec))

    # Populate deltas
    output_df = output_df.withColumn(delta_column_name, \
        F.when(output_df['lag'].isNull(), 0) \
        .otherwise(output_df[total_column_name] - output_df['lag']))

    output_df = output_df.withColumn('next_delta', F.lead(output_df[delta_column_name], 1).over(windowSpec))

    # Fix overreporting
    output_df = output_df.withColumn(total_column_name, \
        F.when((output_df['next_delta'] >= 0) | (output_df['lag'].isNull() | (output_df['lead'].isNull())), output_df[total_column_name]) \
        .otherwise(F.ceil((output_df['lead'] + output_df['lag']) / 2)))
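    # Worked example with illustrative totals 100, 150, 120 on consecutive days:
    # for the middle day lag = 100, lead = 120 and next_delta = 120 - 150 = -30 < 0,
    # so its total is treated as overreported and replaced by ceil((120 + 100) / 2) = 110.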

    # Recalculate deltas
    output_df = output_df.withColumn('lag', F.lag(output_df[total_column_name], 1).over(windowSpec))
    output_df = output_df.withColumn(delta_column_name, \
        F.when(output_df['lag'].isNull(), 0) \
        .otherwise(output_df[total_column_name] - output_df['lag']))

    output_df = output_df.drop('lag').drop('lead').drop('next_delta')
    
    print(f"Finished creating Covid-19 time series data for '{total_column_name}' and '{delta_column_name}'")
    
    return output_df
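# A hypothetical call (the dataframe, offset and column names below are assumptions,
# not taken from the source):
# cases_ts = create_covid_time_series(spark, cases_df, column_offset=4,
#                                     total_column_name='total_cases',
#                                     delta_column_name='new_cases', include_state=True)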
Example #21
0
#    try:
#        median = np.median(values_list) #get the median of values in a list in each row
#        return round(float(median),2)
#    except Exception:
#        return None #if there is anything wrong with the given values


def find_median(values_list):
    median = np.median(
        values_list)  #get the median of values in a list in each row
    return round(float(median), 2)


median_finder = f.udf(find_median, FloatType())

df_complete = df_complete.withColumn("Xbar", f.ceil(median_finder("Xbar")))
df_complete = df_complete.withColumn("MRbar", f.ceil(median_finder("MRbar")))
df_complete = df_complete.withColumn(
    "UCL_Individual", df_complete.Xbar + (f.lit(2.66) * df_complete.MRbar))
df_complete = df_complete.withColumn(
    "LCL_Individual", df_complete.Xbar - (f.lit(2.66) * df_complete.MRbar))


def _is_outlier(cumsum, ucl, lcl):
    if lcl <= cumsum <= ucl:
        return 0
    return 1


outlier_udf = f.udf(_is_outlier)
Example #22
0
def parquet_to_pcd(spark,
                   day_parquet,
                   day_store_dir,
                   day_base_timestamp,
                   min_bucket=1,
                   max_bucket=1441):
    def max_rows(rs):
        maxes = []
        for feat in renamed[3:]:
            maxes.append(max([r[feat] for r in rs]))
        return maxes

    def create_rows(z, y, x, zc, yc, xc, zi, yi, xi, zcw, ycw, xcw, ziw, yiw,
                    xiw, zcl, ycl, xcl, zil, yil, xil):
        # Choose correct maximum values, including when lead is null
        xcf = max(xc, xcw) if xcl is None else max(xc, xcl)
        ycf = max(yc, ycw) if ycl is None else max(yc, ycl)
        zcf = max(zc, zcw) if zcl is None else max(zc, zcl)
        xif = max(xi, xiw) if xil is None else max(xi, xil)
        yif = max(yi, yiw) if yil is None else max(yi, yil)
        zif = max(zi, ziw) if zil is None else max(zi, zil)

        # Create points
        fs = '{} {} {} {}\n{} {} {} {}\n{} {} {} {}\n'
        xf, yf, zf = float(x), float(y), float(z)
        xrgb = stall_to_float(xcf, xif)
        yrgb = stall_to_float(ycf, yif)
        zrgb = stall_to_float(zcf, zif)
        s = fs.format(xf + 0.5, yf, zf, xrgb, xf, yf + 0.5, zf, yrgb, xf, yf,
                      zf + 0.5, zrgb)
        return s

    def with_cols(df, names, cols):
        for n, c in zip(names, cols):
            df = df.withColumn(n, c)
        return df

    def rename_cols(df, old, new):
        for o, n in zip(old, new):
            df = df.withColumnRenamed(o, n)
        return df

    def create_filestring_tup(r):
        l = r[1][1] * 3
        g = header_fmt.format(l, l)
        return (r[0], g + r[1][0])

    try:
        os.mkdir(day_store_dir)
    except OSError:
        pass

    # Add bucket
    df = spark.read.parquet(day_parquet)
    df = df.withColumn('bucket',
                       F.ceil((F.col('#Time') - day_base_timestamp + 30) / 60))
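    # Illustrative bucketing, assuming '#Time' is in seconds: a reading 45 s after
    # day_base_timestamp gets bucket ceil((45 + 30) / 60) = 2, so the +30 s offset
    # centers each one-minute bucket on a minute mark (buckets 1..1440 span the day).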

    # Filter
    df = df.where((F.col('bucket') >= min_bucket) &\
                  (F.col('bucket') < max_bucket))

    # Max between the 2 compids and any extra readings
    df = df.select('bucket', *fix_stats)
    df = df.na.fill({k: 0 for k in fix_stats[3:]})  # fill nulls with 0
    df = rename_cols(df, fix_stats, renamed)
    rdd = df.rdd
    rdd = rdd.map(lambda r: ((r['bucket'], r['Z'], r['Y'], r['X']), [r]))\
             .reduceByKey(lambda a, b: a + b)\
             .map(lambda kv: list(map(int, list(kv[0]) + max_rows(kv[1]))))
    df = spark.createDataFrame(rdd, ['bucket'] + renamed)

    # Add corresponding minus directions
    xw = Window.partitionBy('bucket', 'Z', 'Y').orderBy('X')
    yw = Window.partitionBy('bucket', 'Z', 'X').orderBy('Y')
    zw = Window.partitionBy('bucket', 'Y', 'X').orderBy('Z')

    wrap_names = [
        'ZC_wrap', 'YC_wrap', 'XC_wrap', 'ZI_wrap', 'YI_wrap', 'XI_wrap'
    ]
    wrap_cols = [
        F.first('ZC-').over(zw),
        F.first('YC-').over(yw),
        F.first('XC-').over(xw),
        F.first('ZI-').over(zw),
        F.first('YI-').over(yw),
        F.first('XI-').over(xw)
    ]

    lead_names = [
        'ZC_lead', 'YC_lead', 'XC_lead', 'ZI_lead', 'YI_lead', 'XI_lead'
    ]
    lead_cols = [
        F.lead('ZC-').over(zw),
        F.lead('YC-').over(yw),
        F.lead('XC-').over(xw),
        F.lead('ZI-').over(zw),
        F.lead('YI-').over(yw),
        F.lead('XI-').over(xw)
    ]

    df = with_cols(df, wrap_names, wrap_cols)
    df = with_cols(df, lead_names, lead_cols)
    df = df.drop(*renamed[9:])

    # Calculate string
    str_args = renamed[:9] + wrap_names + lead_names
    udf_create_rows = F.udf(create_rows, StringType())
    df = df.withColumn('pt_string', udf_create_rows(*str_args))\
           .drop(*str_args)

    # Count and add headers
    rdd = df.rdd
    rdd = rdd.map(lambda r: (r['bucket'], [r['pt_string'], 1]))\
             .reduceByKey(lambda s1, s2: [s1[0] + s2[0], s1[1] + s2[1]])\
             .map(create_filestring_tup)
    file_contents = rdd.collect()

    for b, contents in file_contents:
        with open(get_bucketfile(day_store_dir, b), 'w+') as f:
            f.write(contents)
Example #23
0
df = spark.read.csv('ratings.csv', header=True)

# Remove directory where results will be stored.
shutil.rmtree('output4', ignore_errors=True, onerror=None)

# Select the two columns needed in the exercise.
(df.select('movieId', 'rating')

 # Change rating type to float.
 .withColumn('rating', df['rating'].cast('float'))

 # Group by movie id, calculating the average of the ratings.
 .groupBy('movieId').agg(avg('rating'))

 # Create a new column with the range corresponding to each film, which is the ceiling of
 # its average rating.
 .withColumn('Range', ceil('avg(rating)'))
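 # For example, an average rating of 3.2 falls in Range ceil(3.2) = 4.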

 # We select only the range and the movie id.
 .select('Range', 'movieId')

 # We group films by range, collecting in a list all movie ids in a range.
 .groupBy('Range').agg(collect_list('movieId').alias('ids'))

 # Sort by range (this is done to see the results more clearly).
 .sort('Range')

 # We create an RDD from the DataFrame and reduce the number of partitions to one in order
 # to store the results in a single file (otherwise it creates 199 output files).
 .rdd.coalesce(1).saveAsTextFile('output4'))
Example #24
0
#bestModel.save('trainning_model_version3')
#model = PipelineModel.load('trainning_model2')
print("type model = ", bestModel)
print(bestModel.stages[2].explainParam('maxIter'))
print(bestModel.stages[2].explainParam('regParam'))
print(bestModel.stages[2].explainParam('elasticNetParam'))
filePath = 'test.csv'
customSchema = StructType([
    StructField('PassengerId', IntegerType(), False),
    StructField('PClass', StringType(), True),
    StructField('Name', StringType(), False),
    StructField('Sex', StringType(), True),
    StructField('Age', FloatType(), True),
    StructField('SibSb', StringType(), True),
    StructField('Parch', StringType(), True),
    StructField('Ticket', StringType(), True),
    StructField('Fare', FloatType(), True),
    StructField('Cabin', StringType(), True),
    StructField('Embarked', StringType(), True)
])
rawTesting = spark.read.csv(filePath, header=True, schema=customSchema)
selectedTesting = rawTesting.select('PassengerId', 'PClass', 'Sex', 'Age',
                                    'Fare')
addingColTesting = selectedTesting.withColumn(
    'Missing_Age', selectedTesting['Age'].isNull()).withColumn(
        'Missing_Fare', selectedTesting['Fare'].isNull())

result = model.transform(addingColTesting).select(
    'PassengerId',
    ceil(col('prediction')).alias('Survived'))
result.write.csv('output_version3.csv', header=True, mode='overwrite')
    def usage(transform_context, record_store_df):
        """component which groups together record store records by
        provided group by columns list, sorts within the group by event
        timestamp field, applies group stats udf and returns the latest
        quantity as a instance usage dataframe

        This component does groups records by event_type (a.k.a metric name)
        and expects two kinds of records in record_store data
        total quantity records - the total available quantity
        e.g. cpu.total_logical_cores
        idle perc records - percentage that is idle
        e.g. cpu.idle_perc

        To calculate the utilized quantity  this component uses following
        formula:

        utilized quantity = ceil((100 - idle_perc) * total_quantity / 100)

        """

        sql_context = SQLContext.getOrCreate(record_store_df.rdd.context)

        transform_spec_df = transform_context.transform_spec_df_info

        # get rollup operation (sum, max, avg, min)
        agg_params = transform_spec_df.select(
            "aggregation_params_map.usage_fetch_operation"). \
            collect()[0].asDict()
        usage_fetch_operation = agg_params["usage_fetch_operation"]

        # check if operation is valid
        if not FetchQuantityUtil. \
                _is_valid_fetch_quantity_util_operation(usage_fetch_operation):
            raise FetchQuantityUtilException(
                "Operation %s is not supported" % usage_fetch_operation)

        # get the quantities for idle perc and quantity
        instance_usage_df = FetchQuantity().usage(
            transform_context, record_store_df)

        # get aggregation period for instance usage dataframe
        agg_params = transform_spec_df.select(
            "aggregation_params_map.aggregation_period").collect()[0].asDict()
        aggregation_period = agg_params["aggregation_period"]
        group_by_period_list = ComponentUtils.\
            _get_instance_group_by_period_list(aggregation_period)

        # get what we want to group by
        agg_params = transform_spec_df.select(
            "aggregation_params_map.aggregation_group_by_list").\
            collect()[0].asDict()
        aggregation_group_by_list = agg_params["aggregation_group_by_list"]

        # group by columns list
        group_by_columns_list = group_by_period_list + \
            aggregation_group_by_list

        # get quantity event type
        agg_params = transform_spec_df.select(
            "aggregation_params_map.usage_fetch_util_quantity_event_type").\
            collect()[0].asDict()
        usage_fetch_util_quantity_event_type = \
            agg_params["usage_fetch_util_quantity_event_type"]

        # check if driver parameter is provided
        if usage_fetch_util_quantity_event_type is None or \
                usage_fetch_util_quantity_event_type == "":
            raise FetchQuantityUtilException(
                "Driver parameter  '%s' is missing"
                % "usage_fetch_util_quantity_event_type")

        # get idle perc event type
        agg_params = transform_spec_df.select(
            "aggregation_params_map.usage_fetch_util_idle_perc_event_type").\
            collect()[0].asDict()
        usage_fetch_util_idle_perc_event_type = \
            agg_params["usage_fetch_util_idle_perc_event_type"]

        # check if driver parameter is provided
        if usage_fetch_util_idle_perc_event_type is None or \
                usage_fetch_util_idle_perc_event_type == "":
            raise FetchQuantityUtilException(
                "Driver parameter  '%s' is missing"
                % "usage_fetch_util_idle_perc_event_type")

        # get quantity records dataframe
        event_type_quantity_clause = "processing_meta.event_type='%s'" \
            % usage_fetch_util_quantity_event_type
        quantity_df = instance_usage_df.select('*').where(
            event_type_quantity_clause).alias("quantity_df_alias")

        # get idle perc records dataframe
        event_type_idle_perc_clause = "processing_meta.event_type='%s'" \
            % usage_fetch_util_idle_perc_event_type
        idle_perc_df = instance_usage_df.select('*').where(
            event_type_idle_perc_clause).alias("idle_perc_df_alias")

        # join quantity records with idle perc records
        # create a join condition without the event_type
        cond = [item for item in group_by_columns_list
                if item != 'event_type']
        quant_idle_perc_df = quantity_df.join(idle_perc_df, cond, 'left')

        #
        # Find utilized quantity based on idle percentage
        #
        # utilized quantity = (100 - idle_perc) * total_quantity / 100
        #
        quant_idle_perc_calc_df = quant_idle_perc_df.select(
            col("quantity_df_alias.*"),
            when(col("idle_perc_df_alias.quantity") != 0.0,
                 ceil(((100.0 - col(
                     "idle_perc_df_alias.quantity"))) * col(
                     "quantity_df_alias.quantity") / 100.0))
            .otherwise(col("quantity_df_alias.quantity"))
            .alias("utilized_quantity"),

            col("quantity_df_alias.quantity")
            .alias("total_quantity"),

            col("idle_perc_df_alias.quantity")
            .alias("idle_perc"))

        instance_usage_json_rdd = \
            quant_idle_perc_calc_df.rdd.map(
                FetchQuantityUtil._format_quantity_util)

        instance_usage_df = \
            InstanceUsageUtils.create_df_from_json_rdd(sql_context,
                                                       instance_usage_json_rdd)

        return instance_usage_df
def transform_df(df):
    df = df.withColumn("Home Team", name_changer_udf(col("Home Team"))) \
        .withColumn("Away Team", name_changer_udf(col("Away Team"))) \
        .withColumn("day_of_week", date_format('date', 'E')) \
        .filter(col("Play Off Game?") != "Y") \
        .withColumn("night_game", when(hour(col("Kick-off (local)")) >= 17, 1).otherwise(0)) \
        .withColumn("game_index", monotonically_increasing_id()) \
        .withColumn("extra_time", when(col("Over Time?") == "Y", lit(1)).otherwise(lit(0)))

    home_games = df.select(df["Home Team"].alias("team"),
                           df["Home Score"].alias("score"),
                           df["Home Odds"].alias("odds"),
                           df["Away Team"].alias("opp_team"),
                           df["Away Score"].alias("opp_score"),
                           df["Away Odds"].alias("opp_odds"),
                           df["Date"].alias("date"),
                           df["day_of_week"],
                           df["night_game"],
                           df["game_index"],
                           df["extra_time"],
                           df["Kick-off (local)"].alias("local_time")) \
        .withColumn("type", lit("home"))

    away_games = df.select(df["Away Team"].alias("team"),
                           df["Away Score"].alias("score"),
                           df["Away Odds"].alias("odds"),
                           df["Home Team"].alias("opp_team"),
                           df["Home Score"].alias("opp_score"),
                           df["Home Odds"].alias("opp_odds"),
                           df["Date"].alias("date"),
                           df["day_of_week"],
                           df["night_game"],
                           df["game_index"],
                           df["extra_time"],
                           df["Kick-off (local)"].alias("local_time")) \
        .withColumn("type", lit("away"))

    games = home_games.union(away_games)

    get_record_udf = udf(get_record, IntegerType())
    get_time_between_udf = udf(get_time_between, FloatType())

    games = games \
        .withColumn("year", year(col("date"))) \
        .withColumn("points_awarded",
                    when(col("score") > col("opp_score"),
                         lit(2)) \
                    .otherwise(when(col("score") < col("opp_score"),
                                    lit(0)) \
                               .otherwise(lit(1)))) \
        .sort(col("date"), col("local_time")) \
        .withColumn("rest",
                    datediff(col("date"),
                             lag(col("date"), 1) \
                             .over(Window. \
                                   partitionBy(col("team"), col("year")) \
                                   .orderBy(col("date"))))) \
        .withColumn("result", when(col("score") > col("opp_score"), lit(1)) \
                    .otherwise(when(col("score") < col("opp_score"),
                                    lit(0)) \
                               .otherwise(np.nan))) \
        .withColumn("win", when(col("score") > col("opp_score"), lit(1)) \
                    .otherwise(lit(0))) \
        .withColumn("loss", when(col("score") < col("opp_score"), lit(1)) \
                    .otherwise(lit(0))) \
        .withColumn("draw", when(col("score") == col("opp_score"), lit(1)) \
                    .otherwise(lit(0))) \
        .withColumn("record", collect_list(col("result")) \
                    .over(Window. \
                          partitionBy(col("team"), col("year")) \
                          .orderBy(col("date")))) \
        .withColumn("record_date", collect_list(col("date")) \
                    .over(Window. \
                          partitionBy(col("team"), col("year")) \
                          .orderBy(col("date")))) \
        .withColumn("record_extra_time", collect_list(col("extra_time")) \
                    .over(Window. \
                          partitionBy(col("team"), col("year")) \
                          .orderBy(col("date")))) \
        .withColumn("total_points", sum(col("points_awarded")) \
                    .over(Window. \
                          partitionBy(col("team"), col("year")) \
                          .orderBy(col("date"))) - col("points_awarded")) \
        .withColumn("total_points_after_game", sum(col("points_awarded")) \
                    .over(Window. \
                          partitionBy(col("team"), col("year")) \
                          .orderBy(col("date")))) \
        .withColumn("total_for", sum(col("score")) \
                    .over(Window. \
                          partitionBy(col("team"), col("year")) \
                          .orderBy(col("date"))) - col("score")) \
        .withColumn("total_for_after_game", sum(col("score")) \
                    .over(Window. \
                          partitionBy(col("team"), col("year")) \
                          .orderBy(col("date")))) \
        .withColumn("total_against", sum(col("opp_score")) \
                    .over(Window. \
                          partitionBy(col("team"), col("year")) \
                          .orderBy(col("date"))) - col("opp_score")) \
        .withColumn("total_against_after_game", sum(col("opp_score")) \
                    .over(Window. \
                          partitionBy(col("team"), col("year")) \
                          .orderBy(col("date")))) \
        .withColumn("total_for_per_game", col("total_for") / ((count("*")
                                                               .over(Window.
                                                                     partitionBy(col("team"), col("year"))
                                                                     .orderBy(col("date")))) - 1)) \
        .withColumn("total_against_per_game", col("total_against") / ((count("*")
                                                                       .over(Window.
                                                                             partitionBy(col("team"), col("year"))
                                                                             .orderBy(col("date")))) - 1)) \
        .withColumn("total_diff_per_game", col("total_for_per_game") - col("total_against_per_game")) \
        .withColumn("total_diff", col("total_for") - col("total_against")) \
        .withColumn("total_diff_after_game", col("total_for_after_game") - col("total_against_after_game")) \
        .withColumn("time_from_last_win",
                    get_time_between_udf(col("record"), col("record_date"), col("date"), lit("win"))) \
        .withColumn("time_from_last_extra_time_game",
                    get_time_between_udf(col("record_extra_time"), col("record_date"), col("date"), lit("win"))) \
        .withColumn("time_from_last_loss",
                    get_time_between_udf(col("record"), col("record_date"), col("date"), lit("loss"))) \
        .withColumn("time_from_last_draw", get_time_between_udf(col("record"), col("record_date"), col("date"))) \
        .withColumn("wins_in_a_row", get_record_udf(col("record"), lit(True))) \
        .withColumn("losses_in_a_row", get_record_udf(col("record"), lit(False)))

    pgames = games.toPandas().sort_values(
        ['game_index'], ascending=False).reset_index(drop=True)

    for index, row in pgames.iterrows():
        if (index - 16) < 0 or row['date'].year != pgames.loc[(index - 16),
                                                              'date'].year:
            pgames.loc[index, 'position'] = np.nan
        else:
            table = []
            for reindex in range(index - 1, -1, -1):
                if row['date'].year != pgames.loc[reindex, 'date'].year or len(
                        table) == 16:
                    break
                elif row['game_index'] == pgames.loc[
                        reindex,
                        'game_index'] or pgames.loc[reindex, 'team'] in [
                            i['team'] for i in table
                        ]:
                    continue
                else:
                    team_pos = {}
                    team_pos['team'] = pgames.loc[reindex, 'team']
                    team_pos['points'] = pgames.loc[reindex,
                                                    'total_points_after_game']
                    team_pos['diff'] = pgames.loc[reindex,
                                                  'total_diff_after_game']
                    table.append(team_pos)
            table = sorted(table,
                           key=itemgetter('points', 'diff'),
                           reverse=True)
            info_by_team = build_dict(table, key='team')
            pgames.loc[index,
                       'position'] = info_by_team[row['team']]['index'] + 1

    for index, row in pgames.iterrows():
        if (index - 16) < 0 or row['date'].year != pgames.loc[(index - 16),
                                                              'date'].year:
            pgames.loc[index, 'position'] = np.nan
        else:
            table = []
            for reindex in range(index, -1, -1):
                if row['date'].year != pgames.loc[reindex, 'date'].year or len(
                        table) == 16:
                    break
                elif pgames.loc[reindex, 'team'] in [i['team'] for i in table]:
                    continue
                else:
                    team_pos = {}
                    team_pos['team'] = pgames.loc[reindex, 'team']
                    team_pos['points'] = pgames.loc[reindex,
                                                    'total_points_after_game']
                    team_pos['diff'] = pgames.loc[reindex,
                                                  'total_diff_after_game']
                    table.append(team_pos)
            table = sorted(table,
                           key=itemgetter('points', 'diff'),
                           reverse=True)
            info_by_team = build_dict(table, key='team')
            pgames.loc[
                index,
                'position_after_game'] = info_by_team[row['team']]['index'] + 1

    games = op.create \
        .df(pdf=pgames) \
        .withColumn("ranking_quantile",
                    when(col("position") != np.nan,
                         ceil(col("position") / 4)) \
                    .otherwise(np.nan)) \
        .withColumn("opp_position", sum(col("position")) \
                    .over(Window.partitionBy("game_index")) - col("position")) \
        .withColumn("opp_ranking_quantile",
                    when(col("opp_position") != np.nan,
                         ceil(col("opp_position") / 4)) \
                    .otherwise(np.nan)) \
        .withColumn("previous_opp_position",
                    lag(col("opp_position"), 1) \
                    .over(Window.partitionBy("team").orderBy(col("game_index")))) \
        .withColumn("previous_opp_ranking_quantile",
                    when(col("previous_opp_position") != np.nan,
                         ceil(col("previous_opp_position") / 4)) \
                    .otherwise(np.nan)) \
        .withColumn("previous_result",
                    lag(col("points_awarded"), 1) \
                    .over(Window.partitionBy("team").orderBy(col("game_index")))) \
        .withColumn("previous_result_ranking", col("previous_result") * col("previous_opp_ranking_quantile"))

    home_games = games \
        .filter(col("type") == "home") \
        .select(col("team").alias("home_team"),
                col("odds").alias("home_odds"),
                col("opp_team").alias("opp_away_team"),
                col("score").alias("home_score"),
                col("date"),
                col("local_time"),
                col("day_of_week"),
                col("night_game"),
                col("game_index"),
                col("wins_in_a_row").alias("home_wins_in_a_row"),
                col("losses_in_a_row").alias("home_losses_in_a_row"),
                col("position").alias("home_position"),
                col("ranking_quantile").alias("home_ranking_quantile"),
                col("total_points").alias("home_points"),
                col("total_for_per_game").alias("home_for_per_game"),
                col("total_against_per_game").alias("home_against_per_game"),
                col("previous_result_ranking").alias("home_previous_result"),
                col("time_from_last_win").alias("home_time_from_last_win"),
                col("time_from_last_extra_time_game").alias("home_time_from_last_extra_time_game"),
                col("time_from_last_loss").alias("home_time_from_last_loss"),
                col("time_from_last_draw").alias("home_time_from_last_draw"),
                col("rest").alias("home_rest"))

    away_games = games \
        .filter(col("type") == "away") \
        .select(col("team").alias("away_team"),
                col("odds").alias("away_odds"),
                col("opp_team").alias("opp_home_team"),
                col("score").alias("away_score"),
                col("position").alias("away_position"),
                col("total_points").alias("away_points"),
                col("wins_in_a_row").alias("away_wins_in_a_row"),
                col("losses_in_a_row").alias("away_losses_in_a_row"),
                col("ranking_quantile").alias("away_ranking_quantile"),
                col("total_for_per_game").alias("away_for_per_game"),
                col("total_against_per_game").alias("away_against_per_game"),
                col("rest").alias("away_rest"),
                col("previous_result_ranking").alias("away_previous_result"),
                col("time_from_last_win").alias("away_time_from_last_win"),
                col("time_from_last_extra_time_game").alias("away_time_from_last_extra_time_game"),
                col("time_from_last_loss").alias("away_time_from_last_loss"),
                col("time_from_last_draw").alias("away_time_from_last_draw"),
                col("date").alias("away_date"))

    df = home_games.join(away_games, (home_games['home_team'] == away_games['opp_home_team']) & \
                         (home_games['opp_away_team'] == away_games['away_team']) & \
                         (home_games['date'] == away_games['away_date'])) \
        .drop("opp_away_team", "opp_home_team", "away_date") \
        .withColumn("rest_spread", col("home_rest") - col("away_rest")) \
        .withColumn("game_id", monotonically_increasing_id()) \
        .withColumn("home_win", when(col("home_score") > col("away_score"), lit(1)) \
                    .otherwise(when(col("home_score") < col("away_score"),
                                    lit(0)) \
                               .otherwise(np.nan))) \
        .withColumn("winner", when(col("home_score") > col("away_score"), lit("home")) \
                    .otherwise(when(col("home_score") < col("away_score"),
                                    lit("away")) \
                               .otherwise("draw"))) \
        .withColumn("margin", col("home_score") - col("away_score")) \
        .withColumn("year", year(col("date"))) \
        .withColumn("hour", hour(col("local_time"))) \
        .withColumn("game_id_season", row_number().over(Window.partitionBy(col("year")).orderBy(col("date")))) \
        .withColumn("first_round", when(col("game_id_season") <= 8, lit(1)).otherwise(lit(0))) \
        .withColumn("second_round",
                    when((col("game_id_season") <= 16) & (col("game_id_season") > 8), lit(1)).otherwise(lit(0))) \
        .drop("local_time")

    pdf = df.toPandas()

    return pdf
# # Final calculation

dm_final = dm_with_fourweek.withColumn(
    "dm_order_qty_without_pcb", dm_with_fourweek.regular_sales_before_dm +
    dm_with_fourweek.four_weeks_after_dm + dm_with_fourweek.dm_sales)

dm_final = dm_final \
        .withColumn("first_dm_order_qty_without_pcb",
                F.when(dm_final.rotation != 'X', 0.75 * dm_final.dm_order_qty_without_pcb)
                .otherwise(dm_final.dm_order_qty_without_pcb))

dm_final = dm_final \
        .withColumn("first_dm_order_qty",
                    F.when(dm_final.first_dm_order_qty_without_pcb > 0.0,
                           F.ceil(dm_final.first_dm_order_qty_without_pcb / dm_final.pcb) * dm_final.pcb)
                    .otherwise(0))

dm_final_pcb = dm_final \
        .withColumn("dm_order_qty",
                    F.when(dm_final.dm_order_qty_without_pcb > 0.0,
                           F.ceil(dm_final.dm_order_qty_without_pcb / dm_final.pcb) * dm_final.pcb)
                    .otherwise(0))
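# Illustrative pack-size rounding (values are made up): with dm_order_qty_without_pcb = 10
# and pcb = 4, dm_order_qty = ceil(10 / 4) * 4 = 12; for a non-'X' rotation the first
# order is 0.75 * 10 = 7.5, so first_dm_order_qty = ceil(7.5 / 4) * 4 = 8.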

# +
dm_final_pcb = dm_final_pcb.withColumn(
    "first_dm_order_qty", dm_final_pcb["first_dm_order_qty"].cast("Int"))

dm_final_pcb = dm_final_pcb.withColumn(
    "dm_order_qty", dm_final_pcb["dm_order_qty"].cast("Int"))
# -
def dm_order_simulation(date_str):
    warehouse_location = abspath('spark-warehouse')

    print_output(
        f'\n Forecast simulation process for DM start with input date {date_str} \n'
    )

    # for logging
    output_str = ""
    info_str = f"Job start:{get_current_time()}, "

    spark = SparkSession.builder \
        .appName("Forecast process for DM") \
        .config("spark.sql.warehouse.dir", warehouse_location) \
        .config("spark.driver.memory", '6g') \
        .config("spark.executor.memory", '6g') \
        .config("spark.num.executors", '14') \
        .config("hive.exec.compress.output", 'false') \
        .config("spark.sql.broadcastTimeout", 7200) \
        .config("spark.sql.autoBroadcastJoinThreshold", -1) \
        .enableHiveSupport() \
        .getOrCreate()

    sc = spark.sparkContext

    sqlc = SQLContext(sc)

    print_output('Spark environment loaded')

    run_date = datetime.datetime.strptime(date_str, '%Y%m%d').date()

    # starting day of the DM calculation period
    start_date = run_date + timedelta(weeks=4)

    # end day of the DM calculation period
    end_date = run_date + timedelta(weeks=5)
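    # e.g. a run_date of 2019-01-01 gives start_date = 2019-01-29 and end_date = 2019-02-05,
    # i.e. DM themes starting between four and five weeks after the run date are considered.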

    stock_date = run_date + timedelta(days=-1)

    parameter = "Run date:" + run_date.strftime("%Y%m%d") \
                + ", DM start date:" + start_date.strftime("%Y%m%d") \
                + ", DM end date:" + end_date.strftime("%Y%m%d")

    print_output(
        f"Load DM items and stores for DM that starts between {start_date} and {end_date}"
    )

    dm_item_store_sql = \
        """
        SELECT distinct ndt.dm_theme_id,
            ndt.theme_start_date,
            ndt.theme_end_date,
            del.npp,
            del.ppp,
            del.ppp_start_date,
            del.ppp_end_date,
            del.city_code,
            id.store_code,
            del.dept_code,
            id.con_holding,
            id.risk_item_unilever,
            cast(id.qty_per_unit as int) as pcb,
            id.dc_supplier_code,
            id.ds_supplier_code,
            id.rotation,
            icis.item_id,
            icis.sub_id,
            icis.item_code,
            icis.sub_code,
            icis.date_key AS run_date,
            fdo.first_order_date AS past_result
        FROM vartefact.forecast_nsa_dm_extract_log del
        JOIN ods.nsa_dm_theme ndt ON del.dm_theme_id = ndt.dm_theme_id
        JOIN ods.p4md_stogld ps ON del.city_code = ps.stocity
        JOIN vartefact.forecast_store_item_details id ON ps.stostocd = id.store_code
            AND del.item_code = CONCAT (
                id.dept_code,
                id.item_code
                )
            AND del.sub_code = id.sub_code
            AND del.dept_code = id.dept_code
            AND id.store_status != 'Stop'
            AND id.item_type not in ('New','Company Purchase','Seasonal')
        JOIN vartefact.forecast_item_code_id_stock icis ON icis.date_key = '{0}'
            AND id.item_code = icis.item_code
            AND id.sub_code = icis.sub_code
            AND id.dept_code = icis.dept_code
            AND id.store_code = icis.store_code
        LEFT JOIN vartefact.forecast_simulation_dm_orders fdo ON ndt.dm_theme_id = fdo.dm_theme_id
            AND icis.dept_code = fdo.dept_code
            AND icis.item_code = fdo.item_code
            AND icis.sub_code = fdo.sub_code
            AND icis.store_code = fdo.store_code
        WHERE del.extract_order >= 40
            AND del.date_key = '{1}'
            AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') >= to_timestamp('{2}', 'yyyyMMdd')
            AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') < to_timestamp('{3}', 'yyyyMMdd')
        """.replace("\n", " ")

    dm_item_store_sql = dm_item_store_sql.format(stock_date.strftime("%Y%m%d"),
                                                 run_date.strftime("%Y%m%d"),
                                                 start_date.strftime("%Y%m%d"),
                                                 end_date.strftime("%Y%m%d"))

    # # Exclude the DM that already have orders

    dm_item_store_df = sqlc.sql(dm_item_store_sql)

    print_output(
        f"Number of DM item stores in date range {dm_item_store_df.count()}")

    print_output("Exclude the DM that already have orders")

    dm_item_store_df = dm_item_store_df.filter("past_result is null")

    output_line = f"After filtering already calculated DM {dm_item_store_df.count()}"

    print_output(output_line)
    output_str = output_str + output_line + ","

    # # Only consider the nearest DM

    first_dm = dm_item_store_df. \
        groupBy(['item_id', 'sub_id', 'store_code']). \
        agg(F.min("theme_start_date").alias("theme_start_date"))

    dm_item_store_df = dm_item_store_df.join(
        first_dm, ['item_id', 'sub_id', 'store_code', 'theme_start_date'])

    dm_item_store_cnt = dm_item_store_df.count()

    print_output(f"After getting only first DM {dm_item_store_cnt}")
    output_str = output_str + f"After getting only first DM {dm_item_store_cnt}," + ","

    if dm_item_store_cnt == 0:
        print_output(
            f"skip date {date_str} cause no active order opportunity for today"
        )
        info_str = info_str + f"Job Finish:{get_current_time()},"
        info_str = info_str + f"skip date {date_str} cause no active order opportunity for today"
        insert_script_run(date_str, "Success", parameter, output_str, info_str,
                          "", sqlc)
        return

    dm_item_store_df.createOrReplaceTempView("dm_item_store")

    # # The first order day within PPP period
    print_output("Get first order day within PPP period")
    onstock_order_sql = \
        """
        SELECT dis.item_id,
            dis.sub_id,
            dis.store_code,
            ord.date_key AS first_order_date,
            dev.date_key AS first_delivery_date
        FROM dm_item_store dis
        JOIN vartefact.forecast_onstock_order_delivery_mapping mp ON dis.dept_code = mp.dept_code
            AND dis.rotation = mp.rotation
            AND dis.store_code = mp.store_code
        JOIN vartefact.forecast_calendar ord ON ord.iso_weekday = mp.order_iso_weekday
        JOIN vartefact.forecast_calendar dev ON dev.iso_weekday = mp.delivery_iso_weekday
            AND dev.week_index = ord.week_index + mp.week_shift
        WHERE to_timestamp(ord.date_key, 'yyyyMMdd') >= to_timestamp(dis.ppp_start_date, 'yyyy-MM-dd')
            AND to_timestamp(dev.date_key, 'yyyyMMdd') >= date_add(to_timestamp(dis.theme_start_date, 'yyyy-MM-dd'), -7)
            AND dev.date_key <= '{0}'
        """.replace("\n", " ")

    onstock_order_sql = onstock_order_sql.format(end_date.strftime("%Y%m%d"))

    onstock_order_deliver_df = sqlc.sql(onstock_order_sql)

    xdock_order_sql = \
        """
        SELECT dis.item_id,
            dis.sub_id,
            dis.store_code,
            ord.date_key AS first_order_date,
            date_format(
                date_add(
                    to_timestamp(dodm.delivery_date, 'yyyyMMdd'), xo.dc_to_store_time
                    ),
                'yyyyMMdd'
            ) AS first_delivery_date
        FROM dm_item_store dis
        JOIN vartefact.forecast_xdock_order_mapping xo ON dis.item_code = xo.item_code
            AND dis.sub_code = xo.sub_code
            AND dis.dept_code = xo.dept_code
            AND dis.store_code = xo.store_code
        JOIN vartefact.forecast_calendar ord ON ord.iso_weekday = xo.order_iso_weekday
        JOIN vartefact.forecast_dc_order_delivery_mapping dodm ON dodm.con_holding = dis.con_holding
            AND dodm.order_date = ord.date_key
            AND dis.risk_item_unilever = dodm.risk_item_unilever
        WHERE to_timestamp(ord.date_key, 'yyyyMMdd') >= to_timestamp(dis.ppp_start_date, 'yyyy-MM-dd')
            AND date_add(to_timestamp(dodm.delivery_date, 'yyyyMMdd'), xo.dc_to_store_time)  <= to_timestamp('{0}', 'yyyyMMdd')
        """.replace("\n", " ")

    xdock_order_sql = xdock_order_sql.format(end_date.strftime("%Y%m%d"))

    xdock_order_deliver_df = sqlc.sql(xdock_order_sql)

    order_deliver_df = onstock_order_deliver_df.union(xdock_order_deliver_df)

    first_order_df = order_deliver_df.groupBy(['item_id', 'sub_id', 'store_code']). \
        agg(F.min("first_order_date").alias("first_order_date"))

    first_order_deliver_df = order_deliver_df \
        .select(['item_id', 'sub_id', 'store_code', 'first_order_date', 'first_delivery_date']) \
        .join(first_order_df, ['item_id', 'sub_id', 'store_code', 'first_order_date'])

    dm_item_store_order_df = dm_item_store_df \
        .join(first_order_deliver_df, \
              ['item_id', 'sub_id', 'store_code'])

    dm_item_store_order_df.createOrReplaceTempView("dm_item_store_order")

    output_line = f"Number of item stores that will have DM {dm_item_store_order_df.count()}"
    print_output(output_line)
    output_str = output_str + output_line + ","

    # # Get DM sales prediction

    dm_sales_predict_sql = \
        """
        select 
          dm.*,
          cast(coalesce(pred.sales_prediction, '0', pred.sales_prediction) as double) as dm_sales,
          coalesce(pred.sales_prediction, 'no', 'yes') as having_dm_prediction
        from 
            dm_item_store_order dm
        left join temp.v_forecast_simulation_dm_sales_prediction pred
            on cast(pred.item_id as int) = dm.item_id
            and cast(pred.sub_id as int) = dm.sub_id
            and cast(pred.current_dm_theme_id as int) = dm.dm_theme_id
            and pred.store_code = dm.store_code
        """.replace("\n", " ")

    dm_prediction = sqlc.sql(dm_sales_predict_sql)

    dm_prediction.filter("having_dm_prediction = 'no' ") \
        .write.mode("overwrite").format("parquet") \
        .saveAsTable("vartefact.forecast_no_dm_prediction")

    dm_prediction.createOrReplaceTempView("dm_prediction")

    output_line = f"Number of DM sales prediction {dm_prediction.count()}"
    print_output(output_line)
    output_str = output_str + output_line + ","

    # # Regular sales from first order day to DM start day
    print_output("Regular sales before DM")

    dm_regular_sales_sql = \
        """
        SELECT dp.item_id,
            dp.sub_id,
            dp.store_code,
            dp.dm_theme_id,
            case when
              fcst.daily_sales_prediction_original < 0.2 and dp.rotation != 'A'
            then 0
            when
              fcst.daily_sales_prediction_original < 0
            then 0
            else fcst.daily_sales_prediction_original 
            end AS sales_prediction
        FROM temp.t_forecast_simulation_daily_sales_prediction fcst
        JOIN dm_prediction dp ON fcst.item_id = dp.item_id
            AND fcst.sub_id = dp.sub_id
            AND fcst.store_code = dp.store_code
            AND fcst.date_key > dp.first_delivery_date
            AND to_timestamp(fcst.date_key, 'yyyyMMdd') < to_timestamp(dp.theme_start_date, 'yyyy-MM-dd')
        """.replace("\n", " ")

    dm_regular_sales = sqlc.sql(dm_regular_sales_sql)

    agg_dm_regular_sales = dm_regular_sales.groupBy(['item_id', 'sub_id', 'store_code', 'dm_theme_id']). \
        agg(F.sum("sales_prediction").alias("regular_sales_before_dm"))

    dm_with_regular = dm_prediction.join(
        agg_dm_regular_sales,
        ['item_id', 'sub_id', 'store_code', 'dm_theme_id'], "left")

    # # For ppp <= 90% npp, get 4 weeks after sales for ROTATION A items
    print_output("DM PPP logic")

    after_fourweek_sql = \
        """
        SELECT dp.item_id,
            dp.sub_id,
            dp.store_code,
            dp.dm_theme_id,
            case when
              fcst.daily_sales_prediction_original < 0.2 and dp.rotation != 'A'
            then 0
            when
              fcst.daily_sales_prediction_original < 0
            then 0
            else fcst.daily_sales_prediction_original 
            end AS sales_prediction
        FROM dm_prediction dp
        JOIN temp.t_forecast_simulation_daily_sales_prediction fcst ON fcst.item_id = dp.item_id
            AND fcst.sub_id = dp.sub_id
            AND fcst.store_code = dp.store_code
            AND to_timestamp(fcst.date_key, 'yyyyMMdd') > to_timestamp(dp.theme_end_date, 'yyyy-MM-dd')
            AND to_timestamp(fcst.date_key, 'yyyyMMdd') < date_add(to_timestamp(dp.theme_end_date, 'yyyy-MM-dd'), 28)
        WHERE dp.rotation = 'A'
            AND dp.ppp <= dp.npp * 0.9
        """.replace("\n", " ")

    after_fourweek_sales = sqlc.sql(
        after_fourweek_sql.format(run_date.strftime("%Y%m%d")))

    agg_after_fourweek_sales = after_fourweek_sales.groupBy(['item_id', 'sub_id', 'store_code', 'dm_theme_id']). \
        agg(F.sum("sales_prediction").alias("four_weeks_after_dm"))

    output_line = f"Number of DM having PPP {agg_after_fourweek_sales.count()}"
    print_output(output_line)
    output_str = output_str + output_line + ","

    dm_with_fourweek = dm_with_regular.join(
        agg_after_fourweek_sales,
        ['item_id', 'sub_id', 'store_code', 'dm_theme_id'], "left")

    # # Fill NA

    dm_with_fourweek = dm_with_fourweek.na.fill(0)
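    # The two left joins above leave nulls for items without regular or
    # post-DM sales; filling with 0 keeps them in the final calculation.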
    dm_with_fourweek.cache()

    output_line = f"Number of DM store orders {dm_with_fourweek.count()}"
    print_output(output_line)
    output_str = output_str + output_line

    # # Final calculation

    print_output("Calculate order quantity")
    dm_final = dm_with_fourweek.withColumn(
        "dm_order_qty_without_pcb", dm_with_fourweek.regular_sales_before_dm +
        dm_with_fourweek.four_weeks_after_dm + dm_with_fourweek.dm_sales)

    dm_final = dm_final \
        .withColumn("first_dm_order_qty_without_pcb",
                    F.when(dm_final.rotation != 'X', 0.75 * dm_final.dm_order_qty_without_pcb)
                    .otherwise(dm_final.dm_order_qty_without_pcb))
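    # Rotation 'X' items place the full DM quantity on the first order;
    # every other rotation places 75% of it on the first order.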

    dm_final = dm_final \
        .withColumn("first_dm_order_qty",
                    F.when(dm_final.first_dm_order_qty_without_pcb > 0.0,
                           F.ceil(dm_final.first_dm_order_qty_without_pcb / dm_final.pcb) * dm_final.pcb)
                    .otherwise(int(0)))

    dm_final_pcb = dm_final \
        .withColumn("dm_order_qty",
                    F.when(dm_final.dm_order_qty_without_pcb > 0.0,
                           F.ceil(dm_final.dm_order_qty_without_pcb / dm_final.pcb) * dm_final.pcb)
                    .otherwise(int(0)))
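    # F.ceil(qty / pcb) * pcb rounds a positive quantity up to the nearest
    # multiple of the pack size, e.g. a quantity of 7 with pcb = 6 becomes 12.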

    dm_final_pcb.createOrReplaceTempView("dm_final_pcb")

    print_output("Write store order to datalake")

    dm_sql = \
        """
        INSERT INTO vartefact.forecast_simulation_dm_orders
        PARTITION (dm_theme_id)
        SELECT 
            item_id,
            sub_id,
            store_code,
            con_holding,
            theme_start_date,
            theme_end_date,
            npp,
            ppp,
            ppp_start_date,
            ppp_end_date,
            city_code,
            dept_code,
            item_code,
            sub_code,
            pcb,
            dc_supplier_code,
            ds_supplier_code,
            rotation,
            run_date,
            first_order_date,
            first_delivery_date,
            regular_sales_before_dm,
            four_weeks_after_dm,
            dm_sales,
            dm_order_qty,
            first_dm_order_qty,
            dm_order_qty_without_pcb,
            dm_theme_id
        FROM dm_final_pcb
        """.replace("\n", " ")

    sqlc.sql(dm_sql)

    sqlc.sql("refresh table vartefact.forecast_simulation_dm_orders")

    print_output("Finish writing store order to datalake")

    print_output("Start generating DC orders")

    dm_item_dc_sql = \
        """
        SELECT distinct ndt.dm_theme_id,
            ndt.theme_start_date,
            ndt.theme_end_date,
            del.npp,
            del.ppp,
            del.ppp_start_date,
            del.ppp_end_date,
            del.dept_code,
            dcid.holding_code,
            dcid.risk_item_unilever,
            dcid.primary_ds_supplier as ds_supplier_code,
            cast(dcid.qty_per_unit as int) as pcb,
            dcid.rotation,
            dcid.qty_per_unit,
            icis.item_id,
            icis.sub_id,
            icis.item_code,
            icis.sub_code,
            icis.date_key AS run_date
        FROM vartefact.forecast_nsa_dm_extract_log del
        JOIN ods.nsa_dm_theme ndt ON del.dm_theme_id = ndt.dm_theme_id
        JOIN ods.p4md_stogld ps ON del.city_code = ps.stocity
        JOIN vartefact.forecast_item_code_id_stock icis ON icis.date_key = '{0}'
            AND del.item_code = CONCAT (
                icis.dept_code,
                icis.item_code
                )
            AND del.sub_code = icis.sub_code
            AND del.dept_code = icis.dept_code
        JOIN vartefact.forecast_dc_item_details dcid ON dcid.item_code =icis.item_code
            AND dcid.sub_code = icis.sub_code
            AND dcid.dept_code = icis.dept_code
            AND dcid.rotation != 'X'
            AND dcid.dc_status != 'Stop'
            AND dcid.seasonal = 'No'
            AND dcid.item_type not in ('New','Company Purchase','Seasonal')
        JOIN vartefact.forecast_store_item_details id ON ps.stostocd = id.store_code
            AND dcid.dept_code = id.dept_code
            AND dcid.item_code = id.item_code
            AND dcid.sub_code = id.sub_code
        WHERE del.extract_order >= 40
            AND del.date_key = '{1}'
            AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') >= to_timestamp('{2}', 'yyyyMMdd')
            AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') < to_timestamp('{3}', 'yyyyMMdd')
        """.replace("\n", " ")

    dm_item_dc_sql = dm_item_dc_sql.format(stock_date.strftime("%Y%m%d"),
                                           run_date.strftime("%Y%m%d"),
                                           start_date.strftime("%Y%m%d"),
                                           end_date.strftime("%Y%m%d"))

    dm_item_dc_df = sqlc.sql(dm_item_dc_sql)

    first_dc_dm = dm_item_dc_df. \
        groupBy(['item_id', 'sub_id']). \
        agg(F.min("theme_start_date").alias("theme_start_date"))

    dm_item_dc_df = dm_item_dc_df.join(
        first_dc_dm, ['item_id', 'sub_id', 'theme_start_date'])

    output_line = f"Number of item that will have DM order in DC {dm_item_dc_df.count()}"
    print_output(output_line)
    output_str = output_str + output_line + ","

    dm_item_dc_df.cache()

    dm_item_dc_df.createOrReplaceTempView("dm_item_dc")

    # +
    dc_order_sql = \
        """
        SELECT distinct dis.item_id,
            dis.sub_id,
            ord.date_key AS first_order_date,
            dev.date_key AS first_delivery_date
        FROM dm_item_dc dis
        JOIN vartefact.forecast_dc_order_delivery_mapping dodm
            ON dis.holding_code = dodm.con_holding
            AND dis.risk_item_unilever = dodm.risk_item_unilever
        JOIN vartefact.forecast_calendar ord
            ON ord.date_key = dodm.order_date
        JOIN vartefact.forecast_calendar dev
            ON dev.weekday_short = dodm.delivery_weekday and dev.week_index = ord.week_index + dodm.week_shift
        WHERE to_timestamp(ord.date_key, 'yyyyMMdd') >= to_timestamp(dis.ppp_start_date, 'yyyy-MM-dd')
            AND dev.date_key <= '{0}'
            AND dis.rotation != 'X'
        """.replace("\n", " ")

    dc_order_sql = dc_order_sql.format(end_date.strftime("%Y%m%d"))

    # +
    dc_order_deliver_df = sqlc.sql(dc_order_sql)

    dc_first_order_df = dc_order_deliver_df.groupBy(['item_id', 'sub_id']). \
        agg(F.min("first_order_date").alias("first_order_date"))

    dc_first_order_deliver_df = dc_order_deliver_df \
        .select(['item_id', 'sub_id', 'first_order_date', 'first_delivery_date']) \
        .join(dc_first_order_df, ['item_id', 'sub_id', 'first_order_date'])
    # -

    dm_item_dc_order_df = dm_item_dc_df \
        .join(dc_first_order_deliver_df, \
              ['item_id', 'sub_id'])

    dm_item_dc_order_df.createOrReplaceTempView("dm_item_dc_order")

    dm_store_to_dc_sql = \
        """
        select 
          dm.item_id,
          dm.sub_id,
          dm.holding_code,
          dm.theme_start_date,
          dm.theme_end_date,
          dm.npp,
          dm.ppp,
          dm.ppp_start_date,
          dm.ppp_end_date,
          dm.dept_code,
          dm.item_code,
          dm.sub_code,
          dm.pcb,
          dm.ds_supplier_code,
          dm.rotation,
          dm.run_date,
          dm.first_order_date,
          dm.first_delivery_date,
          sum(sod.regular_sales_before_dm) as regular_sales_before_dm,
          sum(sod.four_weeks_after_dm) as four_weeks_after_dm,
          sum(sod.dm_sales) as dm_sales,
          sum(sod.order_qty) as dm_order_qty_without_pcb,
          dm.dm_theme_id
        FROM 
            vartefact.forecast_simulation_dm_orders sod
        JOIN dm_item_dc_order dm
            on sod.item_id = dm.item_id
            and sod.sub_id = dm.sub_id
            and sod.dm_theme_id = dm.dm_theme_id
        GROUP BY
          dm.dm_theme_id,
          dm.item_id,
          dm.sub_id,
          dm.holding_code,
          dm.theme_start_date,
          dm.theme_end_date,
          dm.npp,
          dm.ppp,
          dm.ppp_start_date,
          dm.ppp_end_date,
          dm.dept_code,
          dm.item_code,
          dm.sub_code,
          dm.pcb,
          dm.ds_supplier_code,
          dm.rotation,
          dm.run_date,
          dm.first_order_date,
          dm.first_delivery_date
        """.replace("\n", " ")

    dm_dc_order = sqlc.sql(dm_store_to_dc_sql)

    dm_dc_pcb = dm_dc_order \
        .withColumn("dm_order_qty",
                    F.when(dm_dc_order.dm_order_qty_without_pcb > 0.0,
                           F.ceil(dm_dc_order.dm_order_qty_without_pcb / dm_dc_order.pcb) * dm_dc_order.pcb)
                    .otherwise(int(0)))

    dm_dc_pcb.createOrReplaceTempView("dm_dc_final")

    output_line = f"Number of DM DC orders {dm_dc_pcb.count()}"
    print_output(output_line)
    output_str = output_str + output_line

    print_output("Write DC order to datalake")

    dm_dc_sql = \
        """
        INSERT INTO vartefact.forecast_simulation_dm_dc_orders
        PARTITION (dm_theme_id)
        SELECT 
          item_id,
          sub_id,
          holding_code,
          theme_start_date,
          theme_end_date,
          npp,
          ppp,
          ppp_start_date,
          ppp_end_date,
          dept_code,
          item_code,
          sub_code,
          pcb,
          ds_supplier_code,
          rotation,
          run_date,
          first_order_date,
          first_delivery_date,
          regular_sales_before_dm,
          four_weeks_after_dm,
          dm_sales,
          dm_order_qty,
          dm_order_qty_without_pcb,
          dm_theme_id
        FROM dm_dc_final
        """.replace("\n", " ")

    # +
    sqlc.sql(dm_dc_sql)

    sqlc.sql("refresh table vartefact.forecast_simulation_dm_dc_orders")

    info_str = info_str + f"Job Finish:{get_current_time()}"
    insert_script_run(date_str, "Success", parameter, output_str, info_str, "",
                      sqlc)

    sc.stop()
    print_output("Job finish")
Example #29
0
def fillspark(hist, df):
    import numpy
    import pyspark.sql.functions as fcns

    import histbook.axis
    import histbook.hist
    import histbook.instr

    indexes = []
    for axis in hist._group + hist._fixed:
        exprcol = tocolumns(df, histbook.instr.totree(axis._parsed))

        if isinstance(axis, histbook.axis.groupby):
            indexes.append(exprcol)

        elif isinstance(axis, histbook.axis.groupbin):
            scaled = (exprcol - float(axis.origin)) * (1.0 /
                                                       float(axis.binwidth))
            if axis.closedlow:
                discretized = fcns.floor(scaled)
            else:
                discretized = fcns.ceil(scaled) - 1
            indexes.append(
                fcns.nanvl(
                    discretized * float(axis.binwidth) + float(axis.origin),
                    fcns.lit("NaN")))

        elif isinstance(axis, histbook.axis.bin):
            scaled = (exprcol -
                      float(axis.low)) * (int(axis.numbins) /
                                          (float(axis.high) - float(axis.low)))
            if axis.closedlow:
                discretized = fcns.floor(scaled) + 1
            else:
                discretized = fcns.ceil(scaled)
            indexes.append(
                fcns.when(
                    fcns.isnull(exprcol) | fcns.isnan(exprcol),
                    int(axis.numbins) + 2).otherwise(
                        fcns.greatest(
                            fcns.lit(0),
                            fcns.least(fcns.lit(int(axis.numbins) + 1),
                                       discretized))))

        elif isinstance(axis, histbook.axis.intbin):
            indexes.append(
                fcns.greatest(
                    fcns.lit(0),
                    fcns.least(fcns.lit(int(axis.max) - int(axis.min) + 1),
                               fcns.round(exprcol - int(axis.min) + 1))))

        elif isinstance(axis, histbook.axis.split):

            def build(x, i):
                if i < len(axis.edges):
                    if axis.closedlow:
                        return build(x.when(exprcol < float(axis.edges[i]), i),
                                     i + 1)
                    else:
                        return build(
                            x.when(exprcol <= float(axis.edges[i]), i), i + 1)
                else:
                    return x.otherwise(i)

            indexes.append(
                build(
                    fcns.when(
                        fcns.isnull(exprcol) | fcns.isnan(exprcol),
                        len(axis.edges) + 1), 0))

        elif isinstance(axis, histbook.axis.cut):
            indexes.append(fcns.when(exprcol, 0).otherwise(1))

        else:
            raise AssertionError(axis)

    aliasnum = [-1]

    def alias(x):
        aliasnum[0] += 1
        return x.alias("@" + str(aliasnum[0]))

    index = alias(fcns.struct(*indexes))

    selectcols = [index]
    if hist._weightoriginal is not None:
        weightcol = tocolumns(df, histbook.instr.totree(hist._weightparsed))
    for axis in hist._profile:
        exprcol = tocolumns(df, histbook.instr.totree(axis._parsed))
        if hist._weightoriginal is None:
            selectcols.append(alias(exprcol))
            selectcols.append(alias(exprcol * exprcol))
        else:
            selectcols.append(alias(exprcol * weightcol))
            selectcols.append(alias(exprcol * exprcol * weightcol))

    if hist._weightoriginal is None:
        df2 = df.select(*selectcols)
    else:
        selectcols.append(alias(weightcol))
        selectcols.append(alias(weightcol * weightcol))
        df2 = df.select(*selectcols)

    aggs = [fcns.sum(df2[n]) for n in df2.columns[1:]]
    if hist._weightoriginal is None:
        aggs.append(fcns.count(df2[df2.columns[0]]))

    def getornew(content, key, nextaxis):
        if key in content:
            return content[key]
        elif isinstance(nextaxis, histbook.axis.GroupAxis):
            return {}
        else:
            return numpy.zeros(hist._shape, dtype=histbook.hist.COUNTTYPE)

    def recurse(index, columns, axis, content):
        if len(axis) == 0:
            content += columns

        elif isinstance(axis[0],
                        (histbook.axis.groupby, histbook.axis.groupbin)):
            content[index[0]] = recurse(
                index[1:], columns, axis[1:],
                getornew(content, index[0],
                         axis[1] if len(axis) > 1 else None))
            if isinstance(axis[0], histbook.axis.groupbin) and None in content:
                content["NaN"] = content[None]
                del content[None]

        elif isinstance(
                axis[0],
            (histbook.axis.bin, histbook.axis.intbin, histbook.axis.split)):
            i = index[0] - (1 if not axis[0].underflow else 0)
            if int(i) < axis[0].totbins:
                recurse(index[1:], columns, axis[1:], content[int(i)])

        elif isinstance(axis[0], histbook.axis.cut):
            recurse(index[1:], columns, axis[1:],
                    content[0 if index[0] else 1])

        else:
            raise AssertionError(axis[0])

        return content

    query = df2.groupBy(df2[df2.columns[0]]).agg(*aggs)

    def wait():
        for row in query.collect():
            recurse(row[0], row[1:], hist._group + hist._fixed, hist._content)

    return wait
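
A small standalone sketch, not using histbook, of the fixed-width binning in the histbook.axis.bin branch above: values are scaled, floored, clamped to the in-range indexes, and NaN/null is routed to a dedicated slot. The axis bounds here are hypothetical.

from pyspark.sql import SparkSession
import pyspark.sql.functions as fcns

spark = SparkSession.builder.master("local[1]").appName("bin_index_sketch").getOrCreate()

df = spark.createDataFrame([(0.05,), (0.42,), (1.7,), (float("nan"),)], ["x"])

low, high, numbins = 0.0, 1.0, 10  # hypothetical closed-low axis

scaled = (df["x"] - low) * (numbins / (high - low))

# Index 0 is underflow, numbins + 1 is overflow, numbins + 2 collects NaN/null,
# mirroring the structure of the bin branch in fillspark.
index = fcns.when(
    fcns.isnull(df["x"]) | fcns.isnan(df["x"]), numbins + 2
).otherwise(
    fcns.greatest(fcns.lit(0),
                  fcns.least(fcns.lit(numbins + 1), fcns.floor(scaled) + 1)))

df.withColumn("bin_index", index).show()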
Example #30
0
from functools import partial

from pyspark.sql import SparkSession
from pyspark.sql.functions import ceil, udf, unix_timestamp
from pyspark.sql.types import ArrayType, FloatType

spark = SparkSession.builder.appName("some_testing2").master("local").getOrCreate()

df = spark.read.format('com.databricks.spark.csv').option("header", "True").option("delimiter", ",")\
                      .load('C:/Users/awagner/Desktop/For_Tom/'+'AllLabData.csv')

df = df.withColumn("X", df["X"].cast("double"))
df = df.withColumn("Y", df["Y"].cast("double"))
df = df.withColumn("Z", df["Z"].cast("double"))
df = df.withColumn("TremorGA", df["TremorGA"].cast("double"))
df = df.withColumn("BradykinesiaGA", df["BradykinesiaGA"].cast("double"))
df = df.withColumn("DyskinesiaGA", df["DyskinesiaGA"].cast("double"))
df = df.withColumn("TSStart", df["TSStart"].cast("timestamp"))
df = df.withColumn("TSEnd", df["TSEnd"].cast("timestamp"))
df = df.withColumn("interval_start", ((ceil(unix_timestamp(df["TSStart"]).cast("long")))%)) 
df = df.withColumn("interval_end", ((ceil(unix_timestamp(df["TSEnd"]).cast("long"))))) 


schema = ArrayType(FloatType(), False)
parse2 = udf(lambda s: eval(str(s)), schema)
find_milisec = udf(lambda raw: (raw)[(raw.find('.')+1):(raw.find('.')+3)])
merge_integers = udf(lambda raw1, raw2: int(str(raw1) + str(raw2)))
df = df.withColumn("temp", find_milisec('TS')) 
df = df.withColumn("interval", (((unix_timestamp(df["TS"]).cast("long"))))) 
df = df.withColumn("interval", merge_integers('interval', 'temp'))



def give_my_key(*args):
    key = 0
Example #31
0
def tocolumns(df, expr):
    import math

    import pyspark.sql.functions as fcns

    import histbook.expr

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)

    elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]

    elif isinstance(expr, histbook.expr.Call):
        if expr.fcn == "abs" or expr.fcn == "fabs":
            return fcns.abs(tocolumns(df, expr.args[0]))
        elif expr.fcn == "max" or expr.fcn == "fmax":
            return fcns.greatest(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "min" or expr.fcn == "fmin":
            return fcns.least(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "arccos":
            return fcns.acos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arccosh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arcsin":
            return fcns.asin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arcsinh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arctan2":
            return fcns.atan2(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "arctan":
            return fcns.atan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arctanh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "ceil":
            return fcns.ceil(tocolumns(df, expr.args[0]))
        elif expr.fcn == "copysign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "cos":
            return fcns.cos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "cosh":
            return fcns.cosh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "rad2deg":
            return tocolumns(df, expr.args[0]) * (180.0 / math.pi)
        elif expr.fcn == "erfc":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "erf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp":
            return fcns.exp(tocolumns(df, expr.args[0]))
        elif expr.fcn == "expm1":
            return fcns.expm1(tocolumns(df, expr.args[0]))
        elif expr.fcn == "factorial":
            return fcns.factorial(tocolumns(df, expr.args[0]))
        elif expr.fcn == "floor":
            return fcns.floor(tocolumns(df, expr.args[0]))
        elif expr.fcn == "fmod":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "gamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "hypot":
            return fcns.hypot(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "isinf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isnan":
            return fcns.isnan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "lgamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "log10":
            return fcns.log10(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log1p":
            return fcns.log1p(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log":
            return fcns.log(tocolumns(df, expr.args[0]))
        elif expr.fcn == "pow":
            return fcns.pow(tocolumns(df, expr.args[0]),
                            tocolumns(df, expr.args[1]))
        elif expr.fcn == "deg2rad":
            return tocolumns(df, expr.args[0]) * (math.pi / 180.0)
        elif expr.fcn == "sinh":
            return fcns.sinh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sin":
            return fcns.sin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sqrt":
            return fcns.sqrt(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tanh":
            return fcns.tanh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tan":
            return fcns.tan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "trunc":
            raise NotImplementedError(
                expr.fcn)  # FIXME (fcns.trunc is for dates)
        elif expr.fcn == "xor":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "conjugate":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "heaviside":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isfinite":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "left_shift" and isinstance(expr.args[1],
                                                     histbook.expr.Const):
            return fcns.shiftLeft(tocolumns(df, expr.args[0]),
                                  expr.args[1].value)
        elif expr.fcn == "log2":
            return fcns.log2(tocolumns(df, expr.args[0]))
        elif expr.fcn == "logaddexp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "logaddexp":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "mod" or expr.fcn == "fmod":
            return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1])
        elif expr.fcn == "right_shift" and isinstance(expr.args[1],
                                                      histbook.expr.Const):
            return fcns.shiftRight(tocolumns(df, expr.args[0]),
                                   expr.args[1].value)
        elif expr.fcn == "rint":
            return fcns.rint(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "where":
            return fcns.when(tocolumns(df, expr.args[0]),
                             tocolumns(df, expr.args[1])).otherwise(
                                 tocolumns(df, expr.args[2]))
        elif expr.fcn == "numpy.equal":
            return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.not_equal":
            return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less":
            return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less_equal":
            return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.isin":
            return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_not":
            return ~tocolumns(df, expr.args[0])
        elif expr.fcn == "numpy.add":
            return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.subtract":
            return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.multiply":
            return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.true_divide":
            return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_or":
            return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_and":
            return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1])
        else:
            raise NotImplementedError(expr.fcn)

    else:
        raise AssertionError(expr)
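
As a quick illustration of the Column trees tocolumns builds, here is a hand-written translation of where(a > b, sqrt(a), abs(b)) using the same "where", "sqrt" and "abs" branches; the DataFrame is made up and histbook's expression parser is not involved.

from pyspark.sql import SparkSession
import pyspark.sql.functions as fcns

spark = SparkSession.builder.master("local[1]").appName("tocolumns_sketch").getOrCreate()
df = spark.createDataFrame([(1.0, -2.0), (4.0, 3.0)], ["a", "b"])

# where(a > b, sqrt(a), abs(b)) expressed directly as a Column tree,
# the same shape tocolumns produces for these three cases.
col = fcns.when(df["a"] > df["b"], fcns.sqrt(df["a"])).otherwise(fcns.abs(df["b"]))
df.withColumn("where_sqrt_abs", col).show()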