Example No. 1
    def test_window_functions_cumulative_sum(self):
        df = self.spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"])
        from pyspark.sql import functions as F

        # Test cumulative sum
        sel = df.select(
            df.key,
            F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding, 0)))
        rs = sorted(sel.collect())
        expected = [("one", 1), ("two", 3)]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])

        # Test boundary values less than JVM's Long.MinValue and make sure we don't overflow
        sel = df.select(
            df.key,
            F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding - 1, 0)))
        rs = sorted(sel.collect())
        expected = [("one", 1), ("two", 3)]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])

        # Test boundary values greater than JVM's Long.MaxValue and make sure we don't overflow
        frame_end = Window.unboundedFollowing + 1
        sel = df.select(
            df.key,
            F.sum(df.value).over(Window.rowsBetween(Window.currentRow, frame_end)))
        rs = sorted(sel.collect())
        expected = [("one", 3), ("two", 2)]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])
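The two boundary checks above rely on frame bounds that fall outside the JVM long range being treated as unbounded rather than overflowing. A minimal standalone sketch of the same cumulative sum, assuming a plain SparkSession named spark instead of the test harness:

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"])
# running total over all rows up to and including the current one
w = Window.orderBy("key").rowsBetween(Window.unboundedPreceding, Window.currentRow)
df.select("key", F.sum("value").over(w).alias("cumulative_sum")).show()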
Example No. 2
    def test_window_functions(self):
        df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        w = Window.partitionBy("value").orderBy("key")
        from pyspark.sql import functions as F

        sel = df.select(
            df.value,
            df.key,
            F.max("key").over(w.rowsBetween(0, 1)),
            F.min("key").over(w.rowsBetween(0, 1)),
            F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
            F.rowNumber().over(w),
            F.rank().over(w),
            F.denseRank().over(w),
            F.ntile(2).over(w),
        )
        rs = sorted(sel.collect())
        expected = [
            ("1", 1, 1, 1, 1, 1, 1, 1, 1),
            ("2", 1, 1, 1, 3, 1, 1, 1, 1),
            ("2", 1, 2, 1, 3, 2, 1, 1, 1),
            ("2", 2, 2, 2, 3, 3, 3, 2, 2),
        ]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[: len(r)])
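This test targets the pre-2.0 API (sqlCtx, F.rowNumber, F.denseRank, float("-inf") bounds). A hedged sketch of the same select against Spark 2.x or later, assuming the same df and window w as above:

sel = df.select(
    df.value,
    df.key,
    F.max("key").over(w.rowsBetween(Window.currentRow, 1)),
    F.min("key").over(w.rowsBetween(Window.currentRow, 1)),
    F.count("key").over(w.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)),
    F.row_number().over(w),
    F.rank().over(w),
    F.dense_rank().over(w),
    F.ntile(2).over(w),
)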
Example No. 3
    def train(self, df, featureCols):
        # compute, for each feature column, the max value observed in each of the n quantile buckets
        ntiles = []
        for col in featureCols:
            w = Window.partitionBy().orderBy(col)
            aux = df.select(F.ntile(self._n).over(w).alias('ntile'), col)
            ntiles.append(list(aux.groupby('ntile').max(col).collect()))

        self.ntiles_ = np.array(ntiles)
        self.columns_ = list(map(str, featureCols))
        self._is_trained = True
Example No. 4
def compute(day):
    # We want days day-30 through day-1
    sums = wikipediadata.where(
            (wikipediadata.day >= day-30) & (wikipediadata.day <= day-1))

    # Test subset
    #sums = sums.where((sums.page == 'Cadillac_Brougham') | ((sums.page == 'Roald_Dahl') & (sums.projectcode == 'fr')))

    # Sum the hourly counts over each day
    sums = sums.groupby('projectcode', 'page', 'day').sum('views')
    # Cache for later use
    sums.cache()

    # define a window := previous day
    window_spec =  Window.partitionBy(sums.projectcode, sums.page) \
            .orderBy(sums.day.asc()).rowsBetween(-1, -1)

    # compute the difference views(d) - views(d-1)
    diffs = sums.withColumn('diff', sums.views - F.sum(sums.views) \
            .over(window_spec))

    # compute the coefficient to apply to each day
    coefs = pd.DataFrame({'day': range(day-30, day)})
    coefs['coef'] = 1. / (day - coefs.day)

    coefs = hc.createDataFrame(coefs)
    diffs = diffs.join(coefs, 'day')

    # compute each day's score
    diffs = diffs.withColumn('sub_score', diffs.diff * diffs.coef)

    totals = diffs.groupby('projectcode', 'page').sum('views', 'sub_score')
    # normalize by the square root of the total views
    totals = totals.withColumn('score',
            totals['SUM(sub_score)'] / F.sqrt(totals['SUM(views)'])) \
            .orderBy(F.desc('score')) \
            .withColumnRenamed('SUM(views)', 'total_views') \
            .limit(10)

    views = sums.select('projectcode', 'page', 'day', 'views') \
           .join(totals.select('projectcode', 'page', 'total_views', 'score'), 
                  (totals.projectcode == sums.projectcode) & (totals.page == sums.page), 'right_outer')

    df = totals.select('projectcode', 'page', 'total_views', 'score').toPandas()
    df2 = views.toPandas()
    df2 = df2.iloc[:, 2:]
    df2 = df2.pivot_table(values='views', columns=['day'], index=['projectcode', 'page'], fill_value=0)
    df = df.merge(df2, left_on=['projectcode', 'page'], right_index=True)
    df.to_csv(filename(day), index=False)
    
    # clear the cache
    hc.clearCache()
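The rowsBetween(-1, -1) frame above sums exactly one row, the previous day, so the day-over-day difference can also be written with a lag. A hedged sketch, assuming the same sums DataFrame and column names used above:

window_prev = Window.partitionBy(sums.projectcode, sums.page).orderBy(sums.day.asc())
diffs = sums.withColumn('diff', sums.views - F.lag(sums.views, 1).over(window_prev))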
Example No. 5
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"]);

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()

    # PySpark 2.1.0 does not support desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last

    # split, length (in PySpark a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount, functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
Example No. 6
                where B.flight in (select flight from (select if(isnull(flight), icao, if(flight="00000000",icao, \
                flight)) as flight, lon, lat, altitude from adsb) as A where (lat > ' + minLat + ' and lat < ' + \
                maxLat + ' and lon > ' + minLon + ' and lon < ' + maxLon + '  and altitude < ' + altitude + ') group \
                by flight) order by UTC')

    adsbTable = adsbTable.dropDuplicates(['flight', 'lon', 'lat'])

    # the next set of lines will handle the scenario where the same flight number arrives at one airport and then
    # departs for another airport; we do not want to treat this as one flight, but rather as multiple flights. For
    # example, if we have flight ACA108 that comes in from Montreal to Toronto and then leaves for Winnipeg, we
    # will have ACA108-0 and ACA108-1
    #
    # if there was just one flight number, then only -0 is appended

    # partition the flight by UTC
    window_flightNum_UTC = Window.partitionBy("flight").orderBy("UTC")

    # calculate the deltaTime between the previous and current coordinate for each flight
    adsbDf=adsbTable.withColumn("deltaTime", psf.unix_timestamp('UTC') - psf.lag(psf.unix_timestamp('UTC')). \
        over(window_flightNum_UTC))

    # create a trigger when the time elapsed from the previous coordinate to the next is more than 1200 seconds
    # (20 minutes); note that this isn't foolproof: if there are gaps in your coverage of 20 minutes or longer,
    # this will break that one flight into two separate flights, so adjust the 1200 seconds below to suit your
    # requirements
    adsbDf1 = adsbDf.withColumn(
        "trigger",
        psf.when(adsbDf.deltaTime > 1200, 1).otherwise(0))

    # create the suffix for the flight by using the cumulative sum of the triggers for each flight
    adsbDf2 = adsbDf1.withColumn(
        "suffix",
        # assumed completion based on the comment above: running total of the triggers per flight
        psf.sum("trigger").over(window_flightNum_UTC))
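    # a hedged follow-up sketch (not part of the original snippet): append the cumulative suffix to the flight
    # number to get identifiers like ACA108-0 and ACA108-1 as described in the comments above
    adsbDf3 = adsbDf2.withColumn(
        "flight",
        psf.concat_ws("-", adsbDf2.flight, adsbDf2.suffix.cast("string")))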
Example No. 7
def main(sentiment_input, user_input, review_input, model_input,
         output_folder):
    # read input files
    df_sentiment = spark.read.csv(sentiment_input, header=True)
    df_user = spark.read.parquet(user_input)
    df_review = spark.read.parquet(review_input)

    # get 50 users
    df_50_users = df_user.limit(50)

    # cross join user and business
    df_usr_bus_all = df_50_users \
                    .crossJoin(df_sentiment) \
                    .where(df_sentiment['ZipCode'].isNull() == False) \
                            .select(
                                df_sentiment['BusinessID'], \
                                df_user['UserID'], \
                                df_user['UserName'], \
                                df_user['ReviewCount'].alias('UserReviewCount'), \
                                df_user['AverageStars'].alias('UserAverageStars'), \
                                functions.lit(0).alias('ReviewStars'), \
                                functions.dayofyear(functions.current_date()).alias('ReviewDayOfYear'), \
                                df_sentiment['Name'].alias('BusinessName'), \
                                df_sentiment['ZipCode'].alias('BusinessPostalCode'), \
                                df_sentiment['ZipCode'].substr(1, 3).alias('BusinessNeighborhood'), \
                                df_sentiment['Latitude'].cast(types.FloatType()), \
                                df_sentiment['Longitude'].cast(types.FloatType()), \
                                df_sentiment['avg_neg'].cast(types.FloatType()).alias('AverageNegative'), \
                                df_sentiment['avg_neu'].cast(types.FloatType()).alias('AverageNeutral'), \
                                df_sentiment['avg_pos'].cast(types.FloatType()).alias('AveragePositive'), \
                                df_sentiment['avg_composite_score'].cast(types.FloatType()).alias('AverageComposite'))

    # left join with reviews
    df_joined = df_usr_bus_all.join(df_review, ['BusinessID', 'UserID'], 'left_outer') \
                            .select(df_review['ReviewID'], \
                                    df_usr_bus_all['BusinessID'], \
                                    df_usr_bus_all['UserID'], \
                                    df_usr_bus_all['UserName'], \
                                    df_usr_bus_all['UserReviewCount'], \
                                    df_usr_bus_all['UserAverageStars'], \
                                    df_usr_bus_all['ReviewStars'], \
                                    df_usr_bus_all['ReviewDayOfYear'], \
                                    df_usr_bus_all['BusinessName'], \
                                    df_usr_bus_all['BusinessPostalCode'], \
                                    df_usr_bus_all['BusinessNeighborhood'], \
                                    df_usr_bus_all['Latitude'], \
                                    df_usr_bus_all['Longitude'], \
                                    df_usr_bus_all['AverageNegative'], \
                                    df_usr_bus_all['AverageNeutral'], \
                                    df_usr_bus_all['AveragePositive'], \
                                    df_usr_bus_all['AverageComposite'])

    # get restaurants that user has not visited
    df_not_visited_rests = df_joined.where(df_joined['ReviewID'].isNull())

    # load the model
    loaded_model = PipelineModel.load(model_input)

    # use the model to make predictions
    predictions = loaded_model.transform(df_not_visited_rests)
    predictions_init = predictions.select(predictions['BusinessID'], \
                                          predictions['BusinessName'], \
                                          predictions['BusinessPostalCode'], \
                                          predictions['BusinessNeighborhood'], \
                                          predictions['UserID'], \
                                          predictions['UserName'], \
                                          predictions['UserReviewCount'], \
                                          predictions['UserAverageStars'], \
                                          predictions['ReviewDayOfYear'], \
                                          predictions['prediction'].alias('PredictedReviewStar'), \
                                          predictions['Latitude'], \
                                          predictions['Longitude'], \
                                          predictions['AverageNegative'], \
                                          predictions['AverageNeutral'], \
                                          predictions['AveragePositive'], \
                                          predictions['AverageComposite'])

    # change scores > 5 to 5 and < 0 to 0
    predictions_final = predictions_init.withColumn('FinalStar', \
                                                        functions.when(predictions_init["PredictedReviewStar"] >= 5, 5) \
                                                        .otherwise(functions.when(predictions_init["PredictedReviewStar"] <= 0, 0) \
                                                        .otherwise(predictions_init['PredictedReviewStar'])))
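    # a hedged alternative sketch for the clamp above, using least/greatest instead of nested when/otherwise:
    # predictions_final = predictions_init.withColumn(
    #     'FinalStar',
    #     functions.least(functions.greatest(predictions_init['PredictedReviewStar'], functions.lit(0)),
    #                     functions.lit(5)))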

    # partition By user
    window = Window.partitionBy(predictions_final['UserID']).orderBy(
        predictions_final['FinalStar'].desc())

    # get top 10 scores for each user based on partition
    prediction_to_save = predictions_final.select(
        '*',
        functions.row_number().over(window).alias('rank')).filter(
            col('rank') <= 10)

    # save predictions to output
    prediction_to_save.coalesce(1).write.csv(output_folder + '/TestModel',
                                             header=True)
Example No. 8

    def growing_range_window(self):
        return Window.partitionBy('id').orderBy('v') \
            .rangeBetween(Window.unboundedPreceding, 4)
Example No. 9

    def sliding_range_window(self):
        return Window.partitionBy('id').orderBy('v').rangeBetween(-2, 4)
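A hedged usage sketch for range windows like the two helpers above, assuming a DataFrame df with columns id and v and pyspark.sql.functions imported as F:

w = Window.partitionBy('id').orderBy('v').rangeBetween(-2, 4)
df.withColumn('sliding_sum', F.sum('v').over(w)).show()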
Example No. 10
def _generate_invoice(sc, sqlContext, **kwargs):
    from transform._invoice_latest import _get_invoice_data
    from transform._prediction_list import _get_models_list, _get_prediction_list
    from properties import FINAL_PREDICTION_LOCATION, _PREDICTION_LOCATION, TESTING, REPARTITION_VAL
    from transform._remainder_addition import _get_remainder

    # Get visit list as an argument
    _visit_list = kwargs.get('visit_list')

    # Get order date as an argument -- used for simulation purposes for now; ideally this should be the current system date
    order_date = kwargs.get('order_date')

    # Filters out the visit list for the provided order_date
    vl_df = _get_visit_list(sc=sc,
                            sqlContext=sqlContext,
                            order_date=order_date,
                            visit_list=_visit_list)

    # # Used to get the authorised material list for a particular 'vkbur'
    # aglu_df = _get_aglu_list(sc=sc, sqlContext=sqlContext)

    vl_df.registerTempTable("vl_sql")

    # # query to join the visit list with authorised material list to obtain a complete set of products for all scheduled visits.
    # # TODO: Clarify if there is any method to isolate the materials for a particular visit

    # aglu_df.select(col('MATNR').alias('mat_no'), col('scl_auth_matlst')).registerTempTable("aglu_sql")

    # q = """
    # select e.*, f.mat_no mat_no
    # from
    # (
    # select b.customernumber customernumber, b.order_date order_date, d.scl_auth_matlst scl_auth_matlst, d.vkbur vkbur, IF(d.vsbed == '01', 2, 1) dlvry_lag
    # from
    # (
    # select a.customernumber customernumber, a.order_date order_date
    # from vl_sql a
    # ) b
    # join
    # (
    # select c.kunnr customernumber, c.scl_auth_matlst scl_auth_matlst, c.vkbur vkbur, c.vsbed vsbed
    # from mdm.customer c
    # ) d
    # on d.customernumber = b.customernumber
    # ) e
    # cross join
    # (
    # select g.mat_no mat_no, g.SCL_AUTH_MATLST scl_auth_matlst
    # from aglu_sql g
    # ) f
    # where e.scl_auth_matlst = f.scl_auth_matlst
    # """

    # Query to join the visit list to the customer master table to obtain delivery lag for a customer

    # # query to join the visit list with authorised material list to obtain a complete set of products for all scheduled visits.
    q = """
    select e.*
    from
    (
    select b.customernumber customernumber, b.mat_no mat_no, b.order_date order_date, d.vkbur vkbur, IF(d.vsbed == '01', 2, 1) dlvry_lag
    from
    (
    select a.customernumber customernumber, a.mat_no mat_no, a.order_date order_date
    from vl_sql a
    ) b
    join
    (
    select c.kunnr customernumber, c.vkbur vkbur, c.vsbed vsbed
    from mdm.customer c
    ) d
    on d.customernumber = b.customernumber
    ) e
    """

    # # Obtain delivery_date from the given order_date using the delivery lag (dlvry_lag)
    visit_list_final = sqlContext.sql(q) \
        .repartition(REPARTITION_VAL) \
        .withColumn('delivery_date', udf(_get_delivery_date, StringType())(col('order_date'), col('dlvry_lag'))) \
        .drop(col('dlvry_lag'))

    print("Visit List Final")
    # visit_list_final.show()

    #
    invoice_raw = _get_invoice_data(sqlContext=sqlContext,
                                    CRITERIA_DATE=get_criteria_date(order_date=order_date)) \
        .repartition(REPARTITION_VAL)

    print("Invoice_raw")
    # invoice_raw.show()

    visit_invoice_condition = [
        visit_list_final.customernumber == invoice_raw.customernumber,
        visit_list_final.mat_no == invoice_raw.mat_no
    ]

    visit_list_final_join_invoice_raw = visit_list_final \
        .join(invoice_raw, on=visit_invoice_condition, how='inner') \
        .repartition(REPARTITION_VAL) \
        .select(visit_list_final.customernumber,
                visit_list_final.mat_no,
                visit_list_final.order_date,
                visit_list_final.vkbur,
                visit_list_final.delivery_date,
                invoice_raw.last_delivery_date) \
        .withColumn('mod_last_delivery_date', udf(_get_tweaked_last_del_dt, StringType())(col('last_delivery_date'))) \
        .drop(col('last_delivery_date'))

    print("Visit list final join invoice raw")
    # visit_list_final_join_invoice_raw.show()
    # print(visit_list_final_join_invoice_raw.count())
    # visit_list_final_join_invoice_raw.printSchema()

    models_data_raw = _get_models_list(sc=sc, sqlContext=sqlContext, CRITERIA_DATE=order_date,
                                       testing=TESTING) \
        .repartition(REPARTITION_VAL)
    print("Prediction Data")
    # models_data_raw.show()
    # print(models_data_raw.count())
    # models_data_raw.printSchema()

    visit_pred_condition = [
        visit_list_final_join_invoice_raw.customernumber ==
        models_data_raw.customernumber,
        visit_list_final_join_invoice_raw.mat_no == models_data_raw.mat_no
    ]

    _final_df_stage = visit_list_final_join_invoice_raw \
        .join(models_data_raw, on=visit_pred_condition, how='inner') \
        .repartition(REPARTITION_VAL) \
        .select(visit_list_final_join_invoice_raw.customernumber,
                visit_list_final_join_invoice_raw.mat_no,
                visit_list_final_join_invoice_raw.order_date,
                visit_list_final_join_invoice_raw.vkbur,
                visit_list_final_join_invoice_raw.delivery_date,
                visit_list_final_join_invoice_raw.mod_last_delivery_date,
                models_data_raw.mdl_bld_dt,
                models_data_raw.cutoff_date
                )

    print("Final_df_stage")
    # _final_df_stage.show()

    ########################################
    from pyspark.sql.window import Window
    import sys
    ########################################

    cust_pdt_mdl_bld_dt_window = Window \
        .partitionBy(_final_df_stage.customernumber, _final_df_stage.mat_no) \
        .orderBy(_final_df_stage.mdl_bld_dt) \
        .rangeBetween(-sys.maxsize, sys.maxsize)
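    # note: rangeBetween(-sys.maxsize, sys.maxsize) is the pre-2.1 idiom for an effectively unbounded frame;
    # an assumed equivalent sketch using the named constants available in newer PySpark:
    # cust_pdt_mdl_bld_dt_window = Window \
    #     .partitionBy(_final_df_stage.customernumber, _final_df_stage.mat_no) \
    #     .orderBy(_final_df_stage.mdl_bld_dt) \
    #     .rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)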

    # valid_model_flag = (udf(_is_model_valid, BooleanType())(col('delivery_date'), col('mod_last_delivery_date'), col('mdl_bld_dt'), max(col('mdl_bld_dt').filter(lambda _date: _date < col('mod_last_delivery_date'))).alias('min_mdl_bld_dt')))

    # # TODO: Update 1 here to account for cutoff_date modification as discussed.
    _final_df = _final_df_stage \
        .repartition(REPARTITION_VAL) \
        .select(col('customernumber'),
                col('mat_no'),
                col('order_date'),
                col('vkbur'),
                col('delivery_date'),
                col('mod_last_delivery_date'),
                col('mdl_bld_dt'),
                col('cutoff_date'),
                (max(udf(_get_max_date_spprt_func_1, DateType())(col('mdl_bld_dt'),
                                                                 col('mod_last_delivery_date')))).over(
                    window=cust_pdt_mdl_bld_dt_window).cast(StringType()).alias('min_mdl_bld_dt')
                ) \
        .withColumn('mdl_validity',
                    udf(_is_model_valid, BooleanType())(col('delivery_date'), col('mod_last_delivery_date'),
                                                        col('mdl_bld_dt'), col('min_mdl_bld_dt'))) \
        .filter(col('mdl_validity') == True) \
        .select(col('customernumber'),
                col('mat_no'),
                col('order_date'),
                col('vkbur'),
                col('delivery_date'),
                col('mod_last_delivery_date'),
                col('mdl_bld_dt'),
                col('cutoff_date'),
                (max(udf(string_to_gregorian, DateType())(col('cutoff_date')))).over(
                    window=Window.partitionBy("customernumber", "mat_no", "mdl_bld_dt").orderBy(
                        "cutoff_date").rangeBetween(-sys.maxsize, sys.maxsize)).cast(
                    StringType()).alias('latest_cutoff_date')
                ) \
        .filter(col('cutoff_date') == col('latest_cutoff_date'))

    print("Final_df")
    # _final_df.show()

    _prediction_df_raw = _get_prediction_list(sqlContext=sqlContext, testing=TESTING) \
        .repartition(REPARTITION_VAL)

    print("prediction_df_raw")
    # _prediction_df_raw.show()

    _final_df_prediction_df_raw_condition = [
        _final_df.customernumber == _prediction_df_raw.customernumber,
        _final_df.mat_no == _prediction_df_raw.mat_no,
        _final_df.mdl_bld_dt == _prediction_df_raw.mdl_bld_dt,
        _final_df.cutoff_date == _prediction_df_raw.cutoff_date
    ]

    _temp_df = _prediction_df_raw \
        .join(_final_df, on=_final_df_prediction_df_raw_condition, how='inner') \
        .repartition(REPARTITION_VAL) \
        .select(_prediction_df_raw.customernumber,
                _prediction_df_raw.mat_no,
                _prediction_df_raw.pred_val,
                _prediction_df_raw.pdt_cat,
                _final_df.order_date.cast(StringType()),
                _final_df.delivery_date,
                _final_df.mod_last_delivery_date,
                _final_df.mdl_bld_dt,
                _final_df.cutoff_date
                ) \
        .withColumn('scale_denom', when(col('pdt_cat').isin('I', 'II', 'III', 'VII'), 7).otherwise(31))

    print("_temp_df")
    # _temp_df.show()
    # _temp_df.printSchema()

    _temp_df_rdd_mapped = _temp_df.flatMap(
        lambda _row: map_pred_val_to_week_month_year(_row))
    # _temp_df_rdd_mapped = _temp_df.map(lambda _row: _row)

    # print _temp_df_rdd_mapped.take(1)

    # cust_pdt_week_month_window = Window \
    #     .partitionBy(_final_df_stage.customernumber, _final_df_stage.mat_no) \
    #     .orderBy(_final_df_stage.mdl_bld_dt) \
    #     .rangeBetween(-sys.maxsize, sys.maxsize)

    # # TODO: Update 2 here to account for cutoff_date modification as discussed.
    _temp_df_flat = sqlContext.createDataFrame(_temp_df_rdd_mapped, schema=_pred_val_to_week_month_year_schema()) \
        .repartition(REPARTITION_VAL) \
        .select(col('customernumber'),
                col('mat_no'),
                col('week_month_key'),
                col('pred_val'),
                col('pdt_cat'),
                col('delivery_date'),
                col('mod_last_delivery_date').alias('last_delivery_date'),
                col('mdl_bld_dt'),
                col('cutoff_date'),
                col('scale_denom'),
                (max(udf(string_to_gregorian, DateType())(col('mdl_bld_dt')))).over(
                    window=Window.partitionBy("customernumber", "mat_no", "week_month_key").orderBy(
                        "mdl_bld_dt").rangeBetween(-sys.maxsize, sys.maxsize)).cast(
                    StringType()).alias('updt_mdl_bld_dt')
                ) \
        .filter(col('mdl_bld_dt') == col('updt_mdl_bld_dt')) \
        .withColumn('_partial_diff', udf(_get_partial_date_diff, IntegerType())(col('week_month_key'), col('pdt_cat'),
                                                                                col('delivery_date'),
                                                                                col('last_delivery_date'))) \
        .withColumn('_partial_quantity_temp', (col('pred_val') / col('scale_denom')) * col('_partial_diff')) \
        .withColumn('_partial_quantity', round(col('_partial_quantity_temp'), 2)) \
        .withColumn('identification_params', udf(_get_string_from_cols, StringType())(col('week_month_key'),
                                                                                      col('pred_val'),
                                                                                      col('pdt_cat'),
                                                                                      col('mdl_bld_dt'),
                                                                                      col('cutoff_date'),
                                                                                      col('_partial_diff'),
                                                                                      col('_partial_quantity'))) \
        .drop(col('_partial_quantity_temp')) \
        .drop(col('mdl_bld_dt'))

    print("temp_df_flat")
    # _temp_df_flat.show()

    _remainder_df = _get_remainder(sqlContext=sqlContext, testing=TESTING) \
        .repartition(REPARTITION_VAL)

    print("remainder_df")
    # _remainder_df.show()

    print("result_df_stage")
    result_df_stage = _temp_df_flat \
        .repartition(REPARTITION_VAL) \
        .select(col('customernumber'), col('mat_no'), col('delivery_date'), col('_partial_quantity'),
                col('identification_params')) \
        .withColumn('order_date', lit(order_date)) \
        .groupBy(col('customernumber'), col('mat_no'), col('order_date'), col('delivery_date')) \
        .agg(
        round(sum(col('_partial_quantity')), 2).alias('quantity_stg'),
        collect_list(col('identification_params')).alias('identification_param_set')
    )

    # result_df_stage.show(5)
    # result_df_stage.printSchema()

    print("Writing raw invoice to HDFS")
    result_df_stage \
        .write.mode('append') \
        .format('orc') \
        .option("header", "false") \
        .save(_PREDICTION_LOCATION)

    result_df_stage_remainder_df_cond = [
        result_df_stage.customernumber == _remainder_df.customernumber,
        result_df_stage.mat_no == _remainder_df.mat_no
    ]

    result_df = result_df_stage \
        .join(_remainder_df, on=result_df_stage_remainder_df_cond, how='left_outer') \
        .repartition(REPARTITION_VAL) \
        .select(result_df_stage.customernumber,
                result_df_stage.mat_no,
                result_df_stage.order_date,
                result_df_stage.delivery_date,
                result_df_stage.quantity_stg,
                _remainder_df.remainder.alias('carryover')
                ) \
        .na.fill(0.0) \
        .withColumn('quantity_temp', round((col('quantity_stg') + col('carryover')), 2).cast(FloatType())) \
        .withColumn('quantity', floor(col('quantity_temp')).cast(FloatType())) \
        .withColumn('remainder', round((col('quantity_temp') - col('quantity')), 2).cast(FloatType())) \
        .select(col('customernumber'), col('mat_no'), col('order_date'), col('delivery_date'), col('quantity'),
                col('remainder'))

    # .withColumn('carryover', when(col('carryover_stg') == None, 0.0).otherwise(col('carryover_stg')).cast(FloatType()))\

    # .drop(col('carryover_stg')) \

    print("result_df")
    # result_df.cache()
    # result_df.show()
    # print(result_df.count())

    print("Writing invoice to HDFS")
    result_df \
        .write.mode('append') \
        .format('orc') \
        .option("header", "false") \
        .save(FINAL_PREDICTION_LOCATION)
Example No. 11
def main():
    "Main function"
    optmgr  = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if not opts.yarn:
        sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)

    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema = schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        files = getFileList(opts.basedir, fromdate, todate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files))
        print(msg)

        if not files:
            return
        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(file_path, schema = schema_def) \
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parsing additional data (to given data adding: group name, node kind, acquisition era, data tier, now date)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"	
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    groupf = udf(lambda x: groupdic[x], StringType())
    nodef = udf(lambda x: nodedic[x], StringType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
         .withColumn("node_kind", nodef(pdf.node_id)) \
         .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \
         .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
        .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))

    # print dataframe schema
    if opts.verbose:
        ndf.show()
        print("pdf data type", type(ndf))
        ndf.printSchema()

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc, filtv = opts.filt.split(":") if opts.filt else (None,None)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    if filtc and filtv:
        ndf = ndf.filter(getattr(ndf, filtc) == filtv)

    # if delta aggregation is used
    if DELTA in aggregations:
        validateDeltaParam(opts.interval, results)			
        result = results[0]

        #1 for all dates generate interval group dictionary
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())

        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())		
        interval_end = udf(lambda x: boundic[x][1], StringType())

        #2 group data by block, node, interval and last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc())	
        idf = idf.withColumn("row_number", rowNumber().over(win))
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group), getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))
        rdf.cache()

        #3 create intervals that do not exist but have a negative delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        #4 join data frames
        idf = rdf.unionAll(hdf)
		
        #3 join every interval with previous interval
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        #5 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf =fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))

        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\
                                                                    sum(ddf.delta_minus).alias("delta_minus"))

        aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"), aggres.delta_plus, aggres.delta_minus)
		
    else:	
        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)

    # output results
    if opts.fout:
        fout_header = formFileHeader(opts.fout)
        if opts.header:
            aggres.write.format('com.databricks.spark.csv').options(header = 'true').save(fout_header)
        else:
            aggres.write.format('com.databricks.spark.csv').save(fout_header)
    else:
        aggres.show(50)
Example No. 12
    def ordered_window(self):
        return Window.partitionBy('id').orderBy('v')
Example No. 13
    def unpartitioned_window(self):
        return Window.partitionBy()
Example No. 14
    def ordered_window(self):
        return Window.partitionBy('id').orderBy('v')
Example No. 15
    def unbounded_window(self):
        return Window.partitionBy('id') \
            .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
Example No. 16
def main(inputFile, outputFile):
    def getItemName(payload):
        payload = json.loads(payload)
        return payload['item_name'].split('.')[0]

    item_udf = udf(lambda payload: getItemName(payload))
    df = spark.read.parquet(inputFile+'/*')

    # DF for Item Play Started
    start = df.filter(col('event').isin(['ITEM_PLAY_STARTED']))
    start = start.withColumn('Content Name', item_udf(start['payload']))  # get content name
    start = start.withColumn('Time Start', df['time'].cast("timestamp"))  # turn the Time Start string into an easy-to-use timestamp object
    cols = ["device_id", "Content Name", "Time Start"]  # select just the columns we need
    start = start.select(*cols)
    
    def getEndOfStream(payload):
        payload = json.loads(payload)
        if "did_reach_end_of_stream" in payload:
            return payload['did_reach_end_of_stream']
        else:
            return "false"

    stream_end_UDF = udf(lambda x: getEndOfStream(x))
        
    # DF for Item Play Finished
    finished = df.filter(col('event').isin(['ITEM_PLAY_FINISHED']))
    finished = finished.withColumn('Content Name', item_udf(finished['payload']))  # get content name
    finished = finished.withColumn('reach_end_of_stream', stream_end_UDF(finished['payload']))  # get did_reach_end_of_stream
    finished = finished.withColumn("reach_end_of_stream", F.trim(col("reach_end_of_stream")))  # get rid of whitespace

    # convert "true"/"false" strings to actual boolean values
    finished = finished.withColumn(
        'reach_end_of_stream',
        F.when(F.col("reach_end_of_stream") == "true", True).otherwise(False)
    )

    # turn the Time End string into an easy-to-use timestamp object
    finished = finished.withColumn('Time End', df['time'].cast("timestamp"))
    # select just the columns we need
    cols = ["device_id", "Content Name", "Time End", "reach_end_of_stream"]
    finished = finished.select(*cols)

    # combine the two dataframes for our transformed schema
    transformed = start.join(finished, on=["device_id", "Content Name"], how='left_outer')
    
    # Make sure Time Start is before Time End
    transformed = transformed.where(col("Time Start") <= col("Time End"))
    
    #Convert time stamps to unix
    #transformed = transformed.withColumn('Time Start', F.unix_timestamp('Time Start'))
    #transformed = transformed.withColumn('Time End', F.unix_timestamp('Time End'))
    
    #Get correct Time Ends
    def getEndTime(end_time_list):
        return end_time_list[0]
    
    end_time_udf = udf(lambda end_time_list: end_time_list[0], T.TimestampType())
    df = transformed.withColumn("end_time_list", F.collect_list("Time End").over(Window.partitionBy("device_id",'Content Name','Time Start', "reach_end_of_stream").orderBy('Time End')))
    df = df.groupBy('device_id','Time Start','Content Name', "reach_end_of_stream").agg(F.max('end_time_list').alias('end_time_list'))
    # still gets laggy here: the udf takes the first item of the list (i.e. the smallest datetime)
    df = df.withColumn('Time End', end_time_udf("end_time_list"))
    df = df.drop('end_time_list')
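    # a hedged alternative sketch for the three steps above: the earliest Time End per group can be taken
    # directly with an aggregate, avoiding the collect_list + udf round-trip
    # df = transformed.groupBy('device_id', 'Time Start', 'Content Name', 'reach_end_of_stream') \
    #     .agg(F.min('Time End').alias('Time End'))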
    
    #rename columns + reorder
    df = df.withColumnRenamed("Time Start", "start").withColumnRenamed("Time End", "end").withColumnRenamed("Content Name", "item_name")
    df = df.select("device_id", "item_name", "start", "end", "reach_end_of_stream")

    df.write.parquet(outputFile) # Write onto output Parquet
Example No. 17
    def predict_using_cosine_similarity(self,
                                        input_df,
                                        predict_course_df=None):
        # preprocessed input data
        # print("begin predict using cosine similarity")
        encoded_df = self.indexer_model.transform(input_df)
        normalize_rating_udf = udf(lambda p: 0.0
                                   if p > 10 else p, DoubleType())
        encoded_df = encoded_df.withColumn(
            self.rating_col_name,
            normalize_rating_udf(encoded_df[self.rating_col_name]))

        # get predict course df (remaining course)
        if predict_course_df is None:
            predict_course_df_predict = encoded_df.join(self.item_index_df,
                                                        encoded_df[self.item_col_name_index] != self.item_index_df[
                                                            self.item_col_name_index]) \
                .select(self.user_col_name, self.item_col_name_index)
        else:
            predict_course_df = self.indexer_model.transform(predict_course_df)
            predict_course_df_predict = predict_course_df.drop(
                self.rating_col_name)

        # get all value that can participate in evaluate final score
        similarity_score_df = encoded_df.join(self.item_similarity,
                                              encoded_df[self.item_col_name_index] == self.item_similarity['id1']) \
            .select(self.user_col_name, self.rating_col_name, 'id1', 'id2', 'similarity')
            # .withColumnRenamed(self.user_col_name, "user_name_similarity")

        #                encoded_df[self.item_col_name_index] == self.item_similarity['id2']) # can delete this part if allow duplicate id1,id2

        # def predict(student, course, similarity_score_df):
        #     # get first 5 course the student already attended which are the most relevant to the current course
        #     relevant_df = similarity_score_df.filter(similarity_score_df[self.user_col_name] == student and
        #                                              similarity_score_df['id2'] == course) \
        #         .orderBy('similarity', ascending=False) \
        #         .head(5)
        #     relevant_df = relevant_df.withColumn('score', relevant_df[self.rating_col_name] * relevant_df['similarity'])
        #     return relevant_df.select(spark_func.avg(relevant_df['score']).alias('avg')).collect()[0][
        #         'avg']  # need to check again if avg is enough
        def predict(list_score, list_similarity):
            sum_simi = sum(list_similarity)
            if sum_simi == 0:
                return 0.0
            return sum([
                list_score[i] * list_similarity[i]
                for i in range(len(list_score))
            ]) / sum_simi
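        # worked example for predict (assumed values): scores [4.0, 2.0] with similarities [0.5, 0.25]
        # give (4.0 * 0.5 + 2.0 * 0.25) / (0.5 + 0.25) = 2.5 / 0.75 ≈ 3.33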

        predict_udf = udf(predict, DoubleType())
        window = Window.partitionBy([
            spark_func.col(self.user_col_name),
            spark_func.col(self.item_col_name_index)
        ]).orderBy(spark_func.col('similarity').desc())

        predict_course_df_predict = predict_course_df_predict.join(
            similarity_score_df.withColumnRenamed("id2", self.item_col_name_index),
            [self.item_col_name_index, self.user_col_name]) \
            .select("*", spark_func.rank().over(window).alias("rank")) \
            .filter(spark_func.col("rank") <= 7).groupby(self.user_col_name, self.item_col_name_index) \
            .agg(spark_func.collect_list(self.rating_col_name).alias("list_score"),
                 spark_func.collect_list("similarity").alias("list_similarity"))
        predict_course_df_predict = predict_course_df_predict.withColumn(
            "prediction",
            predict_udf(spark_func.col("list_score"),
                        spark_func.col("list_similarity")))

        if predict_course_df is not None and self.rating_col_name in predict_course_df.columns:
            predict_course_df_predict = predict_course_df_predict.join(
                predict_course_df,
                [self.user_col_name, self.item_col_name_index])

        return predict_course_df_predict
Example No. 18
# Retrieve the types matrix as a dataframe
#df_types=dtf.typesToDf()
#Spark DataFrame
#df_types=spark.createDataFrame(df_types)
# Retrieve the similarity matrix as a dataframe
#df_similarities=dtf.similaritiesToDf()
#Spark DataFrame
#df_similarities=spark.createDataFrame(df_similarities)
# Retrieve the placeTypes matrix as a dataframe
#df_placeTypes=dtf.placeTypesToDf()
#Spark DataFrame
#df_placeTypes=spark.createDataFrame(df_placeTypes)
# Retrieve the placesSimilarity matrix as a dataframe
#Spark DataFrame
#placesSimilarities=spark.read.format('csv').option('header', 'true').load('../data/placesSimilarities.csv')
windowSpec = Window.orderBy("ind")
df_cities = dtf.citiesToDf()
df_types = dtf.typesToDf()
df_placeTypes = dtf.placeTypesToDf()
df_similarities = dtf.similaritiesToDf()
df_places = dtf.placesToDf()
sc_places = spark.createDataFrame(df_places)
user_tags = []
d = {'Visits': []}
df_visits = pd.DataFrame(data=d)
d = {'City_id': []}
df_city = pd.DataFrame(data=d)
df_types = dtf.typesToDf()
#n=len(tab_tags)
n_pT = len(df_placeTypes)
# Add the city_id values to the matrix
Example No. 19
def preprocess(spark, file_path, percent=None):
    '''
    Split data into train, val and test sets.
    file_path: the directory from which we read the preprocessed parquet file
               and to which we write the split parquet files,
               e.g., hdfs:/user/ym1970/.
    percent: the downsampling ratio, such as 1, 5 or 25 as suggested in the instructions.
    '''

    df = spark.read.parquet(file_path +
                            'interactions_preprocessed_repa.parquet')
    print('Start downsampling...')
    if percent:
        df_downsample_id, _ = [
            i for i in df.select('user').distinct().randomSplit(
                [percent / 100, 1 - percent / 100], 123)
        ]
        df = df.join(df_downsample_id, 'user', 'left_semi')

    print('Start splitting...')

    df_train_id, df_val_id, df_test_id = [
        i
        for i in df.select('user').distinct().randomSplit([0.6, 0.2, 0.2], 123)
    ]

    print('Select records based on user id...')
    df_train = df.join(df_train_id, 'user', 'left_semi')
    df_val = df.join(df_val_id, 'user', 'left_semi')
    df_test = df.join(df_test_id, 'user', 'left_semi')

    print('Sample user id to be moved to train...')
    window = Window.partitionBy('user').orderBy('item')
    df_val_window = (df_val.select(
        'user', 'item', 'rating',
        F.row_number().over(window).alias('row_number')))
    df_test_window = (df_test.select(
        'user', 'item', 'rating',
        F.row_number().over(window).alias('row_number')))

    print('Move to train...')
    df_val_to_train = df_val_window.filter(
        df_val_window.row_number % 2 == 1).select('user', 'item', 'rating')
    df_test_to_train = df_test_window.filter(
        df_test_window.row_number % 2 == 1).select('user', 'item', 'rating')

    df_train = df_train.union(df_val_to_train).union(df_test_to_train)

    df_val = df_val.subtract(df_val_to_train)
    df_test = df_test.subtract(df_test_to_train)

    print('Delete based on book id...')
    df_val = df_val.join(df_train, 'item', 'left_semi')
    df_test = df_test.join(df_train, 'item', 'left_semi')

    print('Write to parquet...')
    df_train.write.parquet(file_path + 'interactions_train_' + str(percent) +
                           '.parquet')
    df_val.write.parquet(file_path + 'interactions_val_' + str(percent) +
                         '.parquet')
    df_test.write.parquet(file_path + 'interactions_test_' + str(percent) +
                          '.parquet')
Example No. 20
    def sliding_range_window(self):
        return Window.partitionBy('id').orderBy('v').rangeBetween(-2, 4)
Example No. 21

def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    is_dev = True

    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)

    #------------------------------------------------------------------------------------------------------------------#

    # ------------------------------------------------------------------------------------------------------------------#

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_status_code, transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_status_code is not None:
            text_concat += str(student_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = f.udf(concaText, StringType())

    def convertStudentIdToLong(student_id):
        try:
            student_id_long = long(student_id)
            return student_id_long
        except:
            return 0L

    convertStudentIdToLong = f.udf(convertStudentIdToLong, LongType())
    # ------------------------------------------------------------------------------------------------------------------#

    ##################################################
    # Get the technical test data from the student_technical_test table
    dyf_datasourceTech = glueContext.create_dynamic_frame.from_catalog(
        database="dm_toa", table_name="student_technical_test_odin")

    # print('dyf_datasourceTech')
    # dyf_datasourceTech.printSchema()

    # Select the required fields
    dyf_datasourceTech = dyf_datasourceTech.select_fields([
        '_key', 'thoigianhenktkt', 'ketluan', 'emailhocvien', 'dauthoigian',
        'emailadvisor', 'nguoitiepnhan', 'trinhdohocvien'
    ])

    dyf_datasourceTech = dyf_datasourceTech.resolveChoice(
        specs=[('_key', 'cast:long')])

    if (dyf_datasourceTech.count() > 0):
        dyf_datasourceTech = Filter.apply(
            frame=dyf_datasourceTech,
            f=lambda x: x["emailhocvien"] is not None and x["emailhocvien"] !=
            '' and x["thoigianhenktkt"] is not None and x[
                "thoigianhenktkt"] != '' and x["ketluan"] == 'Pass')

        dyf_datasourceTech_numeber = dyf_datasourceTech.count()
        print("Count data 2:  ", dyf_datasourceTech_numeber)

        if dyf_datasourceTech_numeber < 1:
            return

        dy_datasourceTech = dyf_datasourceTech.toDF()

        dy_datasourceTech = dy_datasourceTech.limit(100)

        print('dy_datasourceTech')
        dy_datasourceTech.printSchema()
        dy_datasourceTech = dy_datasourceTech.withColumn(
            'thoigianhenktkt_id',
            f.unix_timestamp('thoigianhenktkt',
                             'yyyy-MM-dd HH:mm:ss').cast('long'))

        print('dy_datasourceTech__2')
        dy_datasourceTech.printSchema()

        # get the first activation time
        w2 = Window.partitionBy("emailhocvien").orderBy(
            f.col("thoigianhenktkt_id").desc())
        dy_datasourceTech = dy_datasourceTech.withColumn("row", f.row_number().over(w2)) \
            .where(f.col('row') <= 1) \

        print('dy_datasourceTech__3')
        dy_datasourceTech.printSchema()

        #--------------------------------------------------------------------------------------------------------------#
        dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="student_contact")

        # select the fields
        dyf_student_contact = dyf_student_contact.select_fields(
            ['_key', 'contact_id', 'student_id', 'user_name'])

        dyf_student_contact = Filter.apply(
            frame=dyf_student_contact,
            f=lambda x: x["contact_id"] is not None and x["contact_id"] != ''
            and x["student_id"] is not None and x["student_id"] != '' and x[
                "user_name"] is not None and x["user_name"] != '')

        dyf_student_contact_number = dyf_student_contact.count()
        print('dyf_student_contact_number::number: ',
              dyf_student_contact_number)
        if dyf_student_contact_number < 1:
            return

        dy_student_contact = dyf_student_contact.toDF()
        dy_student_contact.dropDuplicates(['student_id'])

        dy_join_teach_concat = dy_datasourceTech.join(
            dy_student_contact,
            dy_datasourceTech.emailhocvien == dy_student_contact.user_name)

        print('dyf_join_teach_concat::schema')
        dy_join_teach_concat.printSchema()

        join_teach_concat_number = dy_join_teach_concat.count()
        print('join_teach_concat_number::number: ', join_teach_concat_number)
        if join_teach_concat_number < 1:
            return

        #--------------------------------------------------------------------------------------------------------------#

        dyf_student_package_status = glueContext.create_dynamic_frame.from_catalog(
            database="od_student_behavior", table_name="student_status")

        dyf_student_package_status = dyf_student_package_status\
            .select_fields(['contact_id', 'status_code', 'start_date', 'end_date'])\
            .rename_field('contact_id', 'contact_id_ps')

        print('dyf_student_package_status::drop_duplicates')

        df_student_package_status = dyf_student_package_status.toDF()
        print('dyf_student_package_status::drop_duplicates::before: ',
              df_student_package_status.count())
        df_student_package_status = df_student_package_status.drop_duplicates()
        print('dyf_student_package_status::drop_duplicates::after: ',
              df_student_package_status.count())

        print('dy_student_package_status')
        df_student_package_status.printSchema()
        # --------------------------------------------------------------------------------------------------------------#
        dyf_student_package = glueContext.create_dynamic_frame.from_catalog(
            database="od_student_behavior", table_name="student_package")

        print('dyf_student_package__0')
        dyf_student_package.printSchema()


        dyf_student_package = dyf_student_package \
            .select_fields(['student_id', 'package_code', 'start_time', 'end_time'])\
            .rename_field('student_id', 'student_id_pk')

        # --------------------------------------------------------------------------------------------------------------#

        print('dyf_student_package__1')
        dyf_student_package.printSchema()

        dyf_student_package = dyf_student_package.resolveChoice(
            specs=[('start_time', 'cast:long'), ('end_time', 'cast:long')])

        print('dyf_student_package__2')
        dyf_student_package.printSchema()

        df_student_package = dyf_student_package.toDF()
        print('df_student_package::drop_duplicates::before: ',
              df_student_package.count())
        df_student_package = df_student_package.drop_duplicates()
        print('df_student_package::drop_duplicates::after: ',
              df_student_package.count())

        print('df_student_package')
        df_student_package.printSchema()
        df_student_package.show(3)

        df_student_package_number = df_student_package.count()
        print('df_student_package_number: ', df_student_package_number)

        # --------------------------------------------------------------------------------------------------------------#

        # --------------------------------------------------------------------------------------------------------------#

        dy_join_teach_concat_number = dy_join_teach_concat.count()
        print('dy_join_teach_concat_number: ', dy_join_teach_concat_number)

        join_result = dy_join_teach_concat\
            .join(df_student_package_status,
                 (dy_join_teach_concat.contact_id == df_student_package_status.contact_id_ps)
                 & (dy_join_teach_concat.thoigianhenktkt_id >= df_student_package_status.start_date)
                 & (dy_join_teach_concat.thoigianhenktkt_id < df_student_package_status.end_date),
                 'left'
                  )\
            .join(df_student_package,
                  (dy_join_teach_concat.student_id == df_student_package.student_id_pk)
                  &(dy_join_teach_concat.thoigianhenktkt_id >= df_student_package.start_time)
                  &(dy_join_teach_concat.thoigianhenktkt_id < df_student_package.end_time),
                  'left'
                  )

        print('join_result')
        join_result.printSchema()

        join_result_number = join_result.count()
        print('join_result_number: ', join_result_number)
        if join_result_number < 1:
            return
        join_result.show(3)

        student_id_unavailable = 0L
        package_endtime_unavailable = 99999999999L
        package_starttime_unavailable = 0L
        package_code_unavailable = 'UNAVAILABLE'
        student_level_code_unavailable = 'UNAVAILABLE'
        student_status_code_unavailable = 'UNAVAILABLE'
        measure1_unavailable = 0
        measure2_unavailable = 0
        measure3_unavailable = 0
        measure4_unavailable = float(0.0)

        # join_result = join_result.withColumnRenamed('student_id', 'student_id_a')

        join_result = join_result.select(
            join_result.thoigianhenktkt_id.alias('student_behavior_date'),
            f.lit(5L).alias('behavior_id'),
            'student_id',
            join_result.contact_id.alias('contact_id'),
            join_result.package_code.alias('package_code'),
            join_result.end_time.cast('long').alias('package_endtime'),
            join_result.start_time.cast('long').alias('package_starttime'),
            join_result.trinhdohocvien.cast('string').alias(
                'student_level_code'),
            join_result.status_code.cast('string').alias(
                'student_status_code'),
            f.lit(today_second).cast('long').alias('transformed_at'),
        )

        join_result = join_result.na.fill({
            'package_code': package_code_unavailable,
            'package_endtime': package_endtime_unavailable,
            'package_starttime': package_starttime_unavailable,
            'student_level_code': student_level_code_unavailable,
            'student_status_code': student_status_code_unavailable
        })

        print('join_result--1')
        join_result.printSchema()
        join_result.show(1)

        join_result = join_result.withColumn(
            'student_behavior_id',
            f.md5(
                concaText(join_result.student_behavior_date,
                          join_result.behavior_id, join_result.student_id,
                          join_result.contact_id, join_result.package_code,
                          join_result.package_endtime,
                          join_result.package_starttime,
                          join_result.student_level_code,
                          join_result.student_status_code,
                          join_result.transformed_at)))
        #
        print('join_result--2')
        join_result.printSchema()
        join_result.show(5)

        dyf_join_result = DynamicFrame.fromDF(join_result, glueContext,
                                              'dyf_join_result')

        dyf_join_result = Filter.apply(
            frame=dyf_join_result,
            f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

        apply_output = ApplyMapping.apply(
            frame=dyf_join_result,
            mappings=[("student_behavior_id", "string", "student_behavior_id",
                       "string"),
                      ("student_behavior_date", "long",
                       "student_behavior_date", "long"),
                      ("behavior_id", "long", "behavior_id", "long"),
                      ("student_id", "string", "student_id", "long"),
                      ("contact_id", "string", "contact_id", "string"),
                      ("package_code", "long", "package_code", "string"),
                      ("package_endtime", "long", "package_endtime", "long"),
                      ("package_starttime", "long", "package_starttime",
                       "long"),
                      ("student_level_code", "string", "student_level_code",
                       "string"),
                      ("student_status_code", "string", "student_status_code",
                       "string"),
                      ("transformed_at", "long", "transformed_at", "long")])

        dyf_output = ResolveChoice.apply(frame=apply_output,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
        #
        glueContext.write_dynamic_frame.from_options(
            frame=dyf_output,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")
Exemplo n.º 22
0
def create_test_data():
    df_train = spark.read.parquet(os.path.join("datasets", "train.parquet"))
    df_train.createOrReplaceTempView("data")
    max_id = spark.sql("""
    SELECT max(uid) as m FROM data
    """).first().m
    print("max_id", max_id)

    df_result = pd.read_csv(os.path.join("datasets", "sample_submission.csv"))
    files = list(df_result["seg_id"].values)

    schema = StructType([StructField("x", DoubleType(), True)])

    seg = 0
    for file in files:
        sep = "."
        if seg % 200 == 0: sep = "|"
        if seg % 20 == 0: print(sep, end="", flush=True)
        seg += 1
    print("", end="\n", flush=True)

    seg = 0
    df_test = None
    for file in files:
        #     print(file)
        if seg % 20 == 0: print("|", end="", flush=True)

        w1 = Window.orderBy("uid")
        w2 = Window.partitionBy("seg").orderBy("uid")
        df_temp = spark.read.csv(
            os.path.join("datasets", "test", file + ".csv"),
            header=True,
            schema=schema).withColumn(
                "y",
                lit(None).cast(DoubleType())).withColumn(
                    "uid",
                    lit(max_id + 1) +
                    monotonically_increasing_id()).withColumn(
                        "idx",
                        row_number().over(w1).cast(IntegerType())).withColumn(
                            "seg",
                            lit(seg).cast(IntegerType())).withColumn(
                                "no",
                                row_number().over(w2).cast(
                                    IntegerType())).withColumn(
                                        "name", (lit(file.split(".")[0])).cast(
                                            StringType())).withColumn(
                                                "set", lit(1))

        df_temp.createOrReplaceTempView("data")
        df_temp_f = spark.sql("""
        SELECT uid, set, seg, no, name, x, y FROM data
        ORDER BY set, seg, no, uid
        """)

        max_id = spark.sql("""
        SELECT max(uid) as m FROM data
        """).first().m

        seg += 1

        if df_test is None: df_test = df_temp_f
        else: df_test = df_test.union(df_temp_f)

        # write one parquet file per 20 segments (processing everything in one go caused issues)
        if seg % 20 == 0:
            file_name = "test_1_{:04}.parquet".format(seg)
            df_test = df_test.repartition(1)
            df_test.write.parquet(os.path.join("datasets", file_name))
            df_test = None
    #     if seg == 4 : break

    print("(", end="", flush=True)
    # handle the leftover batch of fewer than 20 segments
    if df_test is not None:
        file_name = "test_1_{:04}.parquet".format(seg)
        df_test = df_test.repartition(1)
        df_test.write.parquet(os.path.join("datasets", file_name))
        df_test = None
    print("x)", end="\n", flush=True)

    print("max_id", max_id)

    df_result = pd.read_csv(os.path.join("datasets", "sample_submission.csv"))
    files = list(df_result["seg_id"].values)

    seg = 0
    for file in files:
        sep = "."
        if seg % 200 == 0: sep = "|"
        if seg % 20 == 0: print(sep, end="", flush=True)
        seg += 1
    print("", end="\n", flush=True)

    seg = 0
    mode = "overwrite"
    for file in files:
        if seg % 20 == 0: print("|", end="", flush=True)
        seg += 1

        if seg % 20 == 0:
            file_name = "test_1_{:04}.parquet".format(seg)
            df_test = spark.read.parquet(os.path.join("datasets", file_name))
            df_test.write.mode(mode).parquet(
                os.path.join("datasets", "test.parquet"))
            mode = "append"

    print("(", end="", flush=True)

    # handle the leftover batch of fewer than 20 segments
    if seg % 20 != 0:
        file_name = "test_1_{:04}.parquet".format(seg)
        df_test = spark.read.parquet(os.path.join("datasets", file_name))
        df_test.write.mode(mode).parquet(
            os.path.join("datasets", "test.parquet"))
        mode = "append"

    print("x", end="", flush=True)

    print(")", end="\n", flush=True)

    seg = 0
    for file in files:
        sep = "."
        if seg % 200 == 0: sep = "|"
        if seg % 20 == 0: print(sep, end="", flush=True)
        seg += 1
    print("", end="\n", flush=True)

    seg = 0
    for file in files:
        if seg % 20 == 0: print("|", end="", flush=True)
        seg += 1

        if seg % 20 == 0:
            file_name = "test_1_{:04}.parquet".format(seg)
            shutil.rmtree(os.path.join("datasets", file_name))

    print("(", end="", flush=True)
    # handle the leftover batch of fewer than 20 segments
    if seg % 20 != 0:
        file_name = "test_1_{:04}.parquet".format(seg)
        shutil.rmtree(os.path.join("datasets", file_name))
    print("x", end="", flush=True)
    print(")", end="\n", flush=True)
Exemplo n.º 23
0
 def with_window_column(df):
     from pyspark.sql.window import Window
     from pyspark.sql.functions import percent_rank
     windowSpec = Window.partitionBy(df['id']).orderBy(df['forecast'])
     return df.withColumn("r", percent_rank().over(windowSpec))
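# A hedged, self-contained sketch of the percent_rank pattern above on toy data,
# assuming a local SparkSession; the 'id' / 'forecast' names mirror the helper:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import percent_rank

_spark = SparkSession.builder.getOrCreate()
_df = _spark.createDataFrame(
    [(1, 10.0), (1, 20.0), (1, 30.0), (2, 5.0), (2, 7.0)], ['id', 'forecast'])
_w = Window.partitionBy('id').orderBy('forecast')
# percent_rank() is 0.0 for the first row of each partition and 1.0 for the last
# (when the partition has more than one row)
_df.withColumn('r', percent_rank().over(_w)).show()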
Exemplo n.º 24
0
def multiple_col_avg(x1, x2):
    return (x1 + x2) / 2

multiple_col_avg_udf = udf(multiple_col_avg, DoubleType())

# using df
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(studentMarksData, ["id", "year", "score1", "score2"])
df.show()
avg_score_by_id_year = df.withColumn("avg_marks", multiple_col_avg_udf(col("score1").cast(IntegerType()),
                                                                       col("score2").cast(IntegerType())),)
avg_score_by_id_year.show(truncate=False)


# Calculate top student using the rank function (use case where we just need to calculate over the whole DF)
print("Calculate top student using the rank function")
windowSpec = Window.orderBy(col("avg_marks"))
students_with_rank = avg_score_by_id_year.withColumn("rank" , rank().over(windowSpec))
students_with_rank.show()

sec_year_students_with_rank = avg_score_by_id_year.filter(col("year") == "year2").withColumn("rank" , rank().over(windowSpec))
sec_year_students_with_rank.filter(col("rank") <= 3).show()

sec_year_students_with_row_number = avg_score_by_id_year.filter(col("year") == "year2")\
    .withColumn("row_number", row_number().over(windowSpec))
sec_year_students_with_row_number.show()

# Calculate top student using the rank function (use case where we need to divide/partition the data on some column,
# for example by year, or any other condition)
print("Window By Partition ")
windowSpec = Window.partitionBy(col("year")).orderBy(col("avg_marks").desc())
students_by_yearly_rank = avg_score_by_id_year.withColumn("rank", rank().over(windowSpec))
Exemplo n.º 25
0
    def create_dataset(self, sqlContext):
        for table in self.tables_dict:
            #             if (self.omop_ver == "6.0" and table == "person"):
            #                 self.tables_dict[table].append("death_datetime")
            self.tables_to_join[table] = self.loaded_tables[table].select(
                self.tables_dict[table])

        self.tables_to_join["history_aids"] = (
            self.tables_to_join["note_nlp"].where(
                (col("lexical_variant").rlike("(?i)history"))
                & ((col("lexical_variant").rlike("(?i)aids"))
                   | (col("lexical_variant").rlike("(?i)immunodeficiency"))
                   | (col("lexical_variant").rlike(
                       "(?i)acquired immune deficiency syndrome")))
                & (~col("lexical_variant").rlike("(?i)family"))).select(
                    col("note_id")).drop_duplicates().withColumn(
                        "history_aids",
                        lit(1)).join(self.tables_to_join["note"],
                                     how="left",
                                     on="note_id").select(
                                         col("person_id"), col("history_aids"),
                                         col("note_date")).groupby(
                                             col("person_id"),
                                             col("history_aids")).agg(
                                                 F.min("note_date").alias(
                                                     "history_aids_min_date")))

        self.tables_to_join["history_metastases"] = (
            self.tables_to_join["note_nlp"].where(
                (col("lexical_variant").rlike("(?i)history"))
                & ((col("lexical_variant").rlike("(?i)metastases"))
                   | (col("lexical_variant").rlike("(?i)metastatic"))
                   | (col("lexical_variant").rlike("(?i)metastasis"))
                   | (col("lexical_variant").rlike("(?i)secondary tumor"))
                   | (col("lexical_variant").rlike("(?i)secondary cancer")))
                & (~col("lexical_variant").rlike("(?i)family"))).select(
                    col("note_id")).drop_duplicates().withColumn(
                        "history_metastases",
                        lit(1)).join(self.tables_to_join["note"],
                                     how="left",
                                     on="note_id").select(
                                         col("person_id"),
                                         col("history_metastases"),
                                         col("note_date")).groupby(
                                             col("person_id"),
                                             col("history_metastases")).
            agg(F.min("note_date").alias("history_metastases_min_date")))

        self.tables_to_join["history_hemato"] = (
            self.tables_to_join["note_nlp"].where(
                (col("lexical_variant").rlike("(?i)history"))
                & ((col("lexical_variant").rlike("(?i)myeloma"))
                   | (col("lexical_variant").rlike("(?i)leukemia"))
                   | (col("lexical_variant").rlike("(?i)lymphoma")))
                & (~col("lexical_variant").rlike("(?i)family"))).select(
                    col("note_id")).drop_duplicates().withColumn(
                        "history_hemato", lit(1)).join(
                            self.tables_to_join["note"],
                            how="left",
                            on="note_id").select(col("person_id"),
                                                 col("history_hemato"),
                                                 col("note_date")).groupby(
                                                     col("person_id"),
                                                     col("history_hemato")).
            agg(F.min("note_date").alias("history_hemato_min_date")))

        self.tables_to_join["preceding_surgery"] = (
            self.tables_to_join["visit_detail"].where(
                col("visit_detail_concept_id") == 4149152).select(
                    col("visit_occurrence_id")).withColumn(
                        "from_surgery", lit(1)).drop_duplicates())

        if self.database == "mimicomop":
            self.tables_to_join["icu_stays"] = (
                self.tables_to_join["visit_occurrence"].join(
                    self.tables_to_join["visit_detail"],
                    how="left",
                    on="visit_occurrence_id").where(
                        (col("visit_detail_concept_id") == 32037)
                        & (col("visit_type_concept_id") == 2000000006)).select(
                            col("person_id"), col("visit_occurrence_id"),
                            col("visit_detail_id"),
                            col("visit_detail_concept_id"),
                            col("visit_start_date"), col("visit_end_date"),
                            col("discharge_to_concept_id")))
        else:
            self.tables_to_join["icu_stays"] = (
                self.tables_to_join["visit_occurrence"].join(
                    self.tables_to_join["visit_detail"],
                    how="left",
                    on="visit_occurrence_id").
                join(self.tables_to_join["care_site"].where(
                    F.lower(F.col("care_site_name")).like("% rea %")
                    | F.lower(F.col("care_site_name")).like("%reanimation%")),
                     how="inner",
                     on="care_site_id").select(col("person_id"),
                                               col("visit_occurrence_id"),
                                               col("visit_detail_id"),
                                               col("visit_detail_concept_id"),
                                               col("visit_start_date"),
                                               col("visit_end_date"),
                                               col("discharge_to_concept_id")))

        window_asc = (Window.partitionBy("person_id", "visit_occurrence_id",
                                         "measurement_concept_id").orderBy(
                                             asc("measurement_datetime")))

        window_desc = (Window.partitionBy("person_id", "visit_occurrence_id",
                                          "measurement_concept_id").orderBy(
                                              desc("measurement_datetime")))

        self.pivot_measures_asc = (self.tables_to_join["icu_stays"].join(
            self.tables_to_join["measurement"],
            how="left",
            on="visit_occurrence_id").where(
                (col("measurement_datetime") >= col("visit_start_date"))
                &
                (col("measurement_datetime") <= col("visit_end_date"))).select(
                    '*',
                    rank().over(window_asc).alias('rank')).filter(
                        col('rank') == 1).groupBy("visit_occurrence_id").pivot(
                            "measurement_name",
                            list(self.measurements.keys())).agg(
                                first("value_as_number", ignorenulls=True)))
        self.pivot_measures_asc = self.rename_columns(
            self.pivot_measures_asc,
            {str(k): str("first_" + k)
             for k in self.measurements.keys()})

        self.pivot_measures_desc = (self.tables_to_join["icu_stays"].join(
            self.tables_to_join["measurement"],
            how="left",
            on="visit_occurrence_id").where(
                (col("measurement_datetime") >= col("visit_start_date"))
                &
                (col("measurement_datetime") <= col("visit_end_date"))).select(
                    '*',
                    rank().over(window_desc).alias('rank')).filter(
                        col('rank') == 1).groupBy("visit_occurrence_id").pivot(
                            "measurement_name",
                            list(self.measurements.keys())).agg(
                                first("value_as_number", ignorenulls=True)))
        self.pivot_measures_desc = self.rename_columns(
            self.pivot_measures_desc,
            {str(k): str("last_" + k)
             for k in self.measurements.keys()})

        self.dataset = (self.tables_to_join["icu_stays"].join(
            self.tables_to_join["person"], how="left", on="person_id").join(
                self.tables_to_join["preceding_surgery"],
                how="left",
                on="visit_occurrence_id").withColumn(
                    "from_surgery", coalesce("from_surgery", lit(0))).join(
                        self.tables_to_join["history_aids"],
                        how="left",
                        on="person_id").withColumn(
                            "history_aids",
                            coalesce("history_aids", lit(0))).join(
                                self.tables_to_join["history_metastases"],
                                how="left",
                                on="person_id").withColumn(
                                    "history_metastases",
                                    coalesce("history_metastases", lit(0))).
                        join(self.tables_to_join["history_hemato"],
                             how="left",
                             on="person_id").withColumn(
                                 "history_hemato",
                                 coalesce("history_hemato", lit(0))).join(
                                     self.pivot_measures_asc,
                                     how="left",
                                     on="visit_occurrence_id").join(
                                         self.pivot_measures_desc,
                                         how="left",
                                         on="visit_occurrence_id"))
        self.add_deathdate(sqlContext)
Exemplo n.º 26
0
# assumption: the opening of this query is reconstructed from the columns referenced below
events = spark.sql("SELECT visitor_id, \
                        hit_time_gmt, \
                        file_date, \
                        month(file_date) as month, \
                        session_id, \
                        pageview_event_cnt, \
                        video_start_cnt, \
                        video_time_spent_secs, \
                        browser_typ_dsc, \
                        device_type_dsc as device \
                FROM user_business_defined_dataset.cnn_adobe_bdd_web \
                WHERE file_date like %s \
                AND (visitor_id != '*UN' AND visitor_id != '-100')" % first_month)
events = events.dropDuplicates(["visitor_id","hit_time_gmt"])
events = events.fillna('desktop', subset=['device'])
events = events.withColumn('device', rename_device(F.col("device")))
events = events.filter(F.col('device')!='other')
window = Window.orderBy(F.col("hit_time_gmt").desc()).partitionBy(["visitor_id", "session_id"])
events = events.withColumn("time_on_event", (F.lag(events.hit_time_gmt, 1).over(window) - events.hit_time_gmt))
events = events.fillna(0.0, subset=['time_on_event'])

#---------------------------------------------------------------------------------------------------------------------------------------------------
# Get traffic features
visitors_1 = events.groupby(["visitor_id","device"]).agg(F.countDistinct("session_id").alias("visits"),
                                                          F.sum("pageview_event_cnt").alias("pageviews"),
                                                          F.sum("video_start_cnt").alias("videostarts"),
                                                          F.sum("time_on_event").alias("time_on_site"), 
                                                          F.sum("video_time_spent_secs").alias("video_playtime"),
                                                          F.first("browser_typ_dsc").alias("browser_typ_dsc"),
                                                          F.countDistinct("file_date").alias("days_visited")).fillna(0.0)

#---------------------------------------------------------------------------------------------------------------------------------------------------
# Split visitors by monthly dropouts vs non-dropouts
Exemplo n.º 27
0
# #### 5.3 - Dataset (parquet) Preview

# In[27]:

get_ipython().run_line_magic('spark.pyspark', '')

df_pqt.show()

# #### 5.4 - Remove duplicated values

# In[29]:

get_ipython().run_line_magic('spark.pyspark', '')

window = Window.partitionBy('id').orderBy(df_pqt['update_date'].desc())
df_pqt = df_pqt.withColumn("last_update", rank().over(window))
df_final = df_pqt.filter("last_update=1")
df_final.show()

# #### 6 - Final dataset storage
#

# In[31]:

get_ipython().run_line_magic('spark.pyspark', '')

file_pqt = 'final.parquet'
df_pqt_path = local + out + file_pqt
df_final.write.parquet(df_pqt_path)
Exemplo n.º 28
0
from pyspark.sql.window import Window


#df_experience = spark.read.format("com.databricks.spark.avro").load("/mnt/S3_final_hour_1/edw.fact_experience_booking/2017/06/10/*/*/", schema=schema_experience)
#df_flight = df_flight.select("booking_id").orderBy('booking_id')
#display(df_flight)

# COMMAND ----------

display(df_flight.select("booking_id","booking_status","modified_at").filter(df_flight.booking_id == '162290951'))

# COMMAND ----------

#use window function row_number to dedup booking.
import pyspark.sql.functions as func
spec = Window.partitionBy(df_flight.booking_id).orderBy(df_flight.modified_at.desc())

row_number = func.row_number().over(spec)

df_flight2 = df_flight.select("booking_id",row_number.alias("row_number"))
df_flight2 = df_flight2.filter(df_flight2.row_number == 1)
display(df_flight2)
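# A hedged variant of the dedup above that keeps every column of the latest
# booking instead of only booking_id (a sketch; df_flight and spec are the
# objects defined in this cell):
df_flight_latest = (df_flight
                    .withColumn("row_number", func.row_number().over(spec))
                    .filter(func.col("row_number") == 1)
                    .drop("row_number"))
display(df_flight_latest)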

# COMMAND ----------

df_experience.groupby(df_experience.booking_status).agg({'booking_id':'count'}).show()

# COMMAND ----------

df_survey.show(2,truncate = True)
Exemplo n.º 29
0
 def growing_row_window(self):
     return Window.partitionBy('id').orderBy('v').rowsBetween(
         Window.unboundedPreceding, 3)
Exemplo n.º 30
0
 def growing_range_window(self):
     return Window.partitionBy('id').orderBy('v') \
         .rangeBetween(Window.unboundedPreceding, 4)
Exemplo n.º 31
0
 def shrinking_row_window(self):
     return Window.partitionBy('id').orderBy('v').rowsBetween(
         -2, Window.unboundedFollowing)
Exemplo n.º 32
0
 def shrinking_range_window(self):
     return Window.partitionBy('id').orderBy('v') \
         .rangeBetween(-3, Window.unboundedFollowing)
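# A hedged, self-contained illustration of how frames like the ones defined above
# behave, assuming a local SparkSession and the same 'id' / 'v' column names: a
# growing row frame from unboundedPreceding to the current row yields a running sum.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

_spark = SparkSession.builder.getOrCreate()
_df = _spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (1, 4.0), (2, 10.0), (2, 20.0)], ['id', 'v'])
_growing = (Window.partitionBy('id').orderBy('v')
            .rowsBetween(Window.unboundedPreceding, Window.currentRow))
# running sum of v within each id: 1.0, 3.0, 7.0 for id=1 and 10.0, 30.0 for id=2
_df.withColumn('running_v', F.sum('v').over(_growing)).show()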
Exemplo n.º 33
0
def finite_difference(sparkdf,
                      xaxis,
                      yaxes,
                      window_size,
                      partitionAxis=None,
                      monotonically_increasing=False):
    """
    Calculate the finite difference dY/dX for 1 or more Y=f(X) axes with respect to a single X axis
    :param sparkdf: Input Spark dataframe.
    :param xaxis: Column name for X axis.
    :param yaxes: List of column names for Y axes.
    :param window_size: Width of window over which to calculate finite difference (in number of points).
    :param partitionAxis: Categorical axis to partition
    :param monotonically_increasing: Whether Y axes should be monotonically increasing (e.g. counters). If set,
           negative finite differences in Y will be set to zero.
    :return: A Spark dataframe with one new column per Y axis calculated as dY/dX.
    """

    original_columns = sparkdf.schema.fieldNames()

    # Get the first value, so we can use it to define the initial "previous value"
    first_row = sparkdf.first()
    first_x = first_row[xaxis]

    # Slide over this window
    window = Window.orderBy(xaxis)
    if partitionAxis is not None:
        window = window.partitionBy(partitionAxis)

    # Create function to calculate difference between two columns
    delta_fn = udf(lambda col1, col2: col1 - col2, DoubleType())
    div_fn = udf(lambda col1, col2: col1 / col2 if col2 > 0 else float('nan'),
                 DoubleType())

    # Get delta X
    xaxis_lag = xaxis + '_lag_' + str(window_size)
    xaxis_delta = xaxis + '_delta_' + str(window_size)
    df = (sparkdf.withColumn(xaxis, sparkdf[xaxis].cast(
        DoubleType())).withColumn(
            xaxis_lag,
            lag(xaxis, window_size, first_x).over(window)).withColumn(
                xaxis_delta, delta_fn(col(xaxis),
                                      col(xaxis_lag))).drop(xaxis_lag))

    # Get delta y and dY/dX for each y
    for yaxis in yaxes:
        yaxis_lag = yaxis + '_lag_' + str(window_size)
        yaxis_delta = yaxis + '_delta_' + str(window_size)
        rate = 'rate_' + yaxis + '_over_' + xaxis
        rate_lag = rate + '_lag'
        rate_lead = rate + '_lead'

        first_y = first_row[yaxis]

        df = (df.withColumn(yaxis,
                            sparkdf[yaxis].cast(DoubleType())).withColumn(
                                yaxis_lag,
                                lag(yaxis, window_size,
                                    first_y).over(window)).withColumn(
                                        yaxis_delta,
                                        delta_fn(col(yaxis), col(yaxis_lag))))

        if monotonically_increasing:
            # Spark columns cannot be assigned in place; clamp negative deltas to
            # zero with when/otherwise (from pyspark.sql.functions, like the other
            # helpers used in this function)
            df = df.withColumn(
                yaxis_delta,
                when(col(yaxis_delta) < 0, 0.0).otherwise(col(yaxis_delta)))

        df = (
            df.withColumn(rate, div_fn(
                col(yaxis_delta),
                col(xaxis_delta))).drop(yaxis_lag).drop(yaxis_delta)

            # Determine when the delta changes (lead and lag by 1)
            .withColumn(rate_lag,
                        lag(rate, 1, 1).over(window)).withColumn(
                            rate_lead,
                            lead(rate, 1, 1).over(window))

            # Only get values where rate is different from before or after
            .filter(rate + '!=' + rate_lag + ' OR ' + rate + '!=' +
                    rate_lead).drop(rate_lag).drop(rate_lead))

    return df.drop(xaxis_delta)
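# A hypothetical usage sketch for finite_difference above, assuming a local
# SparkSession and that the pyspark names the function relies on (Window, udf,
# lag, lead, col, when, DoubleType) are already in scope:
from pyspark.sql import SparkSession

_spark = SparkSession.builder.getOrCreate()
_samples = _spark.createDataFrame(
    [(0.0, 0.0), (1.0, 10.0), (2.0, 30.0), (3.0, 30.0), (4.0, 70.0)],
    ['t', 'counter'])
# dY/dX over a one-row window; monotonically_increasing clamps counter resets
_rates = finite_difference(_samples, 't', ['counter'], 1,
                           monotonically_increasing=True)
_rates.show()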
Exemplo n.º 34
0
    final = sqlContext.sql("SELECT analytic_id" + query +
                           " FROM cust_trans_final GROUP BY analytic_id ")
    final = final.drop('7B20SMS10MMS1D')
    final.registerTempTable("final")

    #filter package according to existing in customer transaction and create index list of packages
    df_index = parsed_master_final.where(
        parsed_master_final.New_Package_Name.isin(column_seq)).drop_duplicates(
            subset=['New_Package_Name']).withColumn(
                "id", monotonically_increasing_id())
    index_pp = df_index.select("id","Package_Duration_XS","Package_Duration_S","Package_Duration_M","Package_Duration_L","Package_Duration_XL","Price_XS","Price_S",\
               "Price_M","Price_L","Price_XL","Package_Size_1Mbps","Package_Size_256Kbps","Package_Size_2Mbps","Package_Size_384Kbps",\
               "Package_Size_4Mbps","Package_Size_512Kbps","Package_Size_64Kbps","Package_Size_6Mbps","Package_Size_Entertain",\
               "Package_Size_Full speed","Package_Size_Game","Package_Size_Social","Package_Size_Time")

    windowSpec = W.orderBy("id")
    ontop_rowid = df_index.withColumn("id", row_number().over(windowSpec))
    #df_index.select("New_Package_Name","id").withColumn("id", row_number().over(windowSpec)).show()
    ontop_rowid2 = ontop_rowid.withColumnRenamed("Package_Size_Full speed",
                                                 "Package_Size_Full_speed")
    ontop_rowid2.registerTempTable("ontop_rowid2")
    index_pp2 = sqlContext.sql(
        "SELECT id-1 as id,Package_Duration_XS,Package_Duration_S,Package_Duration_M,Package_Duration_L,Package_Duration_XL,Price_XS,Price_S,Price_M,Price_L,Price_XL,Package_Size_1Mbps,Package_Size_256Kbps,Package_Size_2Mbps,Package_Size_384Kbps,Package_Size_4Mbps,Package_Size_512Kbps,Package_Size_64Kbps,Package_Size_6Mbps,Package_Size_Entertain,Package_Size_Full_speed,Package_Size_Game,Package_Size_Social,Package_Size_Time FROM ontop_rowid2"
    )
    package_sort = ontop_rowid.select('New_Package_Name').rdd.map(
        lambda r: r[0]).collect()
    query = ''
    for c in package_sort:
        query = query + ", " + c
    final_selected = sqlContext.sql("SELECT analytic_id" + query +
                                    " FROM final")
Exemplo n.º 35
0
 def growing_row_window(self):
     return Window.partitionBy('id').orderBy('v').rowsBetween(Window.unboundedPreceding, 3)
    data = spark.read.format('csv') \
                    .options(header='true', inferschema='false') \
                    .load(sys.argv[1])
    data = data.select('Symbol',
                        'Name',
                        'Date',
                        'Log Return') \
                .filter(col('Log Return') != '')
    data = data.withColumn(
        'Date',
        to_date(unix_timestamp(col('Date'), 'yyyy-MM-dd').cast('timestamp')))
    data = data.withColumn('Log Return',
                           col('Log Return').cast('float').alias('Log Return'))

    # cumulative log return sum
    cum_window = Window.partitionBy(data['Symbol']).orderBy(
        data['Date']).rangeBetween(Window.unboundedPreceding, 0)
    cum_sum = data.withColumn('CumulativeSum',
                              sum('Log Return').over(cum_window))
    cum_sum = cum_sum.orderBy(col('Symbol'), col('Date'))

    # total log return sum
    sum_window = Window.partitionBy('Symbol')
    sp_sum = cum_sum.withColumn('MaxDate', max('Date').over(sum_window)) \
                    .where(col('Date') == col('MaxDate')) \
                    .drop('MaxDate', 'Log Return', 'Date') \
                    .withColumnRenamed('CumulativeSum', 'ReturnSum')
    sp_sum = sp_sum.select('Symbol', 'Name', 'ReturnSum')

    # top 5 drops
    top_drop_window = Window.partitionBy(data['Symbol']).orderBy(
        data['Log Return'].asc())
Exemplo n.º 37
0
 def shrinking_row_window(self):
     return Window.partitionBy('id').orderBy('v').rowsBetween(-2, Window.unboundedFollowing)
Exemplo n.º 38
0
# Entry point to create a Spark session
spark = SparkSession.builder.appName("PySpark Test").getOrCreate()

# ==> Reading from a (local) CSV file without needing to define a schema
# ==> df = spark.read.csv(r'./resources/data.csv',header=True)

# Creating a DataFrame from the array values and the defined schema
df = spark.createDataFrame(data=data, schema=schema)

# Creating a new DataFrame that adds a new column,
# converting the String field to a date
# and dropping the original (string) date column
df2 = df.withColumn(
    'Atualizacao',
    F.to_date(
        F.unix_timestamp(
            F.col('Data_Atualizacao'),
            'yyyy-MM-dd').cast("timestamp"))).drop('Data_Atualizacao')

# Creating the final DataFrame by adding a new column
#  that ranks rows ordered by the update date,
#  partitioned by municipality and ordered
df_final = df2.withColumn(
    "Ordem Transacao",
    F.dense_rank().over(
        W.partitionBy("Municipio").orderBy("Atualizacao"))).orderBy(
            "Municipio", "Ordem Transacao")

# Displaying the result
df_final.show(truncate=False)
Exemplo n.º 39
0
 def sliding_row_window(self):
     return Window.partitionBy('id').orderBy('v').rowsBetween(-2, 1)
Exemplo n.º 40
0
from pyspark.sql.window import Window
from pyspark.sql.functions import *

spark = SparkSession.builder.appName('Window Function').getOrCreate()

data = [("sales", 1, 5000), ("personnel", 2, 3900), ("sales", 3, 4800),
        ("sales", 4, 4800), ("personnel", 5, 3500), ("develop", 7, 4200),
        ("develop", 8, 6000), ("develop", 9, 4500), ("develop", 10, 5200),
        ("develop", 11, 5200)]

columns = ['dept', 'empno', 'salary']
emp_df = spark.createDataFrame(data=data, schema=columns)

emp_df.show()

window_spec = Window.partitionBy("dept").orderBy("salary")
cum_salary_by_dept = emp_df\
  .withColumn("cum_salary", sum("salary").over(window_spec))
cum_salary_by_dept.show()

emp_rank_by_dept = emp_df\
  .withColumn("rank", rank().over(Window.partitionBy("dept").orderBy("salary")))\
  .withColumn("dense_rank", dense_rank().over(Window.partitionBy("dept").orderBy("salary")))\
  .withColumn("row_number", row_number().over(Window.partitionBy("dept").orderBy("salary"))) \
  .withColumn("max_salary", max("salary").over(Window.partitionBy("dept")))\
  .withColumn("min_salary", min("salary").over(Window.partitionBy("dept")))
emp_rank_by_dept.show()

emp_rank_by_dept.select("dept", "min_salary",
                        "max_salary").dropDuplicates().show()
Exemplo n.º 41
0
def cal_mat_window(sc, sqlContext, dfSC, window):
    windowSpec = Window.partitionBy("symbol").orderBy("date").rangeBetween(-1 * window+1,1)
    mat = func.avg("close").over(windowSpec)
    dfSC = dfSC.select(dfSC.symbol, dfSC.date, dfSC.close, mat )
    print dfSC.collect()
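# A hedged rows-based variant of the moving-average pattern above (a sketch with
# the same symbol/date/close schema; window_size counts rows rather than a range
# of ordering values, which sidesteps rangeBetween's need for a numeric ordering
# column; the function name below is illustrative only):
from pyspark.sql import functions as func
from pyspark.sql.window import Window


def cal_mat_rows_window(dfSC, window_size):
    windowSpec = (Window.partitionBy("symbol").orderBy("date")
                  .rowsBetween(-(window_size - 1), 0))
    return dfSC.withColumn("mat", func.avg("close").over(windowSpec))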
Exemplo n.º 42
0
    def churners(self, spark, mysqldetails, InsightName, memberDF, tagsDict):
        getset = GetSet()
        pw = getset.get_predictorwindow()
        mydate = getset.get_churnmaxdate()

        #churn_data_semifinal1=pd.read_csv('final_data.csv',low_memory=False)

        query = "(SELECT txnmappedmobile as Mobile,modifiedbillno,uniqueitemcode,itemqty FROM sku_report_loyalty where ModifiedTxnDate>DATE_SUB('" + mydate + "',INTERVAL " + str(
            pw) + " DAY)) as df"
        jdbcDFsku = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                             mysqldetails[1], query,
                                             mysqldetails[2], mysqldetails[3])
        jdbcDFsku = jdbcDFsku.groupBy('Mobile', 'modifiedbillno').agg(
            countDistinct('uniqueitemcode').alias('uniqueitemcode'),
            sum('itemqty').alias('itemqty'))
        jdbcDFsku = jdbcDFsku.groupBy('Mobile').agg(
            sum('uniqueitemcode').alias('uniqueitems'),
            sum('itemqty').alias('qty'))

        query = "(SELECT Mobile, tier, gender,EnrolledStoreCode FROM member_report) as df"
        jdbcDFmem = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                             mysqldetails[1], query,
                                             mysqldetails[2], mysqldetails[3])

        query = "(select StoreCode,state,city from store_master) as df"
        jdbcDFstr = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                             mysqldetails[1], query,
                                             mysqldetails[2], mysqldetails[3])

        jdbcDFmem = jdbcDFmem.alias('a').join(
            jdbcDFstr.alias('b'),
            col('a.EnrolledStoreCode') == col('b.StoreCode'), 'left')

        query = "(SELECT TxnMappedMobile as Mobile,modifiedbillno,departmentcode FROM (SELECT a.uniqueitemcode,a.departmentcode,b.modifiedbillno,b.TxnMappedMobile FROM item_master a inner join sku_report_loyalty b on a.uniqueitemcode=b.uniqueitemcode where b.ModifiedTxnDate>DATE_SUB('" + mydate + "',INTERVAL " + str(
            pw) + " DAY)) t ) as df"
        jdbcDFdep = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                             mysqldetails[1], query,
                                             mysqldetails[2], mysqldetails[3])
        jdbcDFdep = jdbcDFdep.groupBy('Mobile', 'modifiedbillno').agg(
            countDistinct('departmentcode').alias('departmentcode'))
        jdbcDFdep = jdbcDFdep.groupBy('Mobile').agg(
            sum('departmentcode').alias('ndept'))

        #query = "(select Mobile,ModifiedBillNo,TxnDate,amount,StoreCode,PointsCollected,LapsedPoints,AvailablePoints from txn_report_accrual_redemption where TxnDate<DATE_SUB('2019-04-01',INTERVAL "+cw+" DAY) and TxnDate>DATE_SUB('2019-04-01',INTERVAL "+lw+" DAY))) as df"
        query = "(select Mobile,ModifiedBillNo,TxnDate,amount,StoreCode,PointsCollected,LapsedPoints,AvailablePoints from txn_report_accrual_redemption where TxnDate>DATE_SUB('" + mydate + "',INTERVAL " + str(
            pw) + " DAY)) as df"

        jdbcDF3 = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                           mysqldetails[1], query,
                                           mysqldetails[2], mysqldetails[3])
        jdbcDF3.show(5)
        #jdbcDF3c = jdbcDF3.filter(col('TxnDate')>date_sub(lit(datetime(2019,4,1).date()),lit(cw)))
        #mlist = jdbcDF3c.rdd.map(lambda r: r.Mobile).collect()
        #jdbcDF3 = jdbcDF3.withColumn('labels',when(jdbcDF3.Mobile.isin(mlist),0).otherwise(1))

        window = Window.partitionBy('Mobile').orderBy("TxnDate")
        jdbcDF3 = jdbcDF3.withColumn(
            "diff", datediff(lag('TxnDate', 1).over(window), 'TxnDate'))

        #jdbcDF3=jdbcDF3.groupby('Mobile').agg(sum('amount').alias('total_amount'),countDistinct('ModifiedBillNo').alias('total_txn'),countDistinct('TxnDate').alias('total_visits'),\
        #max('TxnDate').alias('MaxDate'),min('TxnDate').alias('MinDate'),countDistinct('StoreCode').alias('nstores'),max('diff').alias('max_latency'),max('labels').alias('labels'))
        jdbcDF3=jdbcDF3.groupby('Mobile').agg(sum('amount').alias('total_amount'),countDistinct('ModifiedBillNo').alias('total_txn'),countDistinct('TxnDate').alias('total_visits'),sum('LapsedPoints').alias('lapsedpoints'),sum('AvailablePoints').alias('availablepoints'),\
        max('TxnDate').alias('MaxDate'),min('TxnDate').alias('MinDate'),countDistinct('StoreCode').alias('nstores'),max('diff').alias('max_latency'),sum('PointsCollected').alias('pointscollected'))
        # withColumn needs a Column; wrap the parsed date with lit() from pyspark.sql.functions
        jdbcDF3 = jdbcDF3.withColumn('current_date',
                                     lit(datetime.strptime(mydate, "%Y-%m-%d")))
        jdbcDF3 = jdbcDF3.withColumn(
            "recency", datediff(col("MaxDate"), col("current_date")))
        jdbcDF3 = jdbcDF3.withColumn(
            "Tenure", datediff(col("MinDate"), col("current_date")))
        jdbcDF3 = jdbcDF3.withColumn(
            "latency",
            datediff(col("MinDate"), col("MaxDate")) /
            (col('total_visits') - 1))

        jdbcDF3 = jdbcDF3.join(jdbcDFsku, 'Mobile', 'left')
        jdbcDF3 = jdbcDF3.join(jdbcDFdep, 'Mobile', 'left')
        jdbcDF3 = jdbcDF3.join(jdbcDFmem, 'Mobile', 'left')

        jdbcDF3 = jdbcDF3.toDF(*[c.lower() for c in jdbcDF3.columns])
        jdbcDF3 = jdbcDF3.select('mobile', 'gender', 'tier', 'state', 'city',
                                 'total_txn', 'total_visits', 'total_amount',
                                 'pointscollected', 'lapsedpoints',
                                 'availablepoints', 'max_latency', 'recency',
                                 'tenure', 'latency')
        jdbcDF3.write.option(
            "header",
            "true").mode("overwrite").csv('./' + mysqldetails[1] +
                                          '/churn_data_semifinal1.csv')
        '''
        myvars = jdbcDF3.columns
        modelvars = ['mobile', 'gender', 'tier', 'state', 'city', 'total_txn',
               'total_visits', 'total_amount', 'pointscollected', 'lapsedpoints',
               'availablepoints', 'max_latency', 'recency', 'tenure', 'latency',
               'labels'] 
        columns = list(set(myvars).intersection(set(modelvars)))
        '''
        columns = jdbcDF3.columns
        data_predict = probchurn.data_process(self, spark, columns)
        #probchurn.classify(self,mysqldetails,'logistic_regression',churn_data_semifinal1)
        mlist = memberDF.rdd.map(lambda r: r.mobile).collect()
        churn_data_semifinal1 = churn_data_semifinal1.Mobile.isin(mlist)
        method = 'lightgbm'
        memberDF = probchurn.predict(self, mysqldetails, method, data_predict)
        return memberDF
Exemplo n.º 43
0
 def unbounded_window(self):
     return Window.partitionBy('id') \
         .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
Exemplo n.º 44
0
    def modelfit(self, spark, mysqldetails):
        getset = GetSet()
        pw = getset.get_predictorwindow()
        cw = getset.get_churnwindow()
        lw = int(pw) + int(cw)
        mydate = getset.get_churnmaxdate()
        query = "(SELECT TxnMappedMobile as Mobile,modifiedbillno,uniqueitemcode,itemqty FROM sku_report_loyalty where TxnDate<DATE_SUB('" + mydate + "',INTERVAL " + str(
            cw
        ) + " DAY) and TxnDate>DATE_SUB('" + mydate + "',INTERVAL " + str(
            lw) + " DAY)) as df"
        jdbcDFsku = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                             mysqldetails[1], query,
                                             mysqldetails[2], mysqldetails[3])
        jdbcDFsku = jdbcDFsku.groupBy('Mobile', 'modifiedbillno').agg(
            countDistinct('uniqueitemcode').alias('uniqueitemcode'),
            sum('itemqty').alias('itemqty'))
        jdbcDFsku = jdbcDFsku.groupBy('Mobile').agg(
            sum('uniqueitemcode').alias('uniqueitems'),
            sum('itemqty').alias('qty'))

        query = "(SELECT Mobile, tier, gender,EnrolledStoreCode FROM member_report) as df"
        jdbcDFmem = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                             mysqldetails[1], query,
                                             mysqldetails[2], mysqldetails[3])

        query = "(select StoreCode,state,city from store_master) as df"
        jdbcDFstr = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                             mysqldetails[1], query,
                                             mysqldetails[2], mysqldetails[3])

        jdbcDFmem = jdbcDFmem.alias('a').join(
            jdbcDFstr.alias('b'),
            col('a.EnrolledStoreCode') == col('b.StoreCode'), 'left')

        query = "(SELECT TxnMappedMobile as Mobile,modifiedbillno,departmentcode FROM (SELECT a.uniqueitemcode,a.departmentcode,b.modifiedbillno,b.TxnMappedMobile FROM item_master a inner join sku_report_loyalty b on a.uniqueitemcode=b.uniqueitemcode where b.TxnDate<DATE_SUB('" + mydate + "',INTERVAL " + str(
            cw
        ) + " DAY) and b.TxnDate>DATE_SUB('" + mydate + "',INTERVAL " + str(
            lw) + " DAY)) t ) as df"
        jdbcDFdep = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                             mysqldetails[1], query,
                                             mysqldetails[2], mysqldetails[3])
        jdbcDFdep = jdbcDFdep.groupBy('Mobile', 'modifiedbillno').agg(
            countDistinct('departmentcode').alias('departmentcode'))
        jdbcDFdep = jdbcDFdep.groupBy('Mobile').agg(
            sum('departmentcode').alias('ndept'))

        #query = "(select Mobile,ModifiedBillNo,TxnDate,amount,StoreCode,PointsCollected,LapsedPoints,AvailablePoints from txn_report_accrual_redemption where TxnDate<DATE_SUB('2019-04-01',INTERVAL "+cw+" DAY) and TxnDate>DATE_SUB('2019-04-01',INTERVAL "+lw+" DAY))) as df"
        query = "(select Mobile,ModifiedBillNo,TxnDate,amount,StoreCode,PointsCollected,LapsedPoints,AvailablePoints from txn_report_accrual_redemption where TxnDate>DATE_SUB('" + mydate + "',INTERVAL " + str(
            lw) + " DAY)) as df"

        jdbcDF3 = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                           mysqldetails[1], query,
                                           mysqldetails[2], mysqldetails[3])
        #jdbcDF3.show(5)
        jdbcDF3c = jdbcDF3.filter(
            col('TxnDate') > datetime.strptime(mydate, "%Y-%m-%d") -
            timedelta(days=int(cw)))
        mlist = jdbcDF3c.rdd.map(lambda r: r.Mobile).collect()
        print("now", datetime.strptime(mydate, "%Y-%m-%d"))
        print("churn window start",
              datetime.strptime(mydate, "%Y-%m-%d") - timedelta(days=int(cw)))
        jdbcDF3 = jdbcDF3.filter(
            col('TxnDate') < datetime.strptime(mydate, "%Y-%m-%d") -
            timedelta(days=int(cw)))
        jdbcDF3 = jdbcDF3.withColumn(
            'labels',
            when(jdbcDF3.Mobile.isin(mlist), 0).otherwise(1))
        window = Window.partitionBy('Mobile').orderBy("TxnDate")
        jdbcDF3 = jdbcDF3.withColumn(
            "diff", datediff(lag('TxnDate', 1).over(window), 'TxnDate'))
        jdbcDF3.show(5)
        jdbcDF3=jdbcDF3.groupby('Mobile').agg(sum('amount').alias('total_amount'),countDistinct('ModifiedBillNo').alias('total_txn'),countDistinct('TxnDate').alias('total_visits'),sum('LapsedPoints').alias('lapsedpoints'),sum('AvailablePoints').alias('availablepoints'),\
        max('TxnDate').alias('MaxDate'),min('TxnDate').alias('MinDate'),countDistinct('StoreCode').alias('nstores'),max('diff').alias('max_latency'),sum('PointsCollected').alias('pointscollected'),max('labels').alias('labels'))
        #jdbcDF3=jdbcDF3.groupby('Mobile').agg(sum('amount').alias('total_amount'),countDistinct('ModifiedBillNo').alias('total_txn'),countDistinct('TxnDate').alias('total_visits'),\
        #max('TxnDate').alias('MaxDate'),min('TxnDate').alias('MinDate'),countDistinct('StoreCode').alias('nstores'),max('diff').alias('max_latency'))
        jdbcDF3 = jdbcDF3.withColumn('current_date',
                                     date_sub('MaxDate', int(cw)))
        jdbcDF3 = jdbcDF3.withColumn(
            "recency", datediff(col("MaxDate"), col("current_date")))
        jdbcDF3 = jdbcDF3.withColumn(
            "Tenure", datediff(col("MinDate"), col("current_date")))
        jdbcDF3 = jdbcDF3.withColumn(
            "latency",
            datediff(col("MinDate"), col("MaxDate")) /
            (col('total_visits') - 1))

        jdbcDF3 = jdbcDF3.join(jdbcDFsku, 'Mobile', 'left')
        jdbcDF3 = jdbcDF3.join(jdbcDFdep, 'Mobile', 'left')
        jdbcDF3 = jdbcDF3.join(jdbcDFmem, 'Mobile', 'left')

        jdbcDF3 = jdbcDF3.toDF(*[c.lower() for c in jdbcDF3.columns])
        #row = jdbcDF3.limit(1).collect()[0].asDict()
        #myvars = [i for i,j in row.items()]
        '''
        myvars = jdbcDF3.columns
        modelvars = ['mobile', 'gender', 'tier', 'state', 'city', 'total_txn',
               'total_visits', 'total_amount', 'pointscollected', 'lapsedpoints',
               'availablepoints', 'max_latency', 'recency', 'tenure', 'latency',
               'labels'] 
        columns = list(set(myvars).intersection(set(modelvars)))
        '''
        columns = jdbcDF3.columns
        jdbcDF3 = jdbcDF3.select(*columns)
        os.system('mkdir -p ' + mysqldetails[1])
        jdbcDF3.write.option(
            "header",
            "true").mode("overwrite").csv('./' + mysqldetails[1] +
                                          '/churn_data_semifinal1.csv')
        #jdbcDF3.write.format('com.databricks.spark.csv').save('churn_data_semifinal1.csv',header= 'true')
        data = probchurn.data_process(self, spark, columns)
        probchurn.classify(self, mysqldetails, 'lightgbm', data)
        print("churn modelling done!")
Exemplo n.º 45
0
 def unpartitioned_window(self):
     return Window.partitionBy()
############################################################
### Temporal split
############################################################

## create train and test dictionaries
train = {}
test = {}

# import pyspark.sql functions
from pyspark.sql.functions import lit, col, row_number
from pyspark.sql.window import Window

# select date column to sort
w = Window.orderBy('clean_date')

## 50/50 split
for i in data_array:

    # store middle question count
    hlfwy_mrk = datasets[i].count() / 2

    # add index column with sorted data
    datasets[i] = datasets[i].withColumn('index', row_number().over(w))
    train[i] = datasets[i].filter(datasets[i]['index'] <= lit(hlfwy_mrk))
    test[i] = datasets[i].filter(datasets[i]['index'] > lit(hlfwy_mrk))
    numer = test[i].count()
    denom = numer + train[i].count()
    frac = numer / denom
    print(f'{i}: {frac}, from {numer} test and {denom} total')
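# A hedged alternative sketch for the same approximate 50/50 temporal split that
# uses percent_rank over the date ordering instead of a separate count() pass
# (illustrative only; not part of the loop above):
from pyspark.sql.functions import percent_rank


def temporal_half_split(df, date_col='clean_date'):
    w = Window.orderBy(date_col)
    ranked = df.withColumn('pr', percent_rank().over(w))
    train_df = ranked.filter(col('pr') <= 0.5).drop('pr')
    test_df = ranked.filter(col('pr') > 0.5).drop('pr')
    return train_df, test_df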
Exemplo n.º 47
0
# substract the time with 870  to get the minute indicator
trade_df = trade_df.withColumn(
    'MinuteIndicator', trade_df['Hour'] * 60 + trade_df['Minute'] - 870)
# select the data during trading hours, trading volume larger than 0 and drop any missing values on trade price
trade_df = trade_df.\
    filter((trade_df['MinuteIndicator'] >= 0) & (trade_df['MinuteIndicator'] <= 390) & (trade_df['Volume']>0)).\
    dropna(subset=('Price'))
# delete rows with ticker contains '![/', all these rows are minute-by-minute summary rather than real trade
trade_df = trade_df.filter(~trade_df['Ticker'].contains('![/'))
# drop duplicates minutes for each stock and
trade_4_computing_ret = trade_df.groupBy(['Ticker', 'MinuteIndicator']).max('SeqNo')\
                        .withColumnRenamed('max(SeqNo)', 'SeqNo')\
                        .join(trade_df, ['Ticker', 'SeqNo', 'MinuteIndicator'])\
                        .orderBy(['Ticker', 'MinuteIndicator'])\
                        .select('Ticker', 'MinuteIndicator', 'Price')
windowSpec = Window.partitionBy('Ticker').orderBy('MinuteIndicator')
trade_4_computing_ret = trade_4_computing_ret.withColumn(
    'Lag Price',
    lag(trade_4_computing_ret['Price']).over(windowSpec))
trade_4_computing_ret = trade_4_computing_ret.withColumn('Ret',
                                                         log(trade_4_computing_ret['Price']/trade_4_computing_ret['Lag Price']))\
                                                         .dropna(subset=('Ret'))\
                                                         .orderBy('Ticker', 'MinuteIndicator')
final_df = pd.DataFrame(
    trade_4_computing_ret.collect(),
    columns=['Ticker', 'MinuteIndicator', 'Price', 'Lag Price', 'Ret'])
new = pd.DataFrame(final_df.Ticker.unique(), [range(1, 391)]*len(final_df.Ticker.unique()))\
                   .reset_index()\
                   .explode('index').rename(columns={'index': 'MinuteIndicator', 0: 'Ticker'})
df_for_plot = new.merge(final_df[['Ticker', 'MinuteIndicator', 'Ret']], how='left', on=['Ticker', 'MinuteIndicator'])\
                        .fillna(0)

def collect_numeric_metric(metric, df, population):
    cdf = df.select(df[metric['src']])
    cdf = cdf.dropna(subset=metric['src'])
    cdf = cdf.select(cdf[metric['src']].cast('float').alias('bucket'))

    total_count = cdf.count()
    num_partitions = total_count / 500
    ws = Window.orderBy('bucket')
    cdf = cdf.select(
        cdf['bucket'],
        cume_dist().over(ws).alias('c'),
        row_number().over(ws).alias('i'))
    cdf = cdf.filter("i = 1 OR i %% %d = 0" % num_partitions)
    cdf = cdf.collect()

    # Collapse rows with duplicate buckets.
    collapsed_data = []
    prev = None
    for d in cdf:
        if not collapsed_data:
            collapsed_data.append(d)  # Always keep first record.
            continue
        if prev and prev['bucket'] == d['bucket']:
            collapsed_data.pop()
        collapsed_data.append(d)
        prev = d

    # Calculate `p` from `c`.
    data = []
    prev = None
    for i, d in enumerate(collapsed_data):
        p = d['c'] - prev['c'] if prev else d['c']
        data.append({
            'bucket': d['bucket'],
            'c': d['c'],
            'p': p,
        })
        prev = d
    """
    Example of what `data` looks like now::

        [{'bucket': 0.0,        'c': 0.00126056, 'p': 0.00126056},
         {'bucket': 3.0,        'c': 0.00372313, 'p': 0.00246256},
         {'bucket': 4.0,        'c': 0.00430616, 'p': 0.0005830290622683026},
         {'bucket': 6.13319683, 'c': 0.00599801, 'p': 0.00169184},
         {'bucket': 8.0,        'c': 0.08114486, 'p': 0.07514685},
         {'bucket': 8.23087882, 'c': 0.08197282, 'p': 0.00082795},
         ...]
    """
    # Push data to database.
    sql = ("INSERT INTO api_numericcollection "
           "(num_observations, population, metric_id, dataset_id) "
           "VALUES (%s, %s, %s, %s) "
           "RETURNING id")
    params = [total_count, population, metric['id'], dataset_id]
    if DEBUG_SQL:
        collection_id = 0
        print sql, params
    else:
        cursor.execute(sql, params)
        conn.commit()
        collection_id = cursor.fetchone()[0]

    for d in data:
        sql = ("INSERT INTO api_numericpoint "
               "(bucket, proportion, collection_id) "
               "VALUES (%s, %s, %s)")
        params = [d['bucket'], d['p'], collection_id]
        if DEBUG_SQL:
            print sql, params
        else:
            cursor.execute(sql, params)

    if not DEBUG_SQL:
        conn.commit()