Example #1
    def prepare_df(df):
        num_rows = df.count()

        # Expand dates.
        df = expand_date(df)

        df = df \
            .withColumn('Open', df.Open != '0') \
            .withColumn('Promo', df.Promo != '0') \
            .withColumn('StateHoliday', df.StateHoliday != '0') \
            .withColumn('SchoolHoliday', df.SchoolHoliday != '0')

        # Merge in store information.
        store = store_csv.join(store_states_csv, 'Store')
        df = df.join(store, 'Store')

        # Merge in Google Trend information.
        google_trend_all = prepare_google_trend()
        df = df.join(google_trend_all, ['State', 'Year', 'Week']).select(df['*'], google_trend_all.trend)

        # Merge in Google Trend for whole Germany.
        google_trend_de = google_trend_all[google_trend_all.file == 'Rossmann_DE']
        google_trend_de = google_trend_de.withColumnRenamed('trend', 'trend_de')
        df = df.join(google_trend_de, ['Year', 'Week']).select(df['*'], google_trend_de.trend_de)

        # Merge in weather.
        weather = weather_csv.join(state_names_csv, weather_csv.file == state_names_csv.StateName)
        df = df.join(weather, ['State', 'Date'])

        # Fix null values.
        df = df \
            .withColumn('CompetitionOpenSinceYear', F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900))) \
            .withColumn('CompetitionOpenSinceMonth', F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1))) \
            .withColumn('Promo2SinceYear', F.coalesce(df.Promo2SinceYear, F.lit(1900))) \
            .withColumn('Promo2SinceWeek', F.coalesce(df.Promo2SinceWeek, F.lit(1)))

        # Days & months competition was open, cap to 2 years.
        df = df.withColumn('CompetitionOpenSince',
                           F.to_date(F.format_string('%s-%s-15', df.CompetitionOpenSinceYear,
                                                     df.CompetitionOpenSinceMonth)))
        df = df.withColumn('CompetitionDaysOpen',
                           F.when(df.CompetitionOpenSinceYear > 1900,
                                  F.greatest(F.lit(0), F.least(F.lit(360 * 2), F.datediff(df.Date, df.CompetitionOpenSince))))
                           .otherwise(0))
        df = df.withColumn('CompetitionMonthsOpen', (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

        # Days & weeks of promotion, cap to 25 weeks.
        df = df.withColumn('Promo2Since',
                           F.expr('date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'))
        df = df.withColumn('Promo2Days',
                           F.when(df.Promo2SinceYear > 1900,
                                  F.greatest(F.lit(0), F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))))
                           .otherwise(0))
        df = df.withColumn('Promo2Weeks', (df.Promo2Days / 7).cast(T.IntegerType()))

        # Check that we did not lose any rows through inner joins.
        assert num_rows == df.count(), 'lost rows in joins'
        return df
Example #2
    def count_time_since(self, col_for_time_since, time_since_date):
        '''Columns passed for calculating the duration need to be in timestamp
        format; time_since_date should be in dd/MM/yyyy format.'''
        # print "COUNT TIME SINCE - "
        # self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.count())
        self._data_frame = self._data_frame.withColumn(
            col_for_time_since + '_temp', self.to_date_(col_for_time_since))
        uniqueVals = self._data_frame.select(
            col_for_time_since +
            '_temp').distinct().na.drop().limit(1000).collect()
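        # Infer the date format from the sampled distinct values, convert the
        # column to dates, and take the day difference against time_since_date
        # (dd/MM/yyyy); the except branch falls back to using the column as-is.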
        try:
            date_format = self._metaHelperInstance.get_datetime_format(
                uniqueVals)
            self._data_frame = self._data_frame.withColumn(
                "TIME_SINCE_DATE", F.lit(time_since_date))
            to_date_udf = udf(
                lambda x: datetime.strptime(x, date_format)
                if x is not None else x, DateType())
            self._data_frame = self._data_frame.withColumn(
                col_for_time_since + '_temp',
                to_date_udf(col(col_for_time_since + '_temp')))
            self._data_frame = self._data_frame.withColumn(
                "TIME_SINCE_DATE(Timestamped)",
                to_timestamp(self._data_frame["TIME_SINCE_DATE"],
                             "dd/MM/yyyy"))
            self._data_frame = self._data_frame.withColumn(
                col_for_time_since + "_time_since",
                datediff(self._data_frame["TIME_SINCE_DATE(Timestamped)"],
                         self._data_frame[col_for_time_since + '_temp']))
            self._data_frame = self._data_frame.drop(
                "TIME_SINCE_DATE", "TIME_SINCE_DATE(Timestamped)")
        except TypeError:
            self._data_frame = self._data_frame.withColumn(
                "TIME_SINCE_DATE", F.lit(time_since_date))
            self._data_frame = self._data_frame.withColumn(
                "TIME_SINCE_DATE(Timestamped)",
                to_timestamp(self._data_frame["TIME_SINCE_DATE"],
                             "dd/MM/yyyy"))
            self._data_frame = self._data_frame.withColumn(
                col_for_time_since + "_time_since",
                datediff(self._data_frame["TIME_SINCE_DATE(Timestamped)"],
                         self._data_frame[col_for_time_since + '_temp']))
            self._data_frame = self._data_frame.drop(
                "TIME_SINCE_DATE", "TIME_SINCE_DATE(Timestamped)")

        # self._data_frame = self._data_frame.withColumn(col_for_time_since, to_timestamp(self._data_frame[col_for_time_since], "dd/MM/yyyy").alias(col_for_time_since))
        # self._data_frame = self._data_frame.withColumn(col_for_time_since, F.from_unixtime(F.unix_timestamp(self._data_frame[col_for_time_since]), "dd/MM/yyyy").alias(col_for_time_since))
        self._data_frame = self._data_frame.drop(col_for_time_since + '_temp')
        return self._data_frame
Example #3
def clean_data(df):
    get_people_type = udf(people_type, IntegerType())
    get_trip_type = udf(trip_type, IntegerType())
    compare = udf(lambda x, y: int(x != y), IntegerType())
    greater_than_zero = udf(lambda x: max(x, 0))

    df = df.na.fill({'orig_destination_distance': 0})

    df = df.withColumn(
        'people_type', get_people_type(df.srch_children_cnt,
                                       df.srch_adults_cnt))
    # df.drop(['srch_children_cnt','srch_adults_cnt'], axis=1, inplace=True)

    # df.drop(['plan_days','travel_days'], axis=1, inplace=True)
    df = df.withColumn('foreign',
                       compare(df.user_location_country, df.hotel_country))
    df = df.withColumn('diff_conti',
                       compare(df.posa_continent, df.hotel_continent))
    df = df.withColumn('plan_hour', hour(df.date_time))

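    # Bookings (is_booking == 1) get trip-duration features from datediff on
    # the check-in/check-out dates; click-only rows are split off below as df2.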
    df1 = df.filter(df.is_booking == 1)

    df1 = df1.na.drop()

    df1 = df1.withColumn(
        'plan_days', greater_than_zero(datediff(df1.srch_ci, df1.date_time)))
    df1 = df1.withColumn('travel_days',
                         greater_than_zero(datediff(df1.srch_co, df1.srch_ci)))
    df1 = df1.withColumn('travel_month', month(df1.srch_ci))
    df1 = df1.withColumn('price_multiplier', df1.travel_days * df1.srch_rm_cnt)
    df1 = df1.withColumn('trip_type',
                         get_trip_type(df1.plan_days, df1.travel_days))

    df1 = df1.select([
        c for c in df1.columns
        if c not in ['is_booking', 'srch_ci', 'srch_co', 'date_time']
    ])

    df2 = df.filter(df.is_booking == 0)
    df2 = df2.select([
        c for c in df2.columns
        if c not in ['is_booking', 'srch_ci', 'srch_co', 'date_time']
    ])
    df2 = df2.na.drop()

    hotel_prob(df1, PATH + 'hotel_destinations_all1.csv')
    hotel_prob(df2, PATH + 'hotel_destinations_all2.csv')

    return df1, df2
Example #4
 def process_for_counts_and_ageing_days(self, filtered_df):
     mapping_expr = F.create_map(
         [F.lit(x) for x in chain(*AGEING_DAY_WEIGHT_MAP.items())])
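     # Look up an AgeScore weight for each record's age (days since CallDate)
     # in AGEING_DAY_WEIGHT_MAP, then aggregate success/drop counts per cell.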
     count_segregation_df = filtered_df.withColumn("SuccessFlag",
                                                   F.when(filtered_df.CallStatus == 'Success', 1).otherwise(0))\
         .withColumn("DropFlag", F.when(filtered_df.CallStatus == 'Drop', 1).otherwise(0))\
         .withColumn("Age", F.datediff(F.lit(datetime.datetime.today().date()), F.col("CallDate")))\
         .withColumn("AgeScore", mapping_expr.getItem(F.datediff(F.lit(datetime.datetime.today().date()),
                                                                 F.col("CallDate")))).\
         groupBy(F.col("MSISDN"), F.col("CallDate"), F.col("CellId"), F.col('Age'), F.col('AgeScore')).\
         agg(F.sum('SuccessFlag').alias('SuccessCount'), F.sum('DropFlag').alias('DropCount'),
             F.sum(F.col('SuccessFlag') + F.col('DropFlag')).alias("CellCallCount"),
             (F.sum('DropFlag') / F.sum(F.col('SuccessFlag')+F.col('DropFlag'))).alias('DropRatio')).\
         orderBy(F.col('CellId'), ascending=True)
     return count_segregation_df
Example #5
def experiment_pings_df(pings_df, membership_df, observation_period):
    """
    get the subset of pings that are:
        1: in the membership df (by client_id)
        2: activity date is after enrollment
           (at least the day after) and before end of observation period

    note: if the pings_df has a branch column, join on (client_id, branch)
    """
    # if observation period is negative, get the days preceding
    # if positive, get days following
    top = observation_period
    bottom = 0
    if observation_period < 0:
        top = 0
        bottom = observation_period

    # rename column so joining is easier
    pings_df = pings_df.withColumn('client_id_pings',
                                   F.col('client_id')).drop('client_id')

    # if branch exists, join on client_id, branch, date range
    if 'branch' in pings_df.columns:
        pings_df = pings_df.withColumn('branch_pings',
                                       F.col('branch')).drop('branch')
        df = membership_df.join(
            pings_df,
            (F.col('client_id') == F.col('client_id_pings'))
            & (F.col('branch') == F.col('branch_pings'))
            & (F.datediff(F.col('activity_dt'), F.col('enrollment_dt')) < top)
            & (F.datediff(F.col('activity_dt'), F.col('enrollment_dt')) >
               bottom),  # no enroll day pings
            how='left')
        df = df.drop('branch_pings')

    else:
        df = membership_df.join(
            pings_df,
            (F.col('client_id') == F.col('client_id_pings'))
            & (F.datediff(F.col('activity_dt'), F.col('enrollment_dt')) < top)
            & (F.datediff(F.col('activity_dt'), F.col('enrollment_dt')) >
               bottom),  # no enroll day pings
            how='left')

    # clean up columns
    df = df.drop('client_id_pings')

    return df
Example #6
def glean_2(invoice_df, line_item_df):
    invoice = invoice_df.alias('invoice')
    line_item = line_item_df.alias('line_item')
    joined_table = invoice.join(line_item,
                                invoice.invoice_id == line_item.invoice_id,
                                how='left')
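    # Take the latest line-item period_end_date per invoice and flag invoices
    # whose covered period ends more than 90 days after the invoice date.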
    glean2 = joined_table.groupBy(
        invoice.invoice_id, invoice.canonical_vendor_id, invoice.invoice_date,
        invoice.period_end_date).agg(
            max_('line_item.period_end_date').alias('max_line_end_date'))
    glean2 = glean2.withColumn(
        'end_date', funcs.greatest('max_line_end_date',
                                   invoice.period_end_date))
    glean2 = glean2.withColumn('difference',
                               funcs.datediff('end_date', 'invoice_date'))
    glean2 = glean2[glean2['difference'] > 90]
    glean2 = glean2.withColumn('glean_location', funcs.lit('invoice'))
    glean2 = glean2.withColumn("glean_type", funcs.lit('accrual_alert'))
    glean2 = glean2.withColumn(
        "glean_text",
        funcs.concat(
            funcs.lit('Line items from vendor '),
            funcs.col('canonical_vendor_id'),
            funcs.lit(' in this invoice cover future periods (through '),
            funcs.col('end_date'), funcs.lit(' )')))
    glean2 = glean2.withColumn('glean_date', funcs.col('invoice_date'))
    return glean2
Example #7
    def execute(self):
        """Execute the link.

        :returns: status code of execution
        :rtype: StatusCode
        """
        ds = process_manager.service(DataStore)

        # --- your algorithm code goes here
        self.logger.debug('Now executing link: {link}.', link=self.name)

        df = ds[self.read_key]

        window = Window.orderBy(self.datetime_col) \
            .rowsBetween(0, Window.unboundedFollowing)

        if self.partitionby_cols:
            window = window.partitionBy(self.partitionby_cols)

        date_col = f.to_date(self.datetime_col)  # only dates per row
        reset_col = f.when(f.col(self.event_col) > 0,
                           date_col)  # dates of events
        bfilled_reset_col = f.first(reset_col, ignorenulls=True).over(
            window)  # backfilled dates of events
        countdown_col = f.datediff(
            bfilled_reset_col,
            date_col)  # difference between next event date and today

        ds[self.store_key] = df.withColumn(self.countdown_col_name,
                                           countdown_col)

        return StatusCode.Success
Example #8
 def generate_age_information(self, dcc_experiment_df: DataFrame,
                              mice_df: DataFrame):
     experiment_df_a = dcc_experiment_df.alias("exp")
     mice_df_a = mice_df.alias("mice")
     dcc_experiment_df = experiment_df_a.join(
         mice_df_a,
         (experiment_df_a["specimenID"] == mice_df["_specimenID"])
         & (experiment_df_a["_centreID"] == mice_df["_centreID"]),
         "left_outer",
     )
     dcc_experiment_df = self._add_special_dates(dcc_experiment_df)
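     # ageInDays: days from the mouse's date of birth to the most relevant
     # date (blood collection, else sacrifice, else the experiment date).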
     dcc_experiment_df = dcc_experiment_df.withColumn(
         "ageInDays",
         datediff(
             when(
                 col("_dateOfBloodCollection").isNotNull(),
                 col("_dateOfBloodCollection"),
             ).when(
                 col("_dateOfSacrifice").isNotNull(),
                 col("_dateOfSacrifice")).otherwise(
                     col("_dateOfExperiment")),
             col("mice._DOB"),
         ),
     )
     dcc_experiment_df = dcc_experiment_df.withColumn(
         "ageInWeeks",
         udf(lambda x: math.floor(x / 7) if x is not None else None,
             IntegerType())(col("ageInDays")),
     )
     return dcc_experiment_df.select("exp.*", "ageInWeeks", "ageInDays")
Example #9
    def transform_immigration(self, immigration_input_path: str,
                              immigration_output_path: str,
                              date_output_path: str) -> bool:

        # read immigration data
        imgDF = self._read_data(immigration_input_path, 'immigration')
        # transform
        print('Starting transform immigration dataset ...')
        timer = time.time()

        # clean immigration data
        imgDF = self._cleaner.clean_immigration_data(imgDF)

        # extract dim dates
        self._extract_dim_date(imgDF, date_output_path)

        imgDF = imgDF.withColumn(
            'stay_days',
            F.datediff(F.col('allowed_stay_date'), F.col('arrival_date')))

        print(
            f'Finished transform immigration dataset in: {(time.time() - timer):06.2f} sec.'
        )

        print_data_info(imgDF, 'immigrationDF')

        # write
        self._write_data(imgDF, immigration_output_path, 'immigration')

        return True
Example #10
File: spark.py  Project: TrololoLi/Spark
def process(spark, input_file, target_path):

    df = spark.read.parquet(input_file)

    df = df.withColumn('click', F.when(F.col('event')=='click',1).otherwise(0))\
            .withColumn('view', F.when(F.col('ad_cost_type')=='view',1).otherwise(0))\
            .withColumn('is_cpm', F.when(F.col('ad_cost_type')=='CPM',1).otherwise(0))\
            .withColumn('is_cpc', F.when(F.col('ad_cost_type')=='CPC',1).otherwise(0))


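    # Per-ad aggregates: click/view counts, CTR, and day_count (datediff
    # between the first and last date the ad was observed).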
    df_ctr = df.groupBy('ad_id')\
                .agg(F.sum('click').alias('num_of_clicks'),F.sum('view').alias('num_of_views'),F.min('date').alias('min'),F.max('date').alias('max'))\
                .withColumn('CTR', (col('num_of_clicks')/col('num_of_views')))\
                .withColumn('day_count',F.datediff(col('max'),col('min')))

    df = df.drop('date', 'time', 'event', 'platform', 'client_union_id',
                 'compaign_union_id', 'ad_cost_type', 'click',
                 'view').distinct()

    final_data = df_ctr.join(df, ['ad_id'],
                             'right').select('ad_id', 'target_audience_count',
                                             'has_video', 'is_cpm', 'is_cpc',
                                             'ad_cost', 'day_count', 'CTR')

    a, b, c = final_data.randomSplit([0.5, 0.25, 0.25])

    a.coalesce(1).write.parquet(target_path + '/train')
    b.coalesce(1).write.parquet(target_path + '/test')
    c.coalesce(1).write.parquet(target_path + '/validate')
Example #11
    def build_static_features(self):
      '''
      Features:

      gender
      state
      useragent
      customer_age (for how long the customer has joined)
      '''
      gender = self.df.select('userId', F.when(F.col('gender')=="F", 1).otherwise(0).alias('female')).distinct()

      state = self.one_hot_encode(self.df.select('userId', F.explode(self.extract_state('location')).alias('state')).distinct(), 'state')

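      # customer_age: days between the latest timestamp in the data
      # (self.max_time) and the user's registration date.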
      customer_age = self.df\
                     .select('userId',self.clean_ts('registration').alias("registration"))\
                     .select('userId',
                             F.datediff(F.to_date(F.lit(self.max_time),"yyyy-MM-dd"), 
                                        F.to_date('registration',"yyyy-MM-dd")).alias('customer_age'))\
                     .distinct()

      agent_one_hot = self.one_hot_encode(self.df\
                                          .select('userId', F.explode(self.clean_userAgent('userAgent')))\
                                          .distinct()\
                                          .withColumnRenamed('col','agent'), 
                                          'agent')
      # note: the one-hot state features are built above but not joined in here
      return gender\
            .join(customer_age, on="userId", how='outer')\
            .join(agent_one_hot, on="userId", how='outer')
Example #12
def get_new_features(df: DataFrame):
    return (df.withColumn(
        "AgeGroup",
        f.concat((f.col("AgeGroup") * 5).cast("int").cast("string"),
                 f.lit("-"),
                 (f.col("AgeGroup") * 5 + 5).cast("int").cast("string")),
    ).withColumn(
        "MaturityDateDelay",
        f.datediff("MaturityDate_Last", "MaturityDate_Original")).withColumn(
            "AmountNotGranted",
            f.col("AppliedAmount") - f.col("Amount")).withColumn(
                "DaysToFirstPayment",
                f.datediff("FirstPaymentDate",
                           "LoanDate")).drop("MaturityDate_Original",
                                             "AppliedAmount",
                                             "FirstPaymentDate"))
Example #13
def transform_immigration_dataset(self):
    
    df_spark=spark.read.parquet('../workspace/immigration_files/sas_data/*.parquet')

    df_immi_fact=df_spark.fillna('Not available', subset=['gender','occup','i94addr']) 

    df_date=df_immi_fact.withColumn("arrival_date", udf_datetime_from_sas("arrdate")).withColumn("departure_date", udf_datetime_from_sas("depdate"))

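    # Cast the numeric columns to integers and add no_of_days_stayed as the
    # datediff between departure_date and arrival_date.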
    df_date=df_date.withColumn("cicid", df_date["cicid"].cast(IntegerType())) \
                    .withColumn("i94yr",df_date["i94yr"].cast(IntegerType())) \
                    .withColumn("i94mon",df_date["i94mon"].cast(IntegerType())) \
                    .withColumn("i94cit",df_date["i94cit"].cast(IntegerType())) \
                    .withColumn("i94res",df_date["i94res"].cast(IntegerType())) \
                    .withColumn("i94mode",df_date["i94mode"].cast(IntegerType())) \
                    .withColumn("i94addr",df_date["i94addr"])\
                    .withColumn("i94bir",df_date["i94bir"].cast(IntegerType()))\
                    .withColumn("i94visa",df_date["i94visa"].cast(IntegerType()))\
                    .withColumn("count",df_date["count"].cast(IntegerType()))\
                    .withColumn("biryear",df_date["biryear"].cast(IntegerType()))\
                    .withColumn("no_of_days_stayed", datediff(col("departure_date"),col("arrival_date")))     

    immigration=df_date.select('cicid','i94yr','i94mon',\
                               'i94cit','i94res','i94port','i94mode',\
                               'i94addr','i94bir','i94visa',\
                               'visapost','entdepa','biryear',\
                               'gender','airline','visatype','arrival_date',\
                               'departure_date','no_of_days_stayed')

    immigration.write.partitionBy("i94mon").mode("append").parquet(save_path +"/sasdata/")
Example #14
    def age_calculator(self, columns, age_colname):
        """
        @columns: a string of column name
        @age_colname: a string of the new column of age, e.g.'age'
        """
        assert isinstance(columns,
                          str), "Error: columns argument must be a string!"
        assert isinstance(
            age_colname,
            str), "Error: the name for age column argument must be a string!"

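        # Parse the date strings, drop the time component, and compute the age
        # in whole years as datediff(current_date(), date) / 365.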
        df = self._df.select(
            columns,
            from_unixtime(unix_timestamp(columns,
                                         'MM/dd/yyy')).alias('with time'))
        df = df.withColumn('without time', df['with time'].cast(DateType()))
        df = df.withColumnRenamed(columns, 'tmp')
        df = df.withColumn(age_colname, (
            (datediff(current_date(), col('without time')).cast(FloatType())) /
            365).cast(IntegerType()))
        self._df = self._df.join(
            df,
            col(columns) == col('tmp'),
            'leftouter').drop('tmp').drop('with time').drop('without time')

        return self
Example #15
def preprocessing(spark_df):
    smart_feature_columns=[column for column in spark_df.columns if 'smart' in column]


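    # 7-day rolling window per disk (model, serial_number), ordered by the day
    # offset from 2017-07-01; used below for range and stddev features.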
    window_spec_7 = Window.partitionBy('model', 'serial_number').orderBy(
        F.datediff(F.col('dt'), F.lit('2017-07-01'))).rangeBetween(-7, 0)
    prefix_window7='window_7_'
    for smart_col in smart_feature_columns:
        spark_df=spark_df.withColumn(smart_col,F.col(smart_col).cast(DoubleType()))
        if smart_col in ['smart_1_normalized','smart_5raw','smart_7_normalized','smart_194raw','smart_199raw',
                         'smart_190raw','smart_191raw','smart_193raw','smart_195_normalized','smart_195raw']:
            spark_df = spark_df.withColumn(prefix_window7 + 'range_' + smart_col,
                                         F.max(F.col(smart_col)).over(window_spec_7) - F.min(F.col(smart_col)).over(
                                             window_spec_7))
            spark_df = spark_df.withColumn(prefix_window7 + 'std_' + smart_col,
                                         F.stddev(F.col(smart_col)).over(window_spec_7))
        #if smart_col in ['smart_187raw','smart_188raw','smart_197raw','smart_198raw']:
        #    spark_df=spark_df.withColumn(smart_col,F.when(F.col(smart_col)>0,1).otherwise(0))
        #if smart_col in ['smart_187_normalized','smart_188_normalized','smart_197_normalized','smart_198_normalized']:
        #    spark_df=spark_df.withColumn(smart_col,F.when(F.col(smart_col)<100,1).otherwise(0))
        if smart_col in ['smart_4raw','smart_5raw','smart_191raw',
                         'smart_187raw','smart_197raw','smart_198raw',
                         'smart_199raw','window_7_range_smart_199raw']:
            spark_df=spark_df.withColumn(smart_col,F.log2(F.col(smart_col)+F.lit(1.)))

    spark_df=spark_df.withColumn('smart_199raw',F.col('smart_199raw')*F.col('window_7_range_smart_199raw'))

    spark_df = spark_df.withColumn('anomaly_sum',
                                   F.col('smart_4raw') / 12 + F.col('smart_5raw') / 16  + F.col('smart_191raw') / 18
                                    + F.col('smart_198raw')/18 +F.col('smart_197raw')/18+F.col('smart_187raw')/15)

    return spark_df
Example #16
def add_datediff(df, date_col, start_date):
    """Add a date index column: days since start_date, plus its square and
    square root.
    """
    df = df.withColumn('datediff',
                       F.datediff(F.col(date_col), F.lit(start_date)))
    df = df.withColumn('datediff_square', F.pow(F.col('datediff'), 2))
    df = df.withColumn('datediff_square_root', F.pow(F.col('datediff'), 0.5))
    return df
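# Hypothetical usage: add_datediff(df, 'sale_date', '2020-01-01') adds the
# 'datediff', 'datediff_square' and 'datediff_square_root' columns.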
Example #17
def get_target(df: DataFrame):
    w = Window.partitionBy("LoanID").orderBy("Date")
    return (df.withColumn(
        "nextPaymentDiff", f.datediff(
            "Date",
            f.lag("Date").over(w))).fillna(0, subset=[
                "nextPaymentDiff"
            ]).withColumn("DaysFromStart", f.datediff(
                "Date", "LoanDate")).filter(
                    f.col("DaysFromStart") <= default_prediction).withColumn(
                        "label",
                        f.when(f.col("nextPaymentDiff") < default_days,
                               0).otherwise(1)).withColumn(
                                   "FeaturesForPrediction",
                                   f.when(
                                       f.col("DaysFromStart") <=
                                       observation_period, 1).otherwise(0)))
Example #18
def countUserRegTime(accountdf, df):
    """Count users' registration tenure for a given day.
    From all registered users, select those online that day, compute the
    difference between the two dates, then count users per day difference
    and sort the result.
    """
    return df.join(accountdf, df.uid == accountdf.uid, 'inner').select(
        F.datediff(df.day, accountdf.regtime).alias('daydiff')).groupBy(
            'daydiff').count().sort(F.asc('daydiff'))
Example #19
    def __init__(self,
                 df):   

        '''
        Ingest the event data, print a summary, and build the first-level cleaned data
        '''

        print("Number of rows in the data: {:,}".format(df.count()))
        print("Number of rows with N/A user Ids: {:,}".format(df.filter("userId is Null").count()))
        print("Number of unique customers: {:,}".format(df.select('userId').distinct().count()))

        self.df = df
        self.clean_userAgent = udf(self.clean_userAgent, ArrayType(StringType()))
        self.clean_ts = udf(self.clean_ts, StringType())
        self.clean_ts_hour = udf(self.clean_ts_hour, StringType())
        self.extract_state = udf(self.extract_state, ArrayType(StringType()))

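        # Per-event view of logged-in activity; days_since_registration is the
        # datediff between the event date and the registration date.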
        self.user_activities = self.df\
                                .filter("auth = 'Logged In'")\
                                .select('userId', 
                                        'registration',
                                        self.clean_ts('registration').alias("registration_ts"),
                                        'level',
                                        F.col('ts').alias('time'),
                                        self.clean_ts('ts').alias("timestamp"),
                                        self.clean_ts_hour('ts').alias("time_in_the_day"),
                                        'sessionId',
                                        'itemInSession',
                                        'page',
                                        'artist',
                                        F.concat(F.col('artist'), F.lit(" - ") , F.col('song')).alias("singer-song"),
                                        'length'
                                        )\
                                .withColumn('days_since_registration', 
                                            F.datediff(F.to_date('timestamp',"yyyy-MM-dd"), 
                                          F.to_date('registration_ts',"yyyy-MM-dd")))
        self.max_time = self.user_activities\
                        .groupby()\
                        .max('time')\
                        .select(self.clean_ts('max(time)').alias('max_time'))\
                        .collect()[0]['max_time']

        self.user_activities = self.user_activities.withColumn('days_before_today', 
                                                               F.datediff(F.to_date(F.lit(self.max_time),"yyyy-MM-dd"), 
                                                                          F.to_date('timestamp',"yyyy-MM-dd")))
Example #20
def user_sessions_duration_less_than(duration: int, data: DataFrame):
    rows: List[Row] = data \
        .groupBy('session_id') \
        .agg(datediff(max(to_date(col('datetime'))), min(to_date(col('datetime')))).alias('session_duration')) \
        .filter(col('session_duration') <= duration) \
        .drop("session_duration").collect()

    users: List[str] = [str(row.session_id) for row in rows]
    return users
Example #21
def add_date_info_spark(df, convert=True):
    '''Take a spark df and add date info
    '''
    if convert:
        f_datestring = F.UserDefinedFunction(lambda x: x[:-4] + '+00:00',
                                             StringType())
        df = df.withColumn("first_event", f_datestring(df.first_event))
        df = df.withColumn("last_event", f_datestring(df.last_event))

    df = df.withColumn("first_event", F.to_timestamp(df.first_event))
    df = df.withColumn("last_event", F.to_timestamp(df.last_event))
    df = df.withColumn("created_at", F.to_timestamp(df.created_at))
    df = df.withColumn("updated_at", F.to_timestamp(df.updated_at))

    df = df.withColumn("recency", F.datediff(df.last_event, df.created_at))
    df = df.withColumn("time_between_first_last_event",
                       F.datediff(df.last_event, df.first_event))

    return df
Example #22
    def _discretize_time(self, column: sf.Column) -> sf.Column:
        days_since_study_start = sf.datediff(column, sf.lit(self.study_start))
        bucket = sf.floor(days_since_study_start /
                          self.bucket_size).cast("int")

        if self.bucket_rounding == "floor":
            bucket = (sf.when(
                (bucket < self.n_buckets) | bucket.isNull(),
                bucket).otherwise(self.n_buckets - 1).cast("int"))
        return bucket
Example #23
def add_features(df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    print("[wrangle.py] adding features")
    max_date = df.select(max("case_closed_date")).first()[0]
    return (
        df.withColumn("num_weeks_late", expr("num_days_late / 7 AS num_weeks_late"))
        .withColumn(
            "council_district",
            format_string("%03d", col("council_district").cast("int")),
        )
        .withColumn("zipcode", regexp_extract("request_address", r"\d+$", 0))
        .withColumn("case_age", datediff(lit(max_date), "case_opened_date"))
        .withColumn("days_to_closed", datediff("case_closed_date", "case_opened_date"))
        .withColumn(
            "case_lifetime",
            when(expr("! case_closed"), col("case_age")).otherwise(
                col("days_to_closed")
            ),
        )
    )
Example #24
def get_crm_cust_log(spark):
    # crm_cust_log: customer change-log table
    # change_type:  CO_CUST.STATUS    status change
    #               CO_CUST.CUST_SEG  tier (segment) change
    crm_cust_log = spark.sql("select cust_id,change_type,change_frm,change_to,audit_date from DB2_DB2INST1_CRM_CUST_LOG") \
        .withColumn("audit_date", f.to_date("audit_date", "yyyyMMdd")) \
        .withColumn("day_diff",f.datediff(f.current_date(), col("audit_date"))) \
        .withColumn("change_frm", f.regexp_replace(col("change_frm"), "(zz)|(ZZ)", "31")) \
        .withColumn("change_to", f.regexp_replace(col("change_to"), "(zz)|(ZZ)", "31"))
    return crm_cust_log
Example #25
def week_diff(end, start):
    """
    Compute the number of weeks between two dates.
    :param end: name of the later date column
    :param start: name of the earlier date column
    :return: a column with the week difference
    """
    # date_trunc("week", date) returns the Monday of the week that date falls in
    return datediff(date_trunc("week", col(end)), date_trunc(
        "week", col(start))) / 7
Example #26
    def set_years_between_dates(self, dataframe, start_date, end_date, years_between_field):
        """Produces a PySpark dataframe containing a field representing the number of years between two specified fields.

        :param dataframe: the PySpark dataframe
        :param start_date: the field containing the start date to use in calculating the years between fields
        :param end_date: the field containing the end date to use in calculating the years between fields
        :param years_between_field: the name of the field to create
        :returns: the PySpark dataframe containing the new field and all fields in the supplied dataframe
        """
        return(dataframe.withColumn(years_between_field, datediff(dataframe[end_date], dataframe[start_date])/365.25))
Example #27
    def inner(df):
        from pyspark.sql.functions import lag, col, datediff

        w = Window().partitionBy().orderBy(col(colName))
        df1 = (df.select(colName,
                         lag(colName).over(w).alias("lagged_col")).na.drop())
        (df1.withColumn('diff_col', datediff(
            df1[colName], df1['lagged_col'])).sort('diff_col',
                                                   ascending=False).show())
        return df
Example #28
def i94_data(month, year):
    '''
    This function creates a parquet file from immigration data
    '''
    df_spark =spark.read.format('com.github.saurfang.sas.spark').load(f'../../data/18-83510-I94-Data-2016/i94_{month}{year}_sub.sas7bdat').drop_duplicates()
    df_spark = df_spark.where(col("dtadfile").isNotNull())
    df_spark = df_spark.withColumn('stayed_days', ( df_spark['depdate'] - df_spark['arrdate']))
    df_spark = df_spark.withColumn("allowed_stay_till",  to_date("dtaddto","MMddyyyy"))
    df_spark = df_spark.withColumn('date_created', to_date("dtadfile", 'yyyyMMdd'))
    df_spark = df_spark.withColumn("allowed_stay_days", datediff('allowed_stay_till', 'date_created'))
    df_spark.createOrReplaceTempView('i94')
    query = """SELECT cicid, 
                date_created, 
                i94yr AS year, 
                i94mon AS month,
                i94cit AS citizenship,
                i94res AS resident,
                i94bir AS age,
                biryear AS birth_year,
                gender,
                occup AS occupation,
                allowed_stay_till,
                allowed_stay_days,
                stayed_days,
                i94visa AS visa_class,
                visatype AS visa_type,
                i94port AS port,
                i94mode AS mode,
                i94addr AS arrival_state,
                visapost AS visa_issued_by,
                entdepa AS arrival_flag,
                entdepd AS depart_flag,
                entdepu AS update_flag,
                matflag AS match_flag,
                insnum AS ins_number,
                airline,
                admnum AS admission_number,
                fltno AS flight_number
           FROM i94
       """
    i94 = spark.sql(query)
    i94 = i94.withColumn("cicid", i94["cicid"].cast(IntegerType()))
    i94 = i94.withColumn("year", i94["year"].cast(IntegerType()))
    i94 = i94.withColumn("month", i94["month"].cast(IntegerType()))
    i94 = i94.withColumn("citizenship", i94["citizenship"].cast(IntegerType()))
    i94 = i94.withColumn("resident", i94["resident"].cast(IntegerType()))
    i94 = i94.withColumn("age", i94["age"].cast(IntegerType()))
    i94 = i94.withColumn("birth_year", i94["birth_year"].cast(IntegerType()))
    i94 = i94.withColumn("stayed_days", i94["stayed_days"].cast(IntegerType()))
    i94 = i94.withColumn("visa_class", i94["visa_class"].cast(IntegerType()))
    i94 = i94.withColumn("mode", i94["mode"].cast(IntegerType()))
    i94 = i94.withColumn("admission_number", i94["admission_number"].cast(IntegerType()))
    i94.write.mode('overwrite').partitionBy("month", "year").parquet(output_data + "immigration/")
    print('Immigration data was saved in parquet format on S3')
Example #29
def search_clients_daily(main_summary):
    return agg_search_data(
        main_summary,
        ["client_id", "submission_date", "engine", "source"],
        list(
            map(
                agg_first,
                [
                    "country",
                    "app_version",
                    "distribution_id",
                    "locale",
                    "user_pref_browser_search_region",
                    "search_cohort",
                    "addon_version",
                    "os",
                    "os_version",
                    "channel",
                    "profile_creation_date",
                    "default_search_engine",
                    "default_search_engine_data_load_path",
                    "default_search_engine_data_submission_url",
                    "sample_id",
                ],
            )
        )
        + [
            # Count of 'first' subsessions seen for this client_day
            (
                count(when(col("subsession_counter") == 1, 1)).alias(
                    "sessions_started_on_this_day"
                )
            ),
            first(
                datediff(
                    "subsession_start_date",
                    from_unixtime(col("profile_creation_date") * 24 * 60 * 60),
                )
            ).alias("profile_age_in_days"),
            sum(col("subsession_length") / 3600.0).alias("subsession_hours_sum"),
            mean(size("active_addons")).alias("active_addons_count_mean"),
            (
                max("scalar_parent_browser_engagement_max_concurrent_tab_count").alias(
                    "max_concurrent_tab_count_max"
                )
            ),
            (
                sum("scalar_parent_browser_engagement_tab_open_event_count").alias(
                    "tab_open_event_count_sum"
                )
            ),
            (sum(col("active_ticks") * 5 / 3600.0).alias("active_hours_sum")),
        ],
    )
Example #30
def add_duration_column(df):
    """
    Calculate the duration between the arrival date and the departure date and store it in a new column.

    Parameters
    ----------
    df: pyspark.sql.dataframe.DataFrame
    """
    logger.info('adding duration column')
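    # arrdate and depdate are SAS date offsets (days since 1960-01-01); convert
    # them to calendar dates before taking the difference.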
    df = df.withColumn("arrival_dt", F.expr("date_add(to_date('1960-01-01'), arrdate)"))
    df = df.withColumn("depart_dt", F.expr("date_add(to_date('1960-01-01'), depdate)"))
    return df.withColumn("length_stay", F.datediff("depart_dt", "arrival_dt"))
Example #31
def weigthed_size(min_date, max_date, begin, end):
    """
    A vectorized approach to calculate the weight of a size in a given period.
    @param min_date first day of the observed interval (as column).
    @param max_date last day of the observed interval (as column).
    @param begin first day of the period (as lit column).
    @param end last day of the period (as lit column).
    """
    _start = when(min_date >= begin, min_date).otherwise(begin)
    _end = when((max_date < end), max_date).otherwise(end)
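    # delta: days of overlap (inclusive) between [min_date, max_date] and
    # [begin, end]; the weight is delta divided by the period length in days.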
    delta = datediff(
        from_unixtime(unix_timestamp(_end, "yyyyMMdd")),
        from_unixtime(unix_timestamp(_start, "yyyyMMdd")),
    ) + lit(1)
    delta = when((max_date < begin) | (min_date > end),
                 lit(0)).otherwise(delta)
    period = datediff(
        from_unixtime(unix_timestamp(end, "yyyyMMdd")),
        from_unixtime(unix_timestamp(begin, "yyyyMMdd")),
    ) + lit(1)
    x = delta.cast(DoubleType()) / period.cast(DoubleType())
    return x
Example #32
def get_lot_purchases(timeframe, partner, purchase):
    """
    returns the distribution of length of trip for farekeep sold by a partner over a time period ending today
    input: a period (string), a partner (string), a spark dataframe
    output: a dictionary with the bucket and the values associated
    """
    result = {}
    timeframe_is = get_date(timeframe)
    new_df = (purchase.filter(purchase.keen.timestamp >= timeframe_is)
              .filter(purchase.search_info.partner_id == partner)
              .filter(purchase.flight.inbound.departure_datetime != 'NaN')
              .withColumn("length_of_trip", datediff(purchase.flight.inbound.departure_datetime, purchase.flight.outbound.departure_datetime)))
    lots = new_df.groupBy('length_of_trip').sum('purchase.quantity').collect()
    for row in lots:
        result[row[0]] = row[1]
    return result
Example #33
def get_lot_searches(timeframe, partner, premium):
    """
    returns the distribution of length of trip for farekeep searched by a partner over a time period ending today
    input: a period (string), a partner (string), a spark dataframe
    output: a dictionary with the bucket and the values associated
    """
    result = {}
    timeframe_is = get_date(timeframe)
    # look for datediff in spark (working with string)
    new_df = (premium.filter(premium.keen.timestamp >= timeframe_is)
              .filter(premium.search_info.partner_id == partner)
              .filter(premium.flight.is_one_way == False)
              .withColumn("length_of_trip", datediff(premium.flight.return_date, premium.flight.departure_date)))
    lots = new_df.groupBy('length_of_trip').sum('flight.ticket_quantity').collect()
    for row in lots:
        result[row[0]] = row[1]
    return result
Example #34
def featurizeData(raw, gap, vocabFile, featFile):
    feats = raw.dropDuplicates(['cluster', 'series', 'date'])\
            .withColumn('day', datediff(col('date'), lit('1970-01-01')))\
            .na.drop(subset=['day'])\
            .rdd.groupBy(lambda r: r.cluster)\
            .flatMap(lambda c: clusterFeatures(c, gap))\
            .toDF()

    feats.cache()
    cv = CountVectorizer(inputCol='raw', outputCol='features', minDF=4.0)
    interner = cv.fit(feats)      # alternate possibility: grab features only from label==1 edges
    full = interner.transform(feats)
    # combiner = VectorAssembler(inputCols=realCols + ['categorial'], outputCol='features')
    # # I don't think a Pipeline will work here since we need to get the interner.vocabulary
    # full = combiner.transform(interner.transform(feats)).drop('categorial')

    full.write.parquet(featFile)
    np.savetxt(vocabFile, np.array(interner.vocabulary), fmt='%s')
    feats.unpersist()
Example #35
# assuming dateDF starts from a simple range DataFrame; the imports below are
# the ones this snippet needs
from pyspark.sql.functions import col, current_date, current_timestamp
dateDF = spark.range(10)\
  .withColumn("today", current_date())\
  .withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")


# COMMAND ----------

from pyspark.sql.functions import date_add, date_sub
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)


# COMMAND ----------

from pyspark.sql.functions import datediff, months_between, to_date
dateDF.withColumn("week_ago", date_sub(col("today"), 7))\
  .select(datediff(col("week_ago"), col("today"))).show(1)

dateDF.select(
    to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2017-05-22")).alias("end"))\
  .select(months_between(col("start"), col("end"))).show(1)


# COMMAND ----------

from pyspark.sql.functions import to_date, lit
spark.range(5).withColumn("date", lit("2017-01-01"))\
  .select(to_date(col("date"))).show(1)


# COMMAND ----------