Exemplo n.º 1
0
def add_hpg_reserve_features(data):
    """Derive HPG reservation features and aggregate them into ``data['hr_temp']``.

    Mutates ``data`` in place and returns ``None``.

    ``data['hr']`` (the HPG reservation frame) gains these columns:

    * ``visit_datetime`` / ``reserve_datetime`` parsed with ``pd.to_datetime``
    * ``visit_date`` / ``reserve_date``: calendar date rendered as ``str``
    * ``visit_time`` / ``reserve_time``: hour of day
    * ``reserve_to_visit_dow``: day of week of the *visit* datetime
    * ``reserve_ahead_in_days``: whole calendar days between reservation
      date and visit date

    ``data['hr_temp']`` is (re)created from per-(store, visit_date)
    reservation counts left-joined onto ``data['id']`` by ``hpg_store_id``,
    then passed three times through ``add_groupby_features`` (a project
    helper defined elsewhere; presumably it joins the grouped aggregates
    onto its second argument -- confirm against its definition).
    """
    hr = data['hr']

    hr['visit_datetime'] = pd.to_datetime(hr['visit_datetime'])
    hr['reserve_datetime'] = pd.to_datetime(hr['reserve_datetime'])

    hr['visit_date'] = hr['visit_datetime'].dt.date.apply(str)
    hr['visit_time'] = hr['visit_datetime'].dt.hour
    hr['reserve_date'] = hr['reserve_datetime'].dt.date.apply(str)
    hr['reserve_time'] = hr['reserve_datetime'].dt.hour
    hr['reserve_to_visit_dow'] = hr['visit_datetime'].dt.dayofweek

    # Subtract midnight-normalized datetimes instead of object-dtype
    # ``datetime.date`` values: the result is always a timedelta Series, so
    # ``.dt.days`` does not depend on pandas inferring a timedelta dtype from
    # an object-dtype subtraction.
    hr['reserve_ahead_in_days'] = (
        hr['visit_datetime'].dt.normalize()
        - hr['reserve_datetime'].dt.normalize()).dt.days

    # Number of reservations per store and visit date, attached to the id
    # mapping table.
    reservations_per_day = hr.groupby(['hpg_store_id', 'visit_date']).agg(
        {'reserve_visitors': 'count'}).reset_index()
    data['hr_temp'] = pd.merge(left=data['id'],
                               right=reservations_per_day,
                               on='hpg_store_id',
                               how='left')

    def mode(values):
        # Most frequent value (value_counts sorts by descending frequency).
        # A group holding only nulls yields NaN instead of an IndexError.
        counts = values.value_counts()
        return counts.index[0] if len(counts) else float('nan')

    def base_methods():
        # A fresh dict per call so no aggregation spec is shared between
        # successive add_groupby_features invocations.
        return {
            'reserve_visitors': ['mean', 'max', 'min', 'count', 'sum'],
            'reserve_ahead_in_days': ['mean', 'max', 'min'],
        }

    per_store_methods = base_methods()
    per_store_methods['reserve_to_visit_dow'] = [mode]

    # Aggregate per store, per visit date, then per (visit date, store).
    aggregation_plan = (
        (['hpg_store_id'], per_store_methods),
        (['visit_date'], base_methods()),
        (['visit_date', 'hpg_store_id'], base_methods()),
    )
    for groupby_cols, method_dict in aggregation_plan:
        data['hr_temp'] = add_groupby_features(hr, data['hr_temp'],
                                               groupby_cols, method_dict)
Exemplo n.º 2
0
def add_air_reserve_features(data):
    """Derive AIR reservation features and aggregate them into ``data['full']``.

    Mutates ``data`` in place and returns ``None``.

    ``data['ar']`` (the AIR reservation frame) gains these columns:

    * ``visit_datetime`` / ``reserve_datetime`` parsed with ``pd.to_datetime``
    * ``visit_date`` / ``reserve_date``: calendar date rendered as ``str``
    * ``visit_time`` / ``reserve_time``: hour of day
    * ``reserve_to_visit_dow``: day of week of the *visit* datetime
    * ``reserve_ahead_in_days``: whole calendar days between reservation
      date and visit date

    ``data['full']`` is then replaced three times by the result of
    ``add_groupby_features`` (a project helper defined elsewhere; presumably
    it joins the grouped aggregates onto its second argument -- confirm
    against its definition): grouped by store, by visit date, and by both.
    """
    ar = data['ar']

    ar['visit_datetime'] = pd.to_datetime(ar['visit_datetime'])
    ar['reserve_datetime'] = pd.to_datetime(ar['reserve_datetime'])

    ar['visit_date'] = ar['visit_datetime'].dt.date.apply(str)
    ar['visit_time'] = ar['visit_datetime'].dt.hour
    ar['reserve_date'] = ar['reserve_datetime'].dt.date.apply(str)
    ar['reserve_time'] = ar['reserve_datetime'].dt.hour
    ar['reserve_to_visit_dow'] = ar['visit_datetime'].dt.dayofweek

    # Subtract midnight-normalized datetimes instead of object-dtype
    # ``datetime.date`` values: the result is always a timedelta Series, so
    # ``.dt.days`` does not depend on pandas inferring a timedelta dtype from
    # an object-dtype subtraction.
    ar['reserve_ahead_in_days'] = (
        ar['visit_datetime'].dt.normalize()
        - ar['reserve_datetime'].dt.normalize()).dt.days

    def mode(values):
        # Most frequent value (value_counts sorts by descending frequency).
        # A group holding only nulls yields NaN instead of an IndexError.
        counts = values.value_counts()
        return counts.index[0] if len(counts) else float('nan')

    def base_methods():
        # A fresh dict per call so no aggregation spec is shared between
        # successive add_groupby_features invocations.
        return {
            'reserve_visitors': ['mean', 'max', 'min', 'count', 'sum'],
            'reserve_ahead_in_days': ['mean', 'max', 'min'],
        }

    per_store_methods = base_methods()
    per_store_methods['reserve_to_visit_dow'] = [mode]

    # Aggregate per store, per visit date, then per (visit date, store).
    aggregation_plan = (
        (['air_store_id'], per_store_methods),
        (['visit_date'], base_methods()),
        (['visit_date', 'air_store_id'], base_methods()),
    )
    for groupby_cols, method_dict in aggregation_plan:
        data['full'] = add_groupby_features(ar, data['full'], groupby_cols,
                                            method_dict)
Exemplo n.º 3
0
def _iso_week_of_year(dates):
    """Return the ISO week number of a datetime Series on old and new pandas.

    ``Series.dt.weekofyear`` was deprecated in pandas 1.1 and removed in 2.0;
    ``Series.dt.isocalendar().week`` is its replacement but returns a nullable
    ``UInt32`` column. The result is cast back to the dtype the legacy
    accessor produced (``int64``, or ``float64`` when dates are missing).
    """
    accessor = dates.dt
    if hasattr(accessor, 'isocalendar'):
        week = accessor.isocalendar().week
        return week.astype('float64' if week.isna().any() else 'int64')
    # pandas < 1.1: only the legacy accessor exists.
    return accessor.weekofyear


def add_visit_time_features(data: dict, ntrain):
    """Add calendar features and visitor aggregates to ``data['full']``.

    Mutates ``data`` in place and returns ``None``.

    Args:
        data: dict holding the ``'full'`` DataFrame, which must provide the
            ``visit_date``, ``visitors`` and ``air_store_id`` columns.
        ntrain: number of leading rows of ``data['full']`` used as the
            source of the visitor aggregates (``data['full'][:ntrain]``);
            presumably the training rows -- confirm with the caller.

    ``visit_date`` is parsed to datetime for the duration of the function
    and converted back to a ``YYYY-MM-DD`` style string at the end.
    ``add_groupby_features`` is a project helper defined elsewhere.
    """
    # Note (from the original author): no null value generated from this
    # operation.
    full = data['full']
    full['visit_date'] = pd.to_datetime(full['visit_date'])
    visit_date = full['visit_date']

    full['day'] = visit_date.dt.day
    full['month'] = visit_date.dt.month
    full['year'] = visit_date.dt.year
    full['dow'] = visit_date.dt.dayofweek
    full['doy'] = visit_date.dt.dayofyear  # original note: required np.log1p
    full['woy'] = _iso_week_of_year(visit_date)
    full['is_weekend'] = full['dow'].isin([4, 5, 6])  # Friday/Saturday/Sunday
    # Golden week: 4/29 - 5/7 of 2016 and 2017.
    full['is_golden_week'] = (
        visit_date.between('04-29-2016', '05-07-2016')
        | visit_date.between('04-29-2017', '05-07-2017'))
    full['is_month_end'] = visit_date.dt.is_month_end
    full['is_month_str'] = visit_date.dt.is_month_start

    # Every aggregation below shares the same visitor statistics.
    method_dict = {
        'visitors': ['mean', 'max', 'min', 'median', 'count', 'sum']
    }

    # Order is preserved from the original code: time-related keys, then the
    # store, then store combined with a time key.
    groupings = (
        ['dow'],
        ['is_weekend'],
        ['is_golden_week'],
        ['is_month_str'],
        ['air_store_id'],
        ['dow', 'air_store_id'],
        ['is_golden_week', 'air_store_id'],
    )
    for groupby_cols in groupings:
        data['full'] = add_groupby_features(data['full'][:ntrain],
                                            data['full'], groupby_cols,
                                            method_dict)

    # Cast back to str.
    data['full']['visit_date'] = data['full']['visit_date'].dt.date.apply(str)
Exemplo n.º 4
0
def add_full_data_feature(data, ntrain):
    """Add merged reservation, geographic-offset and visitor features.

    Mutates ``data`` in place and returns ``None``.

    Args:
        data: dict holding the ``'full'`` DataFrame.
        ntrain: number of leading rows of ``data['full']`` used as the
            source of the visitor aggregates (``data['full'][:ntrain]``).

    Steps, in order:

    1. ``merge_numericals(data['full'], merge_dict, drop=True)`` -- a project
       helper defined elsewhere; its return value is ignored, so it
       presumably works in place (confirm against its definition).
    2. ``lon_plus_lat_air`` plus, for each of air/hpg and area level lv1/lv2,
       the offset of the store's latitude/longitude from the group's mean
       (absolute value), max and min (signed).
    3. Visitor aggregates via ``add_groupby_features`` (project helper).
    4. ``visit_date`` converted to datetime.
    """
    # More features
    merge_dict = {
        'GP_air_store_id_ON_reserve_visitors_mean': 'mean',
        'GP_air_store_id_ON_reserve_visitors_max': 'max',
        'GP_air_store_id_ON_reserve_visitors_min': 'min',
        'GP_air_store_id_ON_reserve_visitors_count': 'sum',
        'GP_air_store_id_ON_reserve_visitors_sum': 'sum',
        'GP_air_store_id_ON_reserve_ahead_in_days_mean': 'mean',
        'GP_air_store_id_ON_reserve_ahead_in_days_max': 'max',
        'GP_air_store_id_ON_reserve_ahead_in_days_min': 'min',
        'GP_visit_date_ON_reserve_visitors_mean_air': 'mean',
        'GP_visit_date_ON_reserve_visitors_max_air': 'max',
        'GP_visit_date_ON_reserve_visitors_min_air': 'min',
        'GP_visit_date_ON_reserve_visitors_count_air': 'sum',
        'GP_visit_date_ON_reserve_visitors_sum_air': 'sum',
        'GP_visit_date_ON_reserve_ahead_in_days_mean_air': 'mean',
        'GP_visit_date_ON_reserve_ahead_in_days_max_air': 'max',
        'GP_visit_date_ON_reserve_ahead_in_days_min_air': 'min',
        'GP_visit_date_air_store_id_ON_reserve_visitors_mean': 'mean',
        'GP_visit_date_air_store_id_ON_reserve_visitors_max': 'max',
        'GP_visit_date_air_store_id_ON_reserve_visitors_min': 'min',
        'GP_visit_date_air_store_id_ON_reserve_visitors_count': 'sum',
        'GP_visit_date_air_store_id_ON_reserve_visitors_sum': 'sum',
        'GP_visit_date_air_store_id_ON_reserve_ahead_in_days_mean': 'mean',
        'GP_visit_date_air_store_id_ON_reserve_ahead_in_days_max': 'max',
        'GP_visit_date_air_store_id_ON_reserve_ahead_in_days_min': 'min',
    }

    merge_numericals(data['full'], merge_dict, drop=True)

    full = data['full']
    full['lon_plus_lat_air'] = full['longitude_air'] + full['latitude_air']

    # Offsets of each store's coordinates from its area-group statistics.
    # Produces, in this order, columns named e.g. 'lat_to_mean_lat_air_lv1'
    # from 'latitude_air' - 'GP_air_lv1_ON_latitude_mean'.
    for source in ('air', 'hpg'):
        for level in ('lv1', 'lv2'):
            for short, coord in (('lat', 'latitude'), ('lon', 'longitude')):
                for stat in ('mean', 'max', 'min'):
                    target = '{0}_to_{1}_{0}_{2}_{3}'.format(
                        short, stat, source, level)
                    store_col = '{}_{}'.format(coord, source)
                    group_col = 'GP_{}_{}_ON_{}_{}'.format(
                        source, level, coord, stat)
                    offset = full[store_col] - full[group_col]
                    # Only the distance to the mean is taken as absolute;
                    # max/min offsets keep their sign.
                    full[target] = abs(offset) if stat == 'mean' else offset

    method_dict = {'visitors': ['mean', 'max', 'min', 'median']}

    for groupby_cols in (['air_store_id', 'dow', 'holiday_flg'],
                         ['air_store_id', 'is_golden_week']):
        data['full'] = add_groupby_features(data['full'][:ntrain],
                                            data['full'], groupby_cols,
                                            method_dict)

    # Cast datetimes from object to datetime. Bracket assignment always
    # targets the column, never a plain attribute.
    data['full']['visit_date'] = pd.to_datetime(data['full']['visit_date'])
Exemplo n.º 5
0
def add_hpg_store_feature(data):
    """Add area-level and aggregate features to the HPG store frame.

    Mutates ``data`` in place (``data['hs']`` is extended and rebound) and
    returns ``None``.

    ``hpg_area_name`` is split on single spaces: the first token becomes
    ``hpg_lv1``, the second ``hpg_lv2`` and all remaining tokens, joined
    without a separator, ``hpg_lv3``. The frame is then enriched through
    ``add_groupby_features`` (a project helper defined elsewhere) with store
    counts per geographic/genre grouping and with latitude/longitude
    statistics per area level.
    """
    # Split every area name once and reuse the token lists for all levels.
    area_tokens = data['hs'].hpg_area_name.apply(
        lambda area_name: area_name.split(" "))
    data['hs']['hpg_lv1'] = area_tokens.apply(lambda tokens: tokens[0])
    data['hs']['hpg_lv2'] = area_tokens.apply(lambda tokens: tokens[1])
    data['hs']['hpg_lv3'] = area_tokens.apply(
        lambda tokens: "".join(tokens[2:]))

    # Count aggregation on various geo-related features.
    store_count = {'hpg_store_id': ['count']}
    count_groupings = (
        ['latitude', 'longitude'],
        ['hpg_lv1'],
        ['hpg_lv1', 'hpg_lv2'],
        ['hpg_lv1', 'hpg_lv2', 'hpg_lv3'],
        ['hpg_genre_name'],
        ['hpg_genre_name', 'hpg_lv1'],
        ['hpg_genre_name', 'hpg_lv1', 'hpg_lv2'],
        ['hpg_genre_name', 'hpg_lv1', 'hpg_lv2', 'hpg_lv3'],
    )
    for keys in count_groupings:
        data['hs'] = add_groupby_features(data['hs'], data['hs'], keys,
                                          store_count)

    # Group locations on different geo-levels.
    location_stats = {
        'latitude': ['mean', 'max', 'min'],
        'longitude': ['mean', 'max', 'min']
    }
    for keys in (['hpg_lv1'], ['hpg_lv2'], ['hpg_lv3']):
        data['hs'] = add_groupby_features(data['hs'], data['hs'], keys,
                                          location_stats)