Example #1
 def extract_feature(self):
     train = data.train_df(mode=self.mode, cluster=self.cluster)
     test = data.test_df(mode=self.mode, cluster=self.cluster)
     df = pd.concat([train, test])
     # remove last clks and last part of session
     new_df = remove_last_part_of_clk_sessions(df)
     new_df = new_df.drop(find(new_df))
     no_last_clks_numeric = new_df[
         new_df.reference.str.isnumeric() == True][[
             'user_id', 'session_id', 'action_type', 'reference'
         ]]
     # we want to make it fast and avoid any loops:
     # simply drop duplicates and keep the last occurrence
     # of the user-session-item tuple
     last_actions = no_last_clks_numeric.drop_duplicates(
         ['user_id', 'session_id', 'reference'], keep='last')
     last_actions = last_actions.rename(
         columns={
             'reference': 'item_id',
             'action_type': 'last_action_involving_impression'
         })
     last_actions.item_id = last_actions.item_id.astype(int)
     # get last clickouts and expand
     last_clk = df.loc[find(df)]
     clk_expanded = expand_impressions(last_clk)[[
         'user_id', 'session_id', 'item_id'
     ]]
     # now simply merge and fill NaNs with 'no_action' as in the original feature
     feature = pd.merge(clk_expanded,
                        last_actions,
                        how='left',
                        on=['user_id', 'session_id', 'item_id'])
     feature.last_action_involving_impression = feature.last_action_involving_impression.astype(
         object).fillna('no_action')
     return feature
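
The drop_duplicates trick described in the comments above is worth seeing in isolation: because the concatenated dataframe is in chronological order, keeping the last occurrence of each user-session-item tuple is equivalent to looping over interactions and retaining the most recent action. A minimal self-contained sketch on toy data (not the project's dataframes):

import pandas as pd

toy = pd.DataFrame({
    'user_id':     ['u1', 'u1', 'u1', 'u2'],
    'session_id':  ['s1', 's1', 's1', 's9'],
    'action_type': ['interaction item image', 'clickout item',
                    'interaction item info', 'clickout item'],
    'reference':   ['100', '100', '100', '200'],
})

# rows are chronological, so keep='last' keeps the most recent action
# performed on each user-session-item tuple
last = toy.drop_duplicates(['user_id', 'session_id', 'reference'], keep='last')
print(last.action_type.tolist())
# -> ['interaction item info', 'clickout item']
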
Example #2
def remove_last_part_of_clk_sessions(df):
    """
    This function takes a dataframe and removes the interactions that
    occur after the last clickout of each session.
    """
    df = df.sort_values(
        by=['user_id', 'session_id', 'timestamp', 'step']).reset_index(
            drop=True)
    last_indices = find(df)
    last_clks = df.loc[last_indices]
    clks_sessions = last_clks.session_id.unique().tolist()
    clks_users = last_clks.user_id.unique().tolist()
    df_last_clks_sess_only = df[(df.session_id.isin(clks_sessions))
                                & (df.user_id.isin(clks_users))][[
                                    'user_id', 'session_id', 'action_type'
                                ]]
    df_last_clks_sess_only_no_dupl = df_last_clks_sess_only.drop_duplicates(
        ['user_id', 'session_id'])
    df_last_clks_sess_only_no_dupl['last_index'] = sorted(last_indices)
    df_last_clks_sess_only_no_dupl = df_last_clks_sess_only_no_dupl.drop(
        'action_type', axis=1)
    merged = pd.merge(df_last_clks_sess_only,
                      df_last_clks_sess_only_no_dupl,
                      how='left',
                      on=['user_id', 'session_id'
                          ]).set_index(df_last_clks_sess_only.index)
    indices_to_remove = []
    for t in tqdm(zip(merged.index, merged.last_index)):
        if t[0] > t[1]:
            indices_to_remove.append(t[0])
    return df.drop(indices_to_remove)
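
The merge-plus-loop above can also be expressed as one vectorized mask, once every row knows the index of its session's last clickout. A rough sketch of the same idea, assuming the dataframe is already sorted as in the function (an illustration, not the project's implementation):

import pandas as pd

def remove_after_last_clickout(df):
    # last clickout index of each session, broadcast to every row
    # of that session (NaN for sessions with no clickout)
    is_clk = df.action_type == 'clickout item'
    last_clk = (df.index.to_series()
                  .where(is_clk)
                  .groupby([df.user_id, df.session_id])
                  .transform('max'))
    # keep sessions without clickouts, and rows up to the last clickout
    keep = last_clk.isna() | (df.index.to_series() <= last_clk)
    return df[keep]
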
Example #3

    def extract_feature(self):
        o = ImpressionFeature(self.mode)
        f = o.read_feature()
        f = f.drop(['properties'], axis=1)
        f['popularity'] = 0
        pop = dict(zip(f.item_id.values, f.popularity.values))

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        last_clickout_indices = find(df)
        df_dropped_last_clickouts = df.drop(last_clickout_indices)
        df_no_last_clickouts = df_dropped_last_clickouts[
            (df_dropped_last_clickouts.action_type == 'clickout item')
            & ~(df_dropped_last_clickouts.reference.isnull())]
        references = df_no_last_clickouts.reference.values

        for r in references:
            pop[int(r)] += 1

        final_df = pd.DataFrame(
            list(pop.items()),
            columns=['item_id', 'top_pop_interaction_clickout_per_impression'])

        return final_df
Example #4

    def extract_feature(self):
        self.current_directory = Path(__file__).absolute().parent
        self.data_dir = self.current_directory.joinpath(
            '..', '..', 'stacking', self.mode)

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        last_indices = find(df)

        # extract scores
        self.train_dir = self.data_dir.joinpath('test')
        for file in glob.glob(str(self.train_dir) + '/rnn*'):
            rnn = np.load(file)
            rnn = pd.DataFrame(
                rnn, columns=['index', 'item_recommendations', 'scores'])
            rnn = rnn.astype({'index': int})
            rnn = rnn[rnn['index'].isin(last_indices)]

        rnn_idx = list(rnn['index'])
        print(f'Rnn indices are : {len(set(rnn_idx))}')
        print(f'Last indices are : {len(last_indices)}')
        common = set(rnn_idx) & set(last_indices)
        print(f'In common : {len(common)}')

        t = assign_score(rnn, 'rnn')
        t = t.sort_values(by='index')

        df['index'] = df.index.values
        df = df[['user_id', 'session_id', 'index']]
        df = pd.merge(t, df, how='left', on=['index'])
        num_idx = len(set(df['index'].values))
        print(num_idx)
        return df[['user_id', 'session_id', 'item_id', 'score_rnn']]
Example #5

    def save_folds(df, user_session_df, train_index, test_index, count, mode):
        u_s_train = list(
            user_session_df.loc[train_index]['user_session'].values)
        u_s_test = list(user_session_df.loc[test_index]['user_session'].values)

        path = 'dataset/preprocessed/{}/{}'.format('fold_' + str(count), mode)
        check_folder(path)

        train = df[df['user_session'].isin(u_s_train)]
        train = train.drop(['user_session'], axis=1)
        train.to_csv(os.path.join(path, 'train.csv'))
        train_indices = train.index.values
        np.save(os.path.join(path, 'train_indices'), train_indices)

        test = df[df['user_session'].isin(u_s_test)].copy()
        target_indices = sorted(find(test))
        test.loc[target_indices, 'reference'] = np.nan  # .loc: .at only accepts scalar keys
        test = test.drop(['user_session'], axis=1)
        test.to_csv(os.path.join(path, 'test.csv'))
        test_indices = test.index.values
        np.save(os.path.join(path, 'test_indices'), test_indices)
        np.save(os.path.join(path, 'target_indices'), target_indices)

        print(f'Train shape : {train.shape} , Test shape : {test.shape}')
        print(f'Last clickout indices : {len(target_indices)}')
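
save_folds expects positional train/test indices over a dataframe with one row per user-session pair. A plausible driver is sketched below; it is hypothetical — the fold splitter and the 'user_session' column construction are assumptions, not shown in the source:

from sklearn.model_selection import KFold

# assumed: df already carries a 'user_session' key column, e.g.
# df['user_session'] = df['user_id'] + '_' + df['session_id']
user_session_df = (df[['user_session']]
                   .drop_duplicates()
                   .reset_index(drop=True))

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for count, (train_index, test_index) in enumerate(kf.split(user_session_df)):
    # mode value is an assumption for illustration
    save_folds(df, user_session_df, train_index, test_index, count, mode='local')
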
Example #6

    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        last_clickout_indices = find(df)
        clickout_rows = df.loc[last_clickout_indices, ['user_id','session_id','reference','action_type','impressions']]
        reference_rows = df[(df.reference.str.isnumeric() == True) & (df.action_type == 'clickout item')]

        df_item_clicks = (
            reference_rows
            .groupby(["reference"])
            .size()
            .reset_index(name="n_interactions_per_item")
        )
        df_item_clicks = df_item_clicks.rename(columns={'reference':'item_id'})
        df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)

        clk_expanded = expand_impressions(clickout_rows)
        final_feature = pd.merge(clk_expanded, df_item_clicks, how='left', on=['item_id']).fillna(0)
        final_feature.n_interactions_per_item = final_feature.n_interactions_per_item.astype(int)
        final_feature = final_feature.drop(['index'], axis=1)

        final_feature.reference = final_feature.reference.astype(int)
        new_column = []
        for t in zip(final_feature.item_id, final_feature.reference, final_feature.n_interactions_per_item):
            if t[0] == t[1]:
                new_column.append(int(t[2]-1))
            else:
                new_column.append(int(t[2]))
        final_feature['personalized_popularity'] = new_column

        final_feature_reduced = final_feature[['user_id','session_id','item_id','personalized_popularity']]

        return final_feature_reduced
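
The loop that builds new_column applies a leave-one-out correction: the clicked impression must not count its own (still-to-be-predicted) clickout in its popularity. The same correction can be written without the loop; a drop-in sketch using the columns built above:

import numpy as np

# subtract 1 only where the impression is the item actually clicked
final_feature['personalized_popularity'] = np.where(
    final_feature.item_id == final_feature.reference,
    final_feature.n_interactions_per_item - 1,
    final_feature.n_interactions_per_item)
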
Example #7
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # get all the cities
        cities = df['city'].unique().tolist()
        # get clickout rows (WITHOUT last clk)
        last_indices = find(df)
        df_non_last_clk = df.drop(last_indices)
        df_clickout = df_non_last_clk[(df_non_last_clk['action_type']=='clickout item')][['reference','city']]
        df_clickout = df_clickout.rename(columns={'reference':'item_id'})
        df_clickout = df_clickout.dropna()  # remove NaNs, which should not be there anyway
        df_clickout.item_id = df_clickout.item_id.astype(int)
        # open impressions df
        o = ImpressionFeature(mode='small')
        df_accomodations = o.read_feature(True)
        df_accomodations = df_accomodations.drop(['properties1 Star', 'properties2 Star', 'properties3 Star', 'properties4 Star', 'properties5 Star'], axis=1)
        # get all clicks properties
        df_clicks_properties = pd.merge(df_clickout, df_accomodations, how='left', on=['item_id'])
        df_clicks_properties = df_clicks_properties.sort_values(by=['city'])
        df_clicks_properties = df_clicks_properties.drop('item_id', axis=1)
        # sum all properties per city
        grouped_by_city = df_clicks_properties.groupby('city').sum()
        # create df with city:array_of_features
        df_city_features = pd.DataFrame(columns=['city','properties_array'])
        df_city_features.city = grouped_by_city.index
        df_city_features.properties_array = grouped_by_city.values.tolist()
        # now take last clk df
        clickout_rows = df.loc[last_indices,
                       ['user_id','session_id','city','action_type','impressions']]
        clickout_rows = clickout_rows[clickout_rows.action_type == 'clickout item']
        clk_expanded = expand_impressions(clickout_rows)
        clk_expanded_wt_city_feat = pd.merge(clk_expanded, df_city_features, how='left', on=['city'])
        # create df with item:array_of_features
        array = df_accomodations.drop(['item_id'],axis=1).values
        df_item_features = pd.DataFrame(columns=['item_id','features_array'])
        df_item_features['item_id'] = df_accomodations['item_id'].values
        df_item_features['features_array'] = list(array)
        final_feature = pd.merge(clk_expanded_wt_city_feat, df_item_features, how='left', on=['item_id'])
        for n in tqdm(final_feature[final_feature['properties_array'].isnull()].index.tolist()):
            final_feature.at[n,'properties_array'] = [0]*152
        # cast list to numpy array to use the cosine (it's written for doubles)
        final_feature.properties_array = final_feature.properties_array.progress_apply(lambda x: np.asarray(x))
        # create new column
        new_col =[]
        if self.metric == 'cosine':
            shrink = 0 # TRY ME
            for t in tqdm(zip(final_feature.properties_array, final_feature.features_array)):
                new_col.append(cosine_similarity(t[0].astype(np.double), t[1].astype(np.double),shrink))
        if self.metric == 'euclidean':
            for t in tqdm(zip(final_feature.properties_array, final_feature.features_array)):
                new_col.append(np.linalg.norm(t[0]-t[1]))
        # final feature
        new_feature = final_feature[['user_id','session_id','item_id']].copy()
        new_feature['city_similarity'] = new_col

        return new_feature
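
cosine_similarity here is a project helper that takes a shrink term, not scikit-learn's function, and its definition is not shown. A plausible stand-in under the usual shrunk-cosine formula from collaborative filtering (an assumption, flagged as such):

import numpy as np

def cosine_similarity(a, b, shrink=0):
    # assumed formula: dot(a, b) / (||a|| * ||b|| + shrink)
    num = float(np.dot(a, b))
    den = float(np.linalg.norm(a) * np.linalg.norm(b)) + shrink + 1e-9
    return num / den
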
Example #8

    def extract_feature(self):

        list_of_sorting_filters_wout_pop = [
            'Sort by Price', 'Sort by Distance', 'Sort by Rating',
            'Best Value', 'Focus on Rating', 'Focus on Distance'
        ]

        list_of_sorting_filters = [
            'Sort by Price', 'Sort by Distance', 'Sort by Rating',
            'Best Value', 'Focus on Rating', 'Focus on Distance',
            'Sort by Popularity'
        ]

        def mask_sorting(x):
            if np.isin(x, list_of_sorting_filters_wout_pop).any():
                return x
            else:
                return ['Sort by Popularity']

        start = time.time()
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        indices_last_clks = find(df)
        d = df[df.action_type == 'clickout item'].drop(indices_last_clks)
        d_splitted = d.current_filters.progress_apply(
            lambda x: str(x).split('|'))
        md = d_splitted.progress_apply(mask_sorting)
        df_f = df.loc[md.index]
        df_ref = df_f.reference
        dict_ref_to_filters = dict(
            zip(df_ref.unique(), [dict(zip(list_of_sorting_filters, np.zeros(len(list_of_sorting_filters))))\
                                     for i in range(len(df_ref.unique()))]))

        for index, row in tqdm(df_f.iterrows(), total=df_f.shape[0]):
            for i in md.loc[index]:
                if i in list_of_sorting_filters:
                    dict_ref_to_filters[row.reference][i] += 1
        df_feature = pd.DataFrame.from_dict(dict_ref_to_filters,
                                            orient='index')
        df_feature = df_feature.astype(int).reset_index().rename(
            index=str, columns={"index": "item_id"})
        set_of_not_clicked_items = set(data.accomodations_df().item_id) - set(
            df_feature.item_id)
        extension = pd.DataFrame(data=sorted(
            [i for i in set_of_not_clicked_items]),
                                 columns=['item_id'])
        extd = pd.concat([df_feature, extension], ignore_index=True, sort=True)
        f = extd.fillna(0).reset_index().drop(columns=['index'])
        feature = f[np.insert(f.columns[:-1].values, 0,
                              f.columns[-1])].astype(int)

        _time = time.time() - start
        elapsed = time.strftime('%Mm %Ss', time.gmtime(_time))
        print(f"elapsed in: {elapsed}")
        return feature
Example #9
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # get ALL clickouts
        reference_rows = df[(df.reference.str.isnumeric() == True) & (df.action_type =='clickout item')][['user_id','session_id','reference','impressions']]
        # get last clickout
        last_clickout_indices = find(df)
        clickout_rows = df.loc[last_clickout_indices, ['user_id','session_id','reference','impressions']]
        clk_expanded = expand_impressions(clickout_rows)

        # get the impressions
        impression_lists = reference_rows.impressions.str.split('|').tolist()
        big_list = [x for l in impression_lists for x in l]
        c = dict(Counter(big_list))

        df_times_in_impressions = pd.DataFrame.from_dict(c, orient='index',columns=['number_of_times_in_impr'])
        df_times_in_impressions['item_id'] = df_times_in_impressions.index.astype(int)
        df_times_in_impressions = df_times_in_impressions.reindex(columns = ['item_id', 'number_of_times_in_impr'])

        feature_times_per_imp = pd.merge(clk_expanded, df_times_in_impressions, how='left', on=['item_id']).fillna(0)
        feature_times_per_imp.number_of_times_in_impr = feature_times_per_imp.number_of_times_in_impr.astype(int)
        feature_times_per_imp = feature_times_per_imp[['user_id', 'session_id','item_id','number_of_times_in_impr']]

        df_item_clicks = (
            reference_rows
            .groupby(["reference"])
            .size()
            .reset_index(name="n_clickouts")
        )
        df_item_clicks = df_item_clicks.rename(columns={'reference':'item_id'})
        df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)
        merged = pd.merge(df_times_in_impressions, df_item_clicks, how='left', on=['item_id']).fillna(0)
        merged.n_clickouts = merged.n_clickouts.astype(int)

        final_feature = pd.merge(clk_expanded, merged, how='left', on=['item_id']).fillna(0)
        new_col = []
        final_feature.reference = final_feature.reference.astype(int)
        final_feature.item_id = final_feature.item_id.astype(int)
        for t in tqdm(zip(final_feature.reference, final_feature.item_id,
                     final_feature.number_of_times_in_impr, final_feature.n_clickouts)):
            if t[0]==t[1]: # same reference: decrement both the click count and the impression count by 1
                if t[2]!=1:
                    new_col.append(round(((t[3]-1)*100)/(t[2]-1),5))
                else:
                    new_col.append(0)
            else:
                if 0 not in [t[2],t[3]] and t[2]!=1:
                    new_col.append(round(((t[3])*100)/(t[2]-1),5))
                else:
                    new_col.append(0)
        final_feature['adj_perc_click_appeared'] = new_col
        final_feature = final_feature[['user_id','session_id','item_id','adj_perc_click_appeared']]

        return final_feature
Example #10

    def extract_feature(self):

        def get_pos(item, rec):
            res = np.empty(item.shape)
            for i in tqdm(range(len(item))):
                if str(item[i]) in rec[i]:
                    res[i] = rec[i].index(str(item[i])) + 1
                else:
                    res[i] = -1
            return res.astype(int)

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        last_clickout_indices = find(df)
        all_clk_rows = df[(df.reference.str.isnumeric() == True)
                          & (df.action_type == 'clickout item')]
        all_clk_rows = all_clk_rows[['user_id','session_id','reference','impressions']]

        all_clk_rows.impressions = all_clk_rows.impressions.str.split('|')
        pos_col = get_pos(all_clk_rows.reference.values,all_clk_rows.impressions.values)
        all_clk_rows = all_clk_rows.drop('impressions', axis=1)
        all_clk_rows['position'] = pos_col
        all_clk_rows_after_1 = all_clk_rows[all_clk_rows.position>1]

        df_clicks_after_1 = (
            all_clk_rows_after_1
            .groupby(["reference"])
            .size()
            .reset_index(name="n_clicks_per_item")
        )
        df_clicks_after_1.reference = df_clicks_after_1.reference.astype(int)
        df_clicks_after_1 = df_clicks_after_1.rename(columns={'reference':'item_id'})

        last_clk_rows = df.loc[last_clickout_indices, ['user_id','session_id','reference','impressions']]
        last_clk_rows['imp_list'] = last_clk_rows.impressions.str.split('|')
        clk_expanded = expand_impressions(last_clk_rows)
        clk_expanded = clk_expanded.drop('index', axis=1)

        pos_col = get_pos(clk_expanded.item_id.values,clk_expanded.imp_list.values)
        clk_expanded['position'] = pos_col
        clk_expanded = clk_expanded.drop('imp_list', axis=1)

        merged = pd.merge(clk_expanded, df_clicks_after_1, how='left',on='item_id').fillna(0)
        new_col = []
        merged.item_id = merged.item_id.astype(int)
        merged.reference = merged.reference.astype(int)
        for t in tqdm(zip(merged.item_id, merged.reference, merged.position, merged.n_clicks_per_item)):
            if t[0]==t[1] and t[2]>1:
                new_col.append(int(t[3]-1))
            else:
                new_col.append(int(t[3]))

        merged['n_clicks_after_first_pos'] = new_col
        feature = merged[['user_id','session_id','item_id','n_clicks_after_first_pos']]
        return feature
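
The inline get_pos helper returns the 1-based position of each clicked reference inside its impression list, or -1 when it is absent. Pulled out of the method and stripped of tqdm, it can be checked on toy inputs:

import numpy as np

def get_pos(item, rec):
    # 1-based rank of each item inside its impression list, -1 if missing
    res = np.empty(len(item), dtype=int)
    for i in range(len(item)):
        s = str(item[i])
        res[i] = rec[i].index(s) + 1 if s in rec[i] else -1
    return res

print(get_pos(np.array([100, 300]), [['100', '200'], ['200', '400']]))
# -> [ 1 -1]
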
Example #11

    def _fit(self, mode):
        """
        train, test and target indices are just sessions which have:
        - no num ref
        - more than 1 step
        """
        def RepresentsInt(s):
            try:
                int(s)
                return True
            except ValueError:
                return False

        train = data.train_df(mode)
        train_index = train.index.values
        test = data.test_df(mode)
        test_index = test.index.values
        tgt_indices = data.target_indices(mode)
        df = pd.concat([train, test])
        del train
        del test
        lst_clk_indices = sorted(find(df))

        to_return = []
        for idx in lst_clk_indices:
            usr_sess_indices = []
            try:
                a_user = df.at[idx, 'user_id']
                a_sess = df.at[idx, 'session_id']
                usr_sess_indices.append(idx)
            except KeyError:
                continue
            j = idx - 1
            while j >= 0:
                try:
                    new_user = df.at[j, 'user_id']
                    new_sess = df.at[j, 'session_id']
                    if new_user == a_user and new_sess == a_sess:
                        usr_sess_indices.append(j)
                        reference = df.at[j, 'reference']
                        if RepresentsInt(reference):
                            break
                        j -= 1
                    else:
                        if idx - j >= 2:
                            to_return += usr_sess_indices
                        break
                except KeyError:
                    j -= 1

        self.train_indices = sorted(list(set(train_index) & set(to_return)))
        self.test_indices = sorted(list(set(test_index) & set(to_return)))
        self.target_indices = sorted(list(set(tgt_indices) & set(to_return)))
Example #12

    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        last_clickout_indices = find(df)
        clickout_rows = df.loc[last_clickout_indices, [
            'user_id', 'session_id', 'platform', 'action_type', 'impressions'
        ]]

        last_clk_removed_df = df.drop(last_clickout_indices)
        reference_rows = last_clk_removed_df[(
            last_clk_removed_df.reference.str.isnumeric() == True)]

        df_item_clicks = (reference_rows.groupby(
            ["reference",
             "platform"]).size().reset_index(name="n_interactions_per_item"))
        df_item_clicks = df_item_clicks.rename(
            columns={'reference': 'item_id'})
        df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)

        df_city_clicks = (
            reference_rows.groupby('platform').size().reset_index(
                name="n_interactions_per_plat"))

        final_df = pd.merge(df_item_clicks,
                            df_city_clicks,
                            how='left',
                            on=['platform']).fillna(0)

        final_df['percentage_of_total_plat_inter'] = 0.0
        for t in zip(final_df.index, final_df.n_interactions_per_item,
                     final_df.n_interactions_per_plat):
            percentage_of_total_plat_inter = round((t[1] * 100.0) / t[2], 2)
            final_df.at[
                t[0],
                'percentage_of_total_plat_inter'] = percentage_of_total_plat_inter

        feature = final_df[[
            'platform', 'item_id', 'percentage_of_total_plat_inter'
        ]]
        clk_expanded = expand_impressions(clickout_rows)
        feature = pd.merge(clk_expanded,
                           feature,
                           how='left',
                           on=['platform', 'item_id']).fillna(0)
        feature = feature[[
            'user_id', 'session_id', 'item_id',
            'percentage_of_total_plat_inter'
        ]]

        return feature
Example #13

    def extract_feature(self):

        tr = data.train_df(mode=self.mode, cluster=self.cluster)
        te = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([tr, te])
        idxs = sorted(find(df))
        mean_prices = []
        for i in tqdm(idxs):
            prices = list(map(int, df.at[i, 'prices'].split('|')))
            mean_prices.append(sum(prices)/len(prices))

        total = df.loc[idxs, ['user_id', 'session_id']]
        total['mean_price_clickout'] = mean_prices
        return total.reset_index(drop=True)
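
The explicit loop over idxs reads one prices string per clickout; the same means can be computed with pandas string ops. A drop-in sketch for the loop above:

# vectorized alternative to the loop: split once, average each list
mean_prices = (df.loc[idxs, 'prices']
                 .str.split('|')
                 .apply(lambda p: sum(map(int, p)) / len(p))
                 .tolist())
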
Example #14
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # preprocess needed
        df = df.sort_values(
            by=['user_id', 'session_id', 'timestamp', 'step']).reset_index(
                drop=True)
        df = remove_last_part_of_clk_sessions(df)
        # compute number of interactions per session
        df_int = df[df.action_type == 'interaction item image'][[
            'user_id', 'session_id', 'timestamp', 'step', 'action_type'
        ]]
        feature = (df_int.groupby(
            ['user_id',
             'session_id']).size().reset_index(name='num_inter_item_image'))
        # compute session length
        sess_size = (df.groupby(['user_id', 'session_id'
                                 ]).size().reset_index(name='session_length'))
        # get clk rows and expand
        clickout_rows = df.loc[
            find(df), ['user_id', 'session_id', 'action_type', 'impressions']]
        clickout_rows = clickout_rows[clickout_rows.action_type == 'clickout item']
        clk_expanded = expand_impressions(clickout_rows).drop(
            ['index', 'action_type'], axis=1)
        # merge
        final_feature = pd.merge(clk_expanded,
                                 feature,
                                 how='left',
                                 on=['user_id', 'session_id']).fillna(0)
        final_feature.num_inter_item_image = final_feature.num_inter_item_image.astype(
            int)
        final_feature = pd.merge(final_feature,
                                 sess_size,
                                 how='left',
                                 on=['user_id', 'session_id']).fillna(0)
        final_feature.session_length = final_feature.session_length.astype(int)
        # compute the percentage
        perc = []
        for t in tqdm(
                zip(final_feature.num_inter_item_image,
                    final_feature.session_length)):
            perc.append((t[0] * 100) / t[1])
        final_feature['perc_inter_item_image'] = perc

        return final_feature[[
            'user_id', 'session_id', 'item_id', 'perc_inter_item_image'
        ]]
Example #15

    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # preprocess needed
        df = df.sort_values(
            by=['user_id', 'session_id', 'timestamp', 'step']).reset_index(
                drop=True)
        df = remove_last_part_of_clk_sessions(df)

        sess_not_numeric_interactions = (
            df[df.reference.str.isnumeric() != True][[
                'user_id', 'session_id', 'timestamp', 'step'
            ]].groupby([
                'user_id', 'session_id'
            ]).size().reset_index(name='num_not_numeric_interactions'))

        sess_size = (df.groupby(['user_id', 'session_id'
                                 ]).size().reset_index(name='session_length'))

        clickout_rows = df.loc[
            find(df), ['user_id', 'session_id', 'action_type', 'impressions']]
        clickout_rows = clickout_rows[clickout_rows.action_type == 'clickout item']
        clk_expanded = expand_impressions(clickout_rows).drop('index', axis=1)

        feature = pd.merge(clk_expanded,
                           sess_not_numeric_interactions,
                           how='left',
                           on=['user_id', 'session_id']).fillna(0)
        feature.num_not_numeric_interactions = feature.num_not_numeric_interactions.astype(
            int)
        feature = pd.merge(feature,
                           sess_size,
                           how='left',
                           on=['user_id', 'session_id']).fillna(0)
        feature.session_length = feature.session_length.astype(int)
        perc = []
        for t in tqdm(
                zip(feature.num_not_numeric_interactions,
                    feature.session_length)):
            perc.append((t[0] * 100) / t[1])
        feature['perc_not_numeric'] = perc

        return feature[[
            'user_id', 'session_id', 'item_id', 'perc_not_numeric'
        ]]
Example #16
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        print('Sorting...')
        df = df.sort_values(['user_id', 'session_id', 'timestamp', 'step'])
        # find indices of last clickouts
        print('Finding last clickouts...')
        last_clickout_indices = find(df)
        # get only last clickout rows to use the timestamp column
        print('Getting only last clk dataframe...')
        clickout_rows = df.loc[last_clickout_indices,
                               ['user_id', 'session_id', 'timestamp']]
        clickout_rows = clickout_rows.rename(
            columns={'timestamp': 'clk_timestamp'})
        # add the timestamp of last clk for each session as column
        print('Getting tmp...')
        tmp_df = df[[
            'user_id', 'session_id', 'step', 'action_type', 'timestamp'
        ]]
        tmp_df = pd.merge(tmp_df,
                          clickout_rows,
                          how='left',
                          on=['user_id', 'session_id']).fillna(0)
        tmp_df.clk_timestamp = tmp_df.clk_timestamp.astype(int)

        # subtracts the timestamps, puts 0 if there is no clickout in the session
        def func(t, t_clko):
            res = np.empty(len(t))
            for i in tqdm(range(len(t))):
                if t_clko[i] == 0:
                    res[i] = 0
                else:
                    res[i] = t_clko[i] - t[i]
            return res

        print('Subtracting timestamps...')
        tmp_df['diff'] = func(tmp_df.timestamp.values, tmp_df.clk_timestamp.values)
        tmp_df['diff'] = tmp_df['diff'].astype(int)
        tmp_df['index'] = tmp_df.index
        feature = tmp_df[['index', 'diff']]

        return feature
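
func above is a scalar loop; since it only subtracts with a zero sentinel, np.where does the same work in one vectorized call. A drop-in sketch:

import numpy as np

def func(t, t_clko):
    # 0 where the session has no clickout, else time left to the clickout
    t = np.asarray(t)
    t_clko = np.asarray(t_clko)
    return np.where(t_clko == 0, 0, t_clko - t)
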
Example #17

    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        last_clickout_indices = find(df)
        last_clk_removed_df = df.drop(last_clickout_indices)
        reference_rows = last_clk_removed_df[
            (last_clk_removed_df.reference.str.isnumeric() == True)
            & (last_clk_removed_df.action_type == 'clickout item')][[
                'user_id', 'session_id', 'reference', 'impressions'
            ]]

        clickout_rows = df.loc[last_clickout_indices,
                               ['user_id', 'session_id', 'impressions']]
        clk_expanded = expand_impressions(clickout_rows)

        impression_lists = reference_rows.impressions.str.split('|').tolist()
        big_list = [x for l in impression_lists
                    for x in l]  # flatten multi dim list in 1-dim list :)
        c = dict(Counter(
            big_list))  # count occurrence of each item_id in the impressions

        df_times_in_impressions = pd.DataFrame.from_dict(
            c, orient='index', columns=['num_times_item_impressed'])
        df_times_in_impressions[
            'item_id'] = df_times_in_impressions.index.astype(int)
        df_times_in_impressions = df_times_in_impressions.reindex(
            columns=['item_id', 'num_times_item_impressed'])
        df_times_in_impressions = df_times_in_impressions.sort_values(
            by=['item_id']).reset_index(drop=True)

        feature = pd.merge(clk_expanded,
                           df_times_in_impressions,
                           how='left',
                           on=['item_id']).fillna(0)
        feature.num_times_item_impressed = feature.num_times_item_impressed.astype(
            int)

        return feature[[
            'user_id', 'session_id', 'item_id', 'num_times_item_impressed'
        ]]
Example #18
    def extract_feature(self):
        self.current_directory = Path(__file__).absolute().parent
        self.data_dir = self.current_directory.joinpath('..', '..', 'stacking', self.mode)

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        last_indices = find(df)

        # extract test scores
        self.train_dir = self.data_dir.joinpath('test')
        for file in glob.glob(str(self.train_dir) + '/xgboost*'):
            test_xgb = np.load(file)
            test_xgb = pd.DataFrame(test_xgb, columns=['index', 'item_recommendations', 'scores'])
            test_xgb = test_xgb.astype({'index': int})

        self.train_dir = self.data_dir.joinpath('train')
        for file in glob.glob(str(self.train_dir) + '/xgboost*'):
            train_xgb = np.load(file)
            train_xgb = pd.DataFrame(train_xgb, columns=['index', 'item_recommendations', 'scores'])
            train_xgb = train_xgb.astype({'index': int})

        xgb = pd.concat([train_xgb, test_xgb])

        # xgb_idx = list(xgb['index'])
        # print(f'Xgb indices are : {len(set(xgb_idx))}')
        # print(f'Last indices are : {len((last_indices))}')
        # common = set(xgb_idx) & set(last_indices)
        # print(f'In common : {len(common)}')
        xgb = xgb[xgb['index'].isin(last_indices)]

        xgb_idx = list(xgb['index'])

        t = assign_score(xgb, 'xgboost')
        t = t.sort_values(by='index')

        df['index'] = df.index.values
        df = df[['user_id', 'session_id','index']]
        df = pd.merge(t, df, how='left', on=['index'])
        return df[['user_id', 'session_id', 'item_id', 'score_xgboost']]
Example #19

    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # get clickout rows
        clickout_rows = df.loc[find(df),
                               ['user_id', 'session_id', 'impressions']]  # find(df) rows are clickouts by construction
        clk_expanded = expand_impressions(clickout_rows).drop(['index'], axis=1)
        # get position
        new_col = []
        curr_u = clk_expanded.loc[0, 'user_id']
        curr_s = clk_expanded.loc[0, 'session_id']
        pos = 0
        for t in tqdm(zip(clk_expanded.user_id, clk_expanded.session_id)):
            if t[0] == curr_u and t[1] == curr_s:
                pos += 1
            else:
                pos = 1
                curr_u = t[0]
                curr_s = t[1]
            new_col.append(pos)
        clk_expanded['position'] = new_col
        # get impression count for each session
        imp_count = (clk_expanded.groupby(
            ['user_id',
             'session_id']).size().reset_index(name='num_impressions'))
        # merge and compute percentage
        feature = pd.merge(clk_expanded,
                           imp_count,
                           how='left',
                           on=['user_id', 'session_id']).fillna(0)
        pos_perc = []
        for t in tqdm(zip(feature.position, feature.num_impressions)):
            pos_perc.append((t[0] * 100) / t[1])
        feature['impression_position_in_percentage'] = pos_perc

        return feature[[
            'user_id', 'session_id', 'item_id',
            'impression_position_in_percentage'
        ]]
Example #20
    def extract_feature(self):

        tr = data.train_df(mode=self.mode, cluster=self.cluster)
        te = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([tr, te])
        idxs = sorted(find(df))
        means = []
        stds = []
        for i in tqdm(idxs):
            a_user = df.at[i, 'user_id']
            a_sess = df.at[i, 'session_id']
            a_time = df.at[i, 'timestamp']
            j = i - 1
            diffs = []
            while j >= 0:
                try:
                    new_user = df.at[j, 'user_id']
                    new_sess = df.at[j, 'session_id']
                    new_time = df.at[j, 'timestamp']
                    if new_user == a_user and new_sess == a_sess:
                        diffs.append(a_time - new_time)
                    else:
                        break
                    j -= 1
                    a_time = new_time
                except KeyError:
                    j -= 1
            if len(diffs) > 0:
                np_diffs = np.array(diffs)
                means.append(np.mean(np_diffs))
                stds.append(np.std(np_diffs))
            else:
                means.append(-1)
                stds.append(-1)

        total = df.loc[idxs, ['user_id', 'session_id']]
        total['mean_time_per_step'] = means
        total['frenzy_factor'] = stds
        return total.reset_index(drop=True)
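
The backward scan computes, for every last clickout, the gaps between consecutive timestamps of its session. On a frame sorted by user, session and timestamp the same statistics fall out of a groupby; a sketch of the idea (note: unlike the loop, this does not stop at the last clickout, and pandas' std defaults to ddof=1 while np.std uses ddof=0):

df_sorted = df.sort_values(['user_id', 'session_id', 'timestamp', 'step'])
gaps = df_sorted.groupby(['user_id', 'session_id'])['timestamp'].diff()
stats = (gaps.groupby([df_sorted.user_id, df_sorted.session_id])
             .agg(mean_time_per_step='mean', frenzy_factor='std')
             .fillna(-1)  # single-step sessions get the -1 sentinel
             .reset_index())
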
Example #21
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        last_clickout_indices = find(df)
        clickout_rows = df.loc[last_clickout_indices,
                               ['user_id', 'session_id', 'impressions']]
        clk_expanded = expand_impressions(clickout_rows)

        o = ImpressionFeature(mode=self.mode)
        f = o.read_feature(True)  # get the accomodation's df
        feature_stars = f[[
            'item_id', 'properties1 Star', 'properties2 Star',
            'properties3 Star', 'properties4 Star', 'properties5 Star'
        ]]
        # remap the name
        feature_stars = feature_stars.rename(
            columns={
                'properties1 Star': '1',
                'properties2 Star': '2',
                'properties3 Star': '3',
                'properties4 Star': '4',
                'properties5 Star': '5'
            })
        # set default 0 Stars for those for which the feature is missing
        feature_stars['0'] = pd.Series(np.ones(len(feature_stars),
                                               dtype=np.uint8),
                                       index=feature_stars.index)
        feature_stars['stars'] = feature_stars[['5', '4', '3', '2', '1',
                                                '0']].idxmax(axis=1)
        feature_stars_restricted = feature_stars[['item_id', 'stars']]
        final_feature = pd.merge(clk_expanded,
                                 feature_stars_restricted,
                                 how='left',
                                 on=['item_id']).fillna(1)
        final_feature['stars'] = final_feature['stars'].astype(int)
        final_feature['stars'] = final_feature['stars'].replace(0, -1)
        return final_feature[['user_id', 'session_id', 'item_id', 'stars']]
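
The idxmax call is doing the heavy lifting: over the column order ['5', '4', '3', '2', '1', '0'] it returns the label of the first column holding the row maximum, i.e. the highest star flag that is set, with the always-on '0' as fallback. On a toy frame:

import pandas as pd

toy = pd.DataFrame({'5': [0, 1], '4': [0, 0], '3': [1, 0],
                    '2': [0, 0], '1': [0, 0], '0': [1, 1]})
# first column (in the listed order) containing the row maximum
print(toy[['5', '4', '3', '2', '1', '0']].idxmax(axis=1).tolist())
# -> ['3', '5']
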
Example #22
def retrieve_real_test_indices(mode, cluster):
    test = data.test_df(mode, cluster)
    idxs = sorted(find(test))
    test_indices = []
    for i in idxs:
        to_append = [i]
        a_user = test.at[i, 'user_id']
        a_sess = test.at[i, 'session_id']
        j = i - 1
        while j >= test.index.values[0]:
            try:
                new_user = test.at[j, 'user_id']
                new_sess = test.at[j, 'session_id']
                if new_user == a_user and new_sess == a_sess:
                    to_append.append(j)
                    j -= 1
                else:
                    break
            except KeyError:
                j -= 1

        j = i + 1
        while j <= test.index.values[-1]:
            try:
                new_user = test.at[j, 'user_id']
                new_sess = test.at[j, 'session_id']
                if new_user == a_user and new_sess == a_sess:
                    to_append.append(j)
                    j += 1
                else:
                    break
            except KeyError:
                j += 1

        test_indices += to_append
    return sorted(test_indices)
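
The two while-loops expand each last-clickout index to its full session by scanning neighbouring rows. Assuming sessions occupy contiguous index ranges (as the loops themselves do), a groupby gives the same set in one pass; a vectorized sketch reusing the same find helper:

# keep every row belonging to a session that contains a last clickout
idxs = set(find(test))
in_clk_session = (test.index.to_series()
                      .groupby([test.user_id, test.session_id])
                      .transform(lambda s: s.isin(idxs).any())
                      .astype(bool))
test_indices = sorted(test.index[in_clk_session.values])
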
Example #23
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        # get clk rows
        last_clickout_indices = find(df)
        clickout_rows = df.loc[
            last_clickout_indices,
            ['user_id', 'session_id', 'impressions', 'prices']]
        clk_expanded = expand_impressions(clickout_rows).drop('index', axis=1)

        # open item metadata in one hot
        o = ImpressionFeature(mode=self.mode)
        df_accomodations = o.read_feature(True)

        # get the stars
        feature_stars = df_accomodations[[
            'item_id', 'properties1 Star', 'properties2 Star',
            'properties3 Star', 'properties4 Star', 'properties5 Star'
        ]]
        # remap the name
        feature_stars = feature_stars.rename(
            columns={
                'properties1 Star': '1',
                'properties2 Star': '2',
                'properties3 Star': '3',
                'properties4 Star': '4',
                'properties5 Star': '5'
            })
        # set default 0 Stars for those for which the feature is missing
        feature_stars['0'] = pd.Series(np.ones(len(feature_stars),
                                               dtype=np.uint8),
                                       index=feature_stars.index)
        feature_stars['stars'] = feature_stars[['5', '4', '3', '2', '1',
                                                '0']].idxmax(axis=1)
        feature_stars_restricted = feature_stars[['item_id', 'stars']]
        f_stars = pd.merge(clk_expanded,
                           feature_stars_restricted,
                           how='left',
                           on=['item_id'])
        f_stars['stars'] = f_stars['stars'].astype(int)

        # get the ratings
        f_ratings = df_accomodations[[
            'item_id',
            'propertiesExcellent Rating',
            'propertiesVery Good Rating',
            'propertiesGood Rating',
            'propertiesSatisfactory Rating',
        ]].copy()  # copy to avoid SettingWithCopy on the column added below
        f_ratings['propertiesNo Rating'] = pd.Series(np.ones(len(f_ratings),
                                                             dtype=np.uint8),
                                                     index=f_ratings.index)
        rat = f_ratings.iloc[:, 1:]  # do not shadow the session dataframe df
        rat['fake'] = pd.Series(np.zeros(len(rat), dtype=np.uint8),
                                index=rat.index)
        cols = rat.columns.tolist()
        cols = [cols[-1]] + cols[:-1]
        rat = rat.reindex(columns=cols)
        dff = rat.diff(axis=1).drop(['fake'], axis=1)
        dff = dff.astype(int)
        dff.columns = [5, 4, 3, 2, 1]
        f_ratings = f_ratings.drop(f_ratings.columns[1:], axis=1)
        f_ratings['rating'] = dff.idxmax(axis=1)
        f_ratings = pd.merge(f_ratings,
                             feature_stars_restricted,
                             how='left',
                             on=['item_id'])
        df_clk_rat_star = pd.merge(clk_expanded,
                                   f_ratings,
                                   how='left',
                                   on='item_id')

        # expand prices
        df_clk_rat_star.prices = df_clk_rat_star.prices.str.split('|')
        curr_user = '******'
        curr_sess = '_'
        pos = 0
        price_expanded = []
        for t in tqdm(
                zip(df_clk_rat_star.user_id, df_clk_rat_star.session_id,
                    df_clk_rat_star.prices)):
            #check if in session
            if curr_user != t[0] or curr_sess != t[1]:
                pos = 0
                curr_user = t[0]
                curr_sess = t[1]
            else:
                pos += 1
            price_expanded.append(t[2][pos])
        df_clk_rat_star['price'] = price_expanded
        df_clk_rat_star = df_clk_rat_star.drop(['prices'], axis=1)
        df_clk_rat_star.stars = df_clk_rat_star.stars.astype(int)

        # fills missing stars values with the mean
        avg = df_clk_rat_star[['user_id', 'session_id', 'stars']]
        avg = avg.loc[avg.stars != 0]  # compute the mean only over non-zero stars
        avg = pd.DataFrame(
            avg.groupby(['user_id', 'session_id'])['stars'].progress_apply(
                lambda x: int(x.sum() / x.size))).fillna(0)
        avg = avg.rename(columns={'stars': 'stars_avg'})
        avg['stars_avg'] = avg['stars_avg'].astype(int)
        no_stars = df_clk_rat_star.loc[df_clk_rat_star.stars == 0,
                                       ['user_id', 'session_id', 'item_id']]
        stars_filled = pd.merge(no_stars,
                                avg,
                                how='left',
                                on=['user_id', 'session_id']).fillna(0)
        stars_filled['stars_avg'] = stars_filled['stars_avg'].astype(int)
        df_clk_rat_star_filled = pd.merge(
            df_clk_rat_star,
            stars_filled,
            how='left',
            on=['user_id', 'session_id', 'item_id'])
        for t in zip(df_clk_rat_star_filled.stars,
                     df_clk_rat_star_filled.stars_avg,
                     df_clk_rat_star_filled.index):
            if t[0] == 0:
                df_clk_rat_star_filled.at[t[2], 'stars'] = t[1]
        df_clk_rat_star_filled = df_clk_rat_star_filled.drop('stars_avg', axis=1)

        # now fill missing values for rating
        avg = df_clk_rat_star_filled[['user_id', 'session_id', 'rating']].copy()
        avg.rating = avg.rating.astype(int)
        avg = avg.loc[avg.rating != 1]  # compute the mean only over real ratings
        avg = pd.DataFrame(
            avg.groupby(['user_id', 'session_id'])['rating'].progress_apply(
                lambda x: int(x.sum() / x.size))).fillna(0)
        avg = avg.rename(columns={'rating': 'rating_avg'})
        avg['rating_avg'] = avg['rating_avg'].astype(int)
        no_rat = df_clk_rat_star.loc[df_clk_rat_star.rating == 1,
                                     ['user_id', 'session_id', 'item_id']]
        rat_filled = pd.merge(no_rat,
                              avg,
                              how='left',
                              on=['user_id', 'session_id']).fillna(0)
        rat_filled['rating_avg'] = rat_filled['rating_avg'].astype(int)
        df_clk_rat_star_rat_filled = pd.merge(
            df_clk_rat_star_filled,
            rat_filled,
            how='left',
            on=['user_id', 'session_id', 'item_id'])
        for t in zip(df_clk_rat_star_rat_filled.rating,
                     df_clk_rat_star_rat_filled.rating_avg,
                     df_clk_rat_star_rat_filled.index):
            if t[0] == 1:
                df_clk_rat_star_rat_filled.at[t[2], 'rating'] = t[1]
        df_clk_rat_star_rat_filled = df_clk_rat_star_rat_filled.drop(
            'rating_avg', axis=1)

        # add feature column
        new_col = []
        df_clk_rat_star_rat_filled.rating = df_clk_rat_star_rat_filled.rating.astype(
            int)
        df_clk_rat_star_rat_filled.stars = df_clk_rat_star_rat_filled.stars.astype(
            int)
        df_clk_rat_star_rat_filled.price = df_clk_rat_star_rat_filled.price.astype(
            int)

        for t in tqdm(
                zip(df_clk_rat_star_rat_filled.rating,
                    df_clk_rat_star_rat_filled.stars,
                    df_clk_rat_star_rat_filled.price)):
            new_col.append((1.5 * t[0] + t[1]) / t[2])
        df_clk_rat_star_rat_filled['price_quality'] = new_col
        final_feature = df_clk_rat_star_rat_filled[[
            'user_id', 'session_id', 'item_id', 'price_quality'
        ]]

        return final_feature
Example #24
    def extract_feature(self):

        def extend_session_current_filters(y):

            x = y
            cf = x.current_filters

            if len(cf.dropna()) == 0:
                return x

            ind = cf.dropna().head(1).index.values[0]  # index of the first non-NaN current_filters

            while ind < cf.tail(1).index.values[0]:  # a while loop is needed to reach the end of the session

                Nan_ind_found = False
                nan_ind = ind

                while Nan_ind_found == False:  # find the first action that resets the filters

                    if nan_ind == cf.tail(1).index.values[0]:
                        return x

                    try:
                        next_action = x.loc[nan_ind + 1].action_type
                    except KeyError:
                        print(x)
                        break

                    if next_action in ['interaction item image', 'interaction item deals',
                                       'interaction item info', 'interaction item rating',
                                       'change of sort order']:
                        nan_ind = nan_ind + 1
                        Nan_ind_found = True
                    else:
                        nan_ind += 1
                # now nan_ind is the index of the first action that resets the filters
                Nan_ind_last_found = False
                not_nan_ind = nan_ind
                # walk forward and edit cf values until the first index that does not reset cf: not_nan_ind
                while Nan_ind_last_found == False:

                    cf.loc[not_nan_ind] = cf.loc[not_nan_ind - 1]

                    if not_nan_ind == cf.tail(1).index.values[0]:
                        x.current_filters = cf
                        return x

                    if x.loc[not_nan_ind + 1].action_type in ['search for poi', 'search for destination',
                                                              'search for item', 'filter selection', 'clickout item']:

                        not_nan_ind = not_nan_ind + 1
                        Nan_ind_last_found = True

                    else:

                        not_nan_ind += 1
                # now not_nan_ind is the first index that does not reset cf (it plays the role of ind);
                # start over and continue until the end of the session is reached
                ind = not_nan_ind

            x.current_filters = cf
            return x

        list_of_sorting_filters_wout_pop = ['Sort by Price', 'Sort by Distance', 'Sort by Rating', 'Best Value',
                                            'Focus on Rating', 'Focus on Distance']

        list_of_sorting_filters = ['Sort by Price', 'Sort by Distance', 'Sort by Rating', 'Best Value',
                                   'Focus on Rating', 'Focus on Distance', 'Sort by Popularity']

        def mask_sorting(x):
            if np.isin(x, list_of_sorting_filters_wout_pop).any():
                return x
            else:
                return ['Sort by Popularity']

        start = time.time()
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # extend current_filters (keep the returned frame: groups are copies,
        # so in-place edits inside the apply would otherwise be lost)
        df = df.groupby(['user_id', 'session_id']).progress_apply(extend_session_current_filters)
        indices_last_clks = find(df)
        d = df.drop(indices_last_clks)
        reference_rows = d[d.reference.astype(str).str.isnumeric()]
        d_splitted = reference_rows.current_filters.progress_apply(lambda x: str(x).split('|'))
        md = d_splitted.progress_apply(mask_sorting)
        df_f = df.loc[md.index]
        df_ref = df_f.reference
        dict_ref_to_filters = dict(
            zip(df_ref.unique(), [dict(zip(list_of_sorting_filters, np.zeros(len(list_of_sorting_filters)))) \
                                  for i in range(len(df_ref.unique()))]))

        for index, row in tqdm(df_f.iterrows(), total=df_f.shape[0]):
            for i in md.loc[index]:
                if i in list_of_sorting_filters:
                    dict_ref_to_filters[row.reference][i] += 1
        df_feature = pd.DataFrame.from_dict(dict_ref_to_filters, orient='index')
        df_feature = df_feature.astype(int).reset_index().rename(index=str, columns={"index": "item_id"})
        set_of_not_clicked_items = set(data.accomodations_df().item_id) - set(df_feature.item_id)
        extension = pd.DataFrame(data=sorted([i for i in set_of_not_clicked_items]), columns=['item_id'])
        extd = pd.concat([df_feature, extension], ignore_index=True, sort=True)
        f = extd.fillna(0).reset_index().drop(columns=['index'])
        feature = f[np.insert(f.columns[:-1].values, 0, f.columns[-1])].astype(int)
        _time = time.time() - start
        elapsed = time.strftime('%Mm %Ss', time.gmtime(_time))
        print(f"elapsed in: {elapsed}")
        return feature
Example #25

    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        # get only non-last-clickout clickout rows
        last_clickout_indices = find(df)
        last_clk_removed_df = df.drop(last_clickout_indices)
        reference_rows = last_clk_removed_df[
            (last_clk_removed_df.reference.str.isnumeric() == True)
            & (last_clk_removed_df.action_type == 'clickout item')][[
                'user_id', 'session_id', 'reference', 'impressions'
            ]]

        # get the impressions
        impression_lists = reference_rows.impressions.str.split('|').tolist()
        big_list = [x for l in impression_lists
                    for x in l]  # flatten the nested list into a 1-dim list
        c = dict(
            Counter(big_list)
        )  # count the occurrences of each accommodation in the impression lists

        # create df from the dictionary: for each accommodation, the number of times it appears in impressions
        df_times_in_impressions = pd.DataFrame.from_dict(
            c, orient='index', columns=['number_of_times_in_impr'])
        df_times_in_impressions[
            'item_id'] = df_times_in_impressions.index.astype(int)
        df_times_in_impressions = df_times_in_impressions.reindex(
            columns=['item_id', 'number_of_times_in_impr'])

        # get number of times an accomodation has been clicked
        df_item_clicks = (reference_rows.groupby(
            ["reference"]).size().reset_index(name="n_clickouts"))
        df_item_clicks = df_item_clicks.rename(
            columns={'reference': 'item_id'})
        df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)

        # merge the two df
        merged = pd.merge(df_times_in_impressions,
                          df_item_clicks,
                          how='left',
                          on=['item_id']).fillna(0)
        merged.n_clickouts = merged.n_clickouts.astype(int)
        merged['perc_click_appeared'] = round(
            (merged.n_clickouts * 100) / (merged.number_of_times_in_impr), 2)

        # create the feature for each item
        feature_per_item = merged[['item_id', 'perc_click_appeared']]

        # use the feature for each last clickout
        clickout_rows = df.loc[last_clickout_indices,
                               ['user_id', 'session_id', 'impressions']]
        clk_expanded = expand_impressions(clickout_rows)
        final_feature = pd.merge(clk_expanded,
                                 feature_per_item,
                                 how='left',
                                 on=['item_id']).fillna(0)
        final_feature = final_feature[[
            'user_id', 'session_id', 'item_id', 'perc_click_appeared'
        ]]

        return final_feature
Example #26

    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # get last clickout rows (rows at last_clickout_indices are clickouts by construction)
        last_clickout_indices = find(df)
        clickout_rows = df.loc[
            last_clickout_indices,
            ['user_id', 'session_id', 'city', 'reference', 'impressions']]
        # get reference rows WITH last clickout
        reference_rows = df[(df.reference.str.isnumeric() == True)
                            & (df.action_type == 'clickout item')]
        # compute popularity WITH last clickout
        df_item_clicks = (reference_rows.groupby(
            ["reference",
             "city"]).size().reset_index(name="n_interactions_per_item"))
        df_item_clicks = df_item_clicks.rename(
            columns={'reference': 'item_id'})
        df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)

        df_city_clicks = (reference_rows.groupby('city').size().reset_index(
            name="n_interactions_per_city"))
        # merge the two popularity dataframes, then join them onto the expanded clickout rows
        merged_df = pd.merge(df_item_clicks,
                             df_city_clicks,
                             how='left',
                             on=['city']).fillna(0)
        clk_expanded = expand_impressions(clickout_rows)
        feature = pd.merge(clk_expanded,
                           merged_df,
                           how='left',
                           on=['item_id', 'city']).fillna(0)
        # compute the percentage of clicks per city
        new_col = []
        feature.reference = feature.reference.astype(int)
        feature.item_id = feature.item_id.astype(int)
        for t in tqdm(
                zip(feature.reference, feature.item_id,
                    feature.n_interactions_per_item,
                    feature.n_interactions_per_city)):
            if t[0] == t[1]:  # this is the clicked item
                if t[3] != 1:
                    percentage_of_total_city_clk = round(
                        ((t[2] - 1) * 100.0) / (t[3] - 1), 5)
                else:
                    percentage_of_total_city_clk = 0
            else:  # this is not the clicked item
                if 0 not in [t[2], t[3]] and t[3] != 1:
                    percentage_of_total_city_clk = round(
                        (t[2] * 100.0) / (t[3] - 1),
                        5)  # still exclude the session's own click from the city total
                else:
                    percentage_of_total_city_clk = 0
            new_col.append(percentage_of_total_city_clk)
        feature['adj_percentage_of_total_city_clk'] = new_col
        feature.adj_percentage_of_total_city_clk = feature.adj_percentage_of_total_city_clk.astype(
            float)
        final_feature_reduced = feature[[
            'user_id', 'session_id', 'item_id',
            'adj_percentage_of_total_city_clk'
        ]]

        return final_feature_reduced
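# The branching above is a leave-one-out correction: the session's own click is
# always excluded from the city denominator, and from the numerator only for
# the clicked item. A worked example with hypothetical counts:
n_item, n_city = 3, 10  # clicks on this item / clicks in its city
clicked_pct = round((n_item - 1) * 100.0 / (n_city - 1), 5)  # 22.22222
other_pct = round(n_item * 100.0 / (n_city - 1), 5)          # 33.33333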
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # first step: get all the platforms
        platforms = sorted(df.platform.unique().tolist())
        # create a df that will hold the feature vector of each platform
        df_plat_feature = pd.DataFrame(
            columns=['platform', 'properties_array'])
        df_plat_feature['platform'] = platforms
        # find the last clickout of each session (used later) and preprocess the clickout rows
        last_indices = find(df)
        df_clickout = df[(df.reference.str.isnumeric() == True)
                         & (df['action_type'] == 'clickout item')][[
                             'reference', 'platform'
                         ]]
        df_clickout = df_clickout.rename(columns={'reference': 'item_id'})
        df_clickout.item_id = df_clickout.item_id.astype(int)
        # get the item metadata one-hot encoded
        o = ImpressionFeature(mode=self.mode)
        df_accomodations = o.read_feature(True)
        df_accomodations = df_accomodations.drop([
            'properties1 Star', 'properties2 Star', 'properties3 Star',
            'properties4 Star', 'properties5 Star'
        ], axis=1)
        # merge clickouts dataframe with the metadata
        df_clicks_properties = pd.merge(df_clickout,
                                        df_accomodations,
                                        how='left',
                                        on=['item_id'])
        # extract the one-hot encoded features into a numpy array (one row per item)
        array = df_accomodations.drop(['item_id'], axis=1).values
        # for each item, store its feature vector as a numpy array
        df_item_features = pd.DataFrame(columns=['item_id', 'features_array'])
        df_item_features['item_id'] = df_accomodations['item_id'].values
        df_item_features['features_array'] = list(array)
        # for each platform, average the one-hot property vectors of its clicked items
        new_col = []  # will hold the platform feature vectors
        for p in tqdm(platforms):
            df_clicks_properties_per_plat = df_clicks_properties[
                df_clicks_properties.platform == p]
            df_clicks_properties_per_plat = df_clicks_properties_per_plat.drop(
                ['item_id', 'platform'], axis=1)
            df_sum = df_clicks_properties_per_plat.sum()
            # this check is needed because some platforms never appear in clickouts:
            # for those we fall back to a vector of zeros
            if df_clicks_properties_per_plat.shape[0] != 0:
                df_sum = df_sum.apply(
                    lambda x: x / df_clicks_properties_per_plat.shape[0])
                plat_feature = df_sum.values
            else:
                plat_feature = np.asarray(
                    [0] * df_clicks_properties_per_plat.shape[1])
            new_col.append(plat_feature)
        df_plat_feature['properties_array'] = new_col

        # now take the last clickout rows and expand on the impression list
        clickout_rows = df.loc[last_indices, [
            'user_id', 'session_id', 'platform', 'action_type', 'impressions'
        ]][df.action_type == 'clickout item']
        clk_expanded = expand_impressions(clickout_rows)
        clk_expanded = clk_expanded.drop(['index', 'action_type'], axis=1)
        # for each impression, add the feature vector of the platform and the feature vector of the impression
        clk_expanded_wt_plat_feat = pd.merge(clk_expanded,
                                             df_plat_feature,
                                             how='left',
                                             on=['platform'])
        final_feature = pd.merge(clk_expanded_wt_plat_feat,
                                 df_item_features,
                                 how='left',
                                 on=['item_id'])
        # compute the similarity between the impression's feature vector and the plat feature vector
        new_col = []
        if self.metric == 'cosine':
            shrink = 5  # TRY ME
            for t in tqdm(
                    zip(final_feature.properties_array,
                        final_feature.features_array)):
                new_col.append(
                    cosine_similarity(t[0].astype(np.double),
                                      t[1].astype(np.double), shrink))

        final_feature = final_feature[['user_id', 'session_id', 'item_id']]
        final_feature['adj_platform_features_similarity'] = new_col
        return final_feature
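# `cosine_similarity` here is not sklearn's two-argument version: it takes a
# shrink term. A plausible sketch, assuming the repo's helper computes a
# shrinked cosine (the shrink damps scores coming from sparse property vectors):
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray, shrink: float) -> float:
    # shrinked cosine: a larger shrink lowers similarities overall
    den = np.linalg.norm(a) * np.linalg.norm(b) + shrink
    return float(np.dot(a, b) / den) if den > 0 else 0.0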
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        platforms = df['platform'].unique().tolist()
        df_plat_feature = pd.DataFrame(columns=['platform','properties_array'])
        df_plat_feature['platform'] = platforms
        last_indices = find(df)
        df_non_last_clk = df.drop(last_indices)
        df_clickout = df_non_last_clk[(df_non_last_clk['action_type']=='clickout item')][['reference','platform']]
        df_clickout = df_clickout.rename(columns={'reference':'item_id'})
        df_clickout = df_clickout.dropna() # remove NaNs
        df_clickout.item_id = df_clickout.item_id.astype(int)
        o = ImpressionFeature(mode=self.mode)
        df_accomodations = o.read_feature(True)
        df_accomodations = df_accomodations.drop([
            'properties1 Star', 'properties2 Star', 'properties3 Star',
            'properties4 Star', 'properties5 Star'
        ], axis=1)

        df_clicks_properties = pd.merge(df_clickout, df_accomodations, how='left', on=['item_id'])
        array = df_accomodations.drop(['item_id'],axis=1).values
        df_item_features = pd.DataFrame(columns=['item_id','features_array'])
        df_item_features['item_id'] = df_accomodations['item_id'].values
        df_item_features['features_array'] = list(array)

        new_col = []
        for p in tqdm(platforms):
            df_clicks_properties_per_plat = df_clicks_properties[df_clicks_properties.platform == p]
            df_clicks_properties_per_plat = df_clicks_properties_per_plat.drop(['item_id','platform'], axis=1)
            df_sum = df_clicks_properties_per_plat.sum()
            if df_clicks_properties_per_plat.shape[0] != 0:  # the platform appears in at least one clickout
                plat_feature = df_sum.values
            else:
                plat_feature = np.asarray([0]*df_clicks_properties_per_plat.shape[1])
            new_col.append(plat_feature)

        df_plat_feature['properties_array'] = new_col
        global_sum = df_clicks_properties.drop(['item_id', 'platform'], axis=1)
        global_sum = global_sum.sum().tolist()

        df_plat_feature['global_properties'] = df_plat_feature.apply(lambda x: global_sum, axis=1)
        properties_globally_normalized = []
        for t in tqdm(zip(df_plat_feature.properties_array, df_plat_feature.global_properties)):
            properties_globally_normalized.append(np.asarray([x/y for x,y in zip(t[0],t[1])]))

        df_plat_feature['properties_globally_normalized'] = properties_globally_normalized
        df_plat_feature = df_plat_feature.drop(['properties_array', 'global_properties'], axis=1)

        # now take the usual last-clickout dataframe
        last_clickout_indices = find(df)
        clickout_rows = df.loc[last_clickout_indices, ['user_id','session_id','platform','action_type','impressions']][df.action_type == 'clickout item']
        clk_expanded = expand_impressions(clickout_rows)

        clk_expanded = clk_expanded.drop(['index', 'action_type'], axis=1)
        clk_expanded_wt_plat_feat = pd.merge(clk_expanded, df_plat_feature, how='left', on=['platform']).astype(object)
        clk_expanded_wt_plat_feat.item_id = clk_expanded_wt_plat_feat.item_id.astype(int)

        final_feature = pd.merge(clk_expanded_wt_plat_feat, df_item_features, how='left', on=['item_id'])
        new_col = []
        shrink = 0  # TRY ME
        for t in tqdm(zip(final_feature.properties_globally_normalized, final_feature.features_array)):
            new_col.append(cosine_similarity(t[0].astype(np.double), t[1].astype(np.double),shrink))

        new_feature = final_feature[['user_id','session_id','item_id']]
        new_feature['platform_similarity_normalized'] = new_col

        return new_feature
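# The element-wise division above turns each platform vector into that
# platform's share of the global clicks on every property. A tiny numeric
# illustration with made-up counts:
import numpy as np

plat_counts = np.array([40.0, 5.0, 100.0])      # property clicks on one platform
global_counts = np.array([400.0, 50.0, 500.0])  # property clicks on all platforms
share = plat_counts / global_counts             # -> [0.1, 0.1, 0.2]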
Example #29
0
    def extract_feature(self):
        def func(x):
            def extract_daytime(timestamp, platform):
                res = np.empty(len(timestamp), dtype='datetime64[s]')
                unique_platforms = x['platform'].unique()
                # index of the timezone to pick from pytz.country_timezones()
                # for countries that span multiple timezones
                dict_row_platform = {
                    'AU': 3,
                    'CA': 11,
                    'RU': 1,
                    'BR': 1,
                    'US': 17
                }

                list_of_common_platforms = [
                    i for i in unique_platforms
                    if i not in dict_row_platform.keys()
                ]

                for i in list_of_common_platforms:
                    dict_row_platform[i] = 0

                # remap trivago platform codes to valid ISO country codes for pytz
                dict_row_platform['GB'] = dict_row_platform.pop('UK')
                dict_row_platform['ET'] = dict_row_platform.pop('AA')

                austral_hemisphere = [
                    'AU', 'MX', 'CL', 'AR', 'ID', 'NZ', 'EC', 'BR'
                ]

                for i in tqdm(range(len(timestamp))):
                    ts = timestamp[i]
                    p = platform[i]
                    if p == 'UK':
                        p = 'GB'
                    elif p == 'AA':
                        p = 'ET'

                    if p in austral_hemisphere:
                        bool_amb = True

                    else:
                        bool_amb = False

                    zone = pytz.country_timezones(p)[dict_row_platform[p]]

                    timeznd = pd.to_datetime(ts).tz_localize(
                        zone, ambiguous=np.array(bool_amb))

                    res[i] = timeznd

                return res

            return extract_daytime(
                pd.to_datetime(x['timestamp'], unit='s', origin='unix').values,
                x['platform'].values).astype('datetime64[s]')

        def get_moment_in_the_day(x):
            # map the local hour to night / morning / afternoon / evening
            if 0 <= x.hour < 8:
                return 'N'
            elif 8 <= x.hour < 13:
                return 'M'
            elif 13 <= x.hour < 19:
                return 'A'
            elif 19 <= x.hour < 24:
                return 'E'

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        df = df.loc[find(df)]

        df['day'] = func(df)
        df['moment'] = df['day'].progress_apply(
            lambda x: get_moment_in_the_day(x))
        df['day'] = df['day'].progress_apply(
            lambda x: pd.to_datetime(x).dayofweek)

        return df.drop(columns=[
            'action_type', 'reference', 'impressions', 'prices', 'city',
            'device', 'step', 'current_filters', 'timestamp', 'platform',
            'frequence'
        ])
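# The core of the feature above is the pytz localization: each epoch timestamp
# is re-read as wall-clock time in the platform's country before the weekday
# and day-part are derived. A standalone usage sketch (timestamp and country
# code are made up):
import pandas as pd
import pytz

ts = pd.to_datetime(1541062800, unit='s')  # naive timestamp from the log
zone = pytz.country_timezones('GB')[0]     # e.g. 'Europe/London'
local = ts.tz_localize(zone, ambiguous=False)
print(local.dayofweek, local.hour)         # weekday index and local hour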
    def extract_feature(self):
        def convert_and_add_pos(df):
            df_t = expand_impressions(df)
            df['index'] = df.index
            df = pd.merge(df_t,
                          df,
                          how='left',
                          on=['index', 'user_id', 'session_id', 'action_type'],
                          suffixes=('', '_y'))
            df = df.drop('time_per_impression_y', axis=1)
            df['item_pos'] = df.apply(
                lambda x: (x['impression_list'].index(str(x['item_id']))) + 1,
                axis=1)
            df = df.drop(['impression_list', 'index'], axis=1)
            return df

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        df = df.sort_values(['user_id', 'session_id', 'timestamp',
                             'step']).reset_index(drop=True)
        # time spent on each interaction = delta to the next row's timestamp
        df['time_per_impression'] = df['timestamp'].shift(-1) - df['timestamp']

        last_clickout_indices = find(df)
        clickout_rows = df.loc[
            last_clickout_indices,
            ['user_id', 'session_id', 'action_type', 'impressions']][
                df.action_type == 'clickout item']
        clickout_rows['impression_list'] = clickout_rows.impressions.str.split(
            '|')
        clickout_rows['time_per_impression'] = [
            [0] * 25 for _ in range(len(clickout_rows.index))
        ]  # one accumulator per impression slot (at most 25 impressions)

        last_clk_removed_df = df.drop(last_clickout_indices)
        reference_rows = last_clk_removed_df[
            last_clk_removed_df.reference.astype(str).str.isnumeric()]
        reference_rows = reference_rows.drop('action_type', axis=1)
        reference_rows = reference_rows[
            reference_rows.user_id.isin(clickout_rows.user_id)
            & reference_rows.session_id.isin(clickout_rows.session_id)]

        j = 0
        clickout_indices = clickout_rows.index.values
        clickout_user = clickout_rows.at[clickout_indices[j], 'user_id']
        clickout_session = clickout_rows.at[clickout_indices[j], 'session_id']
        for t in tqdm(
                zip(reference_rows.index, reference_rows.time_per_impression,
                    reference_rows.user_id, reference_rows.session_id,
                    reference_rows.reference)):
            if t[0] >= clickout_indices[-1]:
                break
            # find the next clickout index
            while t[0] > clickout_indices[j]:
                j += 1
                clickout_user = clickout_rows.at[clickout_indices[j],
                                                 'user_id']
                clickout_session = clickout_rows.at[clickout_indices[j],
                                                    'session_id']

            # check if row and next_clickout are in the same session
            if t[2] == clickout_user and t[3] == clickout_session:
                try:
                    ref_idx = clickout_rows.at[clickout_indices[j],
                                               'impression_list'].index(t[4])
                    feature_list = clickout_rows.at[clickout_indices[j],
                                                    'time_per_impression']
                    feature_list[ref_idx] += t[1]
                except ValueError:
                    # the reference is not among this clickout's impressions
                    pass

        final_df = convert_and_add_pos(clickout_rows)
        final_df['impression_time'] = final_df.apply(
            lambda x: list(x['time_per_impression'])[int(x['item_pos']) - 1],
            axis=1)
        final_df = final_df[[
            'user_id', 'session_id', 'item_id', 'impression_time'
        ]]
        final_df['impression_time'] = final_df['impression_time'].astype(int)
        return final_df
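# For reference, a minimal sketch of the `expand_impressions` helper used
# throughout these examples (the repo's real version lives elsewhere): it
# explodes the pipe-separated impression string into one row per impression,
# keeping the original row index in an 'index' column, which is why several
# examples drop that column afterwards.
import pandas as pd

def expand_impressions(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy().reset_index()  # keeps the original index as an 'index' column
    out['item_id'] = out['impressions'].str.split('|')
    out = out.explode('item_id').reset_index(drop=True)
    out['item_id'] = out['item_id'].astype(int)
    return out.drop('impressions', axis=1)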