Example #1
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # remove last clickouts and the last part of each session
        new_df = remove_last_part_of_clk_sessions(df)
        new_df = new_df.drop(find(new_df))
        no_last_clks_numeric = new_df[
            new_df.reference.str.isnumeric() == True][[
                'user_id', 'session_id', 'action_type', 'reference'
            ]]
        # we want to make it fast and avoid any loops...
        # simply drop duplicates and keep the last occurrence
        # of the tuple user-session-item :D
        last_actions = no_last_clks_numeric.drop_duplicates(
            ['user_id', 'session_id', 'reference'], keep='last')
        last_actions = last_actions.rename(
            columns={
                'reference': 'item_id',
                'action_type': 'last_action_involving_impression'
            })
        last_actions.item_id = last_actions.item_id.astype(int)
        # get last clickouts and expand
        last_clk = df.loc[find(df)]
        clk_expanded = expand_impressions(last_clk)[[
            'user_id', 'session_id', 'item_id'
        ]]
        # now simply merge and fill NaNs with 'no_action' as in the original feature
        feature = pd.merge(clk_expanded,
                           last_actions,
                           how='left',
                           on=['user_id', 'session_id', 'item_id'])
        feature.last_action_involving_impression = feature.last_action_involving_impression.astype(
            object).fillna('no_action')
        return feature
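Every snippet in this collection leans on an expand_impressions helper that is never shown. A minimal sketch of the behavior these examples assume (keep the original row label in an 'index' column, emit one row per impression, cast item_id to int) could be:

import pandas as pd

def expand_impressions(df):
    # hypothetical sketch, not the project's actual implementation
    df = df.copy()
    df['impressions'] = df['impressions'].str.split('|')
    df = df.reset_index()           # keep the original row label as 'index'
    df = df.explode('impressions')  # one row per impression (pandas >= 0.25)
    df = df.rename(columns={'impressions': 'item_id'})
    df['item_id'] = df['item_id'].astype(int)
    return df.reset_index(drop=True)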
Example #2
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        last_clickout_indices = find(df)
        clickout_rows = df.loc[last_clickout_indices, ['user_id','session_id','reference','action_type','impressions']]
        reference_rows = df[(df.reference.str.isnumeric() == True) & (df.action_type == 'clickout item')]

        df_item_clicks = (
            reference_rows
            .groupby(["reference"])
            .size()
            .reset_index(name="n_interactions_per_item")
        )
        df_item_clicks = df_item_clicks.rename(columns={'reference':'item_id'})
        df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)
        #df_item_clicks

        clk_expanded = expand_impressions(clickout_rows)
        final_feature = pd.merge(clk_expanded, df_item_clicks, how='left', on=['item_id']).fillna(0)
        final_feature.n_interactions_per_item = final_feature.n_interactions_per_item.astype(int)
        final_feature = final_feature.drop(['index'], axis=1)

        final_feature.reference = final_feature.reference.astype(int)
        new_column = []
        for t in zip(final_feature.item_id, final_feature.reference, final_feature.n_interactions_per_item):
            if t[0] == t[1]:
                new_column.append(int(t[2]-1))
            else:
                new_column.append(int(t[2]))
        final_feature['personalized_popularity'] = new_column

        final_feature_reduced = final_feature[['user_id','session_id','item_id','personalized_popularity']]

        return final_feature_reduced
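find(df) is used throughout as a shorthand for find_last_clickout_indices(df). A sketch of the assumed semantics, returning the index of the last 'clickout item' row of each user/session pair:

def find_last_clickout_indices(df):
    # hypothetical sketch: index of the last clickout row per (user_id, session_id)
    clickouts = df[df.action_type == 'clickout item']
    return clickouts.groupby(['user_id', 'session_id'], sort=False).tail(1).index.values

find = find_last_clickout_indices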
Example #3
    def extract_feature(self):

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        temp = df.fillna('0')
        idxs_click = sorted(find_last_clickout_indices(temp))
        idxs_numeric_reference = temp[temp['reference'].str.isnumeric() == True].index

        count = 0
        last_click = idxs_click[0]

        impr_features = {}
        impr_feature = []
        for i in tqdm(sorted(idxs_numeric_reference)):
            if i == last_click:
                impressions = list(map(int, temp.at[i, 'impressions'].split('|')))
                click_timestamp = temp.at[i, 'timestamp']
                click_step = temp.at[i, 'step']
                for impr in impressions:
                    if impr not in impr_features:
                        impr_feature.append({'num_interactions_impr': 0, 'step_from_last_interaction': -1,
                                             'timestamp_from_last_interaction': -1,
                                             'last_action_type_with_impr': 'None'})
                    else:
                        impr_features[impr]['timestamp_from_last_interaction'] = click_timestamp - impr_features[impr][
                            'timestamp_from_last_interaction']
                        impr_features[impr]['step_from_last_interaction'] = click_step - impr_features[impr][
                            'step_from_last_interaction']
                        impr_feature.append(impr_features[impr])
                impr_features = {}
                count += 1
                if count < len(idxs_click):
                    last_click = idxs_click[count]
                continue
            ref = int(temp.at[i, 'reference'])
            if ref in impr_features:
                impr_features[ref]['num_interactions_impr'] += 1
                impr_features[ref]['step_from_last_interaction'] = df.at[i, 'step']
                impr_features[ref]['timestamp_from_last_interaction'] = df.at[i, 'timestamp']
                impr_features[ref]['last_action_type_with_impr'] = df.at[i, 'action_type']
            else:
                impr_features[ref] = {'num_interactions_impr': 1, 'step_from_last_interaction': df.at[i, 'step'],
                                      'timestamp_from_last_interaction': df.at[i, 'timestamp'],
                                      'last_action_type_with_impr': df.at[i, 'action_type']}

        final_df = expand_impressions(temp[['user_id', 'session_id', 'impressions']].loc[idxs_click])
        print(len(final_df))
        print(len(impr_feature))
        final_df['dict'] = impr_feature

        features_df = pd.DataFrame(final_df.progress_apply(lambda x: tuple(x['dict'].values()), axis=1).tolist(),
                                   columns=list(final_df.iloc[0].dict.keys()))
        final_df_ = pd.concat([final_df, features_df], axis=1).drop('dict', axis=1)
        final_df_ = final_df_.drop(['num_interactions_impr', 'last_action_type_with_impr'], axis=1)
        return final_df_
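Assigning final_df['dict'] = impr_feature works only because the per-impression dicts were appended in exactly the order in which expand_impressions emits the rows (hence the two length prints above); when adapting this, a cheap guard is:

assert len(final_df) == len(impr_feature), 'per-impression features misaligned with expanded clickout rows'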
Example #4
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # get all the cities
        cities = df['city'].unique().tolist()
        # get clickout rows (WITHOUT last clk)
        last_indices = find(df)
        df_non_last_clk = df.drop(last_indices)
        df_clickout = df_non_last_clk[(df_non_last_clk['action_type']=='clickout item')][['reference','city']]
        df_clickout = df_clickout.rename(columns={'reference':'item_id'})
        df_clickout = df_clickout.dropna() # remove NaNs, which should not be there anyway
        df_clickout.item_id = df_clickout.item_id.astype(int)
        # open impressions df
        o = ImpressionFeature(mode=self.mode)
        df_accomodations = o.read_feature(True)
        df_accomodations = df_accomodations.drop(['properties1 Star', 'properties2 Star', 'properties3 Star', 'properties4 Star', 'properties5 Star'],1)
        # get all clicks properties
        df_clicks_properties = pd.merge(df_clickout, df_accomodations, how='left', on=['item_id'])
        df_clicks_properties = df_clicks_properties.sort_values(by=['city'])
        df_clicks_properties = df_clicks_properties.drop('item_id',1)
        # sum all properties per city
        grouped_by_city = df_clicks_properties.groupby('city').sum()
        # create df with city:array_of_features
        df_city_features = pd.DataFrame(columns=['city','properties_array'])
        df_city_features.city = grouped_by_city.index
        df_city_features.properties_array = grouped_by_city.values.tolist()
        # now take last clk df
        clickout_rows = df.loc[last_indices,
                       ['user_id','session_id','city','action_type','impressions']][df.action_type == 'clickout item']
        clk_expanded = expand_impressions(clickout_rows)
        clk_expanded_wt_city_feat = pd.merge(clk_expanded, df_city_features, how='left', on=['city'])
        # create df with item:array_of_features
        array = df_accomodations.drop(['item_id'],axis=1).values
        df_item_features = pd.DataFrame(columns=['item_id','features_array'])
        df_item_features['item_id'] = df_accomodations['item_id'].values
        df_item_features['features_array'] = list(array)
        final_feature = pd.merge(clk_expanded_wt_city_feat, df_item_features, how='left', on=['item_id'])
        for n in tqdm(final_feature[final_feature['properties_array'].isnull()].index.tolist()):
            final_feature.at[n,'properties_array'] = [0]*152
        # cast list to numpy array to use the cosine (it's written for doubles)
        final_feature.properties_array = final_feature.properties_array.progress_apply(lambda x: np.asarray(x))
        # create new column
        new_col =[]
        if self.metric == 'cosine':
            shrink = 0 # TRY ME
            for t in tqdm(zip(final_feature.properties_array, final_feature.features_array)):
                new_col.append(cosine_similarity(t[0].astype(np.double), t[1].astype(np.double),shrink))
        if self.metric == 'euclidean':
            for t in tqdm(zip(final_feature.properties_array, final_feature.features_array)):
                new_col.append(np.linalg.norm(t[0]-t[1]))
        # final feature
        new_feature = final_feature[['user_id','session_id','item_id']].copy()
        new_feature['city_similarity'] = new_col

        return new_feature
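cosine_similarity here takes a shrink term besides the two vectors and is not defined in these snippets; a plausible sketch of a shrunk cosine is:

import numpy as np

def cosine_similarity(a, b, shrink=0):
    # hypothetical sketch: cosine of two vectors with a shrink term in the
    # denominator to dampen the scores of items with few properties
    den = np.linalg.norm(a) * np.linalg.norm(b) + shrink
    return np.dot(a, b) / den if den != 0 else 0.0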
Example #5
    def extract_feature(self):

        def get_pos(item, rec):
            res = np.empty(item.shape)
            for i in tqdm(range(len(item))):
                if str(item[i]) in rec[i]:
                    res[i] = rec[i].index(str(item[i])) + 1
                else:
                    res[i] = -1
            return res.astype(int)

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        last_clickout_indices = find(df)
        all_clk_rows = df[(df.reference.str.isnumeric() == True)
                          & (df.action_type == 'clickout item')]
        all_clk_rows = all_clk_rows[['user_id','session_id','reference','impressions']]

        all_clk_rows.impressions = all_clk_rows.impressions.str.split('|')
        pos_col = get_pos(all_clk_rows.reference.values,all_clk_rows.impressions.values)
        all_clk_rows = all_clk_rows.drop('impressions',1)
        all_clk_rows['position'] = pos_col
        all_clk_rows_after_1 = all_clk_rows[all_clk_rows.position>1]

        df_clicks_after_1 = (
            all_clk_rows_after_1
            .groupby(["reference"])
            .size()
            .reset_index(name="n_clicks_per_item")
        )
        df_clicks_after_1.reference = df_clicks_after_1.reference.astype(int)
        df_clicks_after_1 = df_clicks_after_1.rename(columns={'reference':'item_id'})

        last_clk_rows = df.loc[last_clickout_indices, ['user_id','session_id','reference','impressions']]
        last_clk_rows['imp_list'] = last_clk_rows.impressions.str.split('|')
        clk_expanded = expand_impressions(last_clk_rows)
        clk_expanded = clk_expanded.drop('index',1)

        pos_col = get_pos(clk_expanded.item_id.values,clk_expanded.imp_list.values)
        clk_expanded['position'] = pos_col
        clk_expanded = clk_expanded.drop('imp_list',1)

        merged = pd.merge(clk_expanded, df_clicks_after_1, how='left',on='item_id').fillna(0)
        new_col = []
        merged.item_id = merged.item_id.astype(int)
        merged.reference = merged.reference.astype(int)
        for t in tqdm(zip(merged.item_id, merged.reference, merged.position, merged.n_clicks_per_item)):
            if t[0]==t[1] and t[2]>1:
                new_col.append(int(t[3]-1))
            else:
                new_col.append(int(t[3]))

        merged['n_clicks_after_first_pos'] = new_col
        feature = merged[['user_id','session_id','item_id','n_clicks_after_first_pos']]
        return feature
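The final loop only subtracts the row's own click when the impression is the clicked reference beyond position 1; an equivalent vectorized form, if you want to drop the Python loop, is:

merged['n_clicks_after_first_pos'] = (
    merged.n_clicks_per_item
    - ((merged.item_id == merged.reference) & (merged.position > 1)).astype(int)
).astype(int)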
Example #6
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # get ALL clickouts
        reference_rows = df[(df.reference.str.isnumeric() == True) & (df.action_type =='clickout item')][['user_id','session_id','reference','impressions']]
        # get last clickout
        last_clickout_indices = find(df)
        clickout_rows = df.loc[last_clickout_indices, ['user_id','session_id','reference','impressions']]
        clk_expanded = expand_impressions(clickout_rows)

        # get the impressions
        impression_lists = reference_rows.impressions.str.split('|').tolist()
        big_list = [x for l in impression_lists for x in l]
        c = dict(Counter(big_list))

        df_times_in_impressions = pd.DataFrame.from_dict(c, orient='index',columns=['number_of_times_in_impr'])
        df_times_in_impressions['item_id'] = df_times_in_impressions.index.astype(int)
        df_times_in_impressions = df_times_in_impressions.reindex(columns = ['item_id', 'number_of_times_in_impr'])

        feature_times_per_imp = pd.merge(clk_expanded, df_times_in_impressions, how='left', on=['item_id']).fillna(0)
        feature_times_per_imp.number_of_times_in_impr = feature_times_per_imp.number_of_times_in_impr.astype(int)
        feature_times_per_imp = feature_times_per_imp[['user_id', 'session_id','item_id','number_of_times_in_impr']]

        df_item_clicks = (
            reference_rows
            .groupby(["reference"])
            .size()
            .reset_index(name="n_clickouts")
        )
        df_item_clicks = df_item_clicks.rename(columns={'reference':'item_id'})
        df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)
        merged = pd.merge(df_times_in_impressions, df_item_clicks, how='left', on=['item_id']).fillna(0)
        merged.n_clickouts = merged.n_clickouts.astype(int)

        final_feature = pd.merge(clk_expanded, merged, how='left', on=['item_id']).fillna(0)
        new_col = []
        final_feature.reference = final_feature.reference.astype(int)
        final_feature.item_id = final_feature.item_id.astype(int)
        for t in tqdm(zip(final_feature.reference, final_feature.item_id,
                     final_feature.number_of_times_in_impr, final_feature.n_clickouts)):
            if t[0]==t[1]: # same reference, so decrement both the click and the impression count by 1
                if t[2]!=1:
                    new_col.append(round(((t[3]-1)*100)/(t[2]-1),5))
                else:
                    new_col.append(0)
            else:
                if 0 not in [t[2],t[3]] and t[2]!=1:
                    new_col.append(round(((t[3])*100)/(t[2]-1),5))
                else:
                    new_col.append(0)
        final_feature['adj_perc_click_appeared'] = new_col
        final_feature = final_feature[['user_id','session_id','item_id','adj_perc_click_appeared']]

        return final_feature
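A worked instance of the adjustment: an item impressed 5 times and clicked 2 times scores ((2-1)*100)/(5-1) = 25.0 on the row where it is the clicked reference, and (2*100)/(5-1) = 50.0 on a row where it is not.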
Example #7
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        last_clickout_indices = find(df)
        clickout_rows = df.loc[last_clickout_indices, [
            'user_id', 'session_id', 'platform', 'action_type', 'impressions'
        ]]

        last_clk_removed_df = df.drop(last_clickout_indices)
        reference_rows = last_clk_removed_df[(
            last_clk_removed_df.reference.str.isnumeric() == True)]

        df_item_clicks = (reference_rows.groupby(
            ["reference",
             "platform"]).size().reset_index(name="n_interactions_per_item"))
        df_item_clicks = df_item_clicks.rename(
            columns={'reference': 'item_id'})
        df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)

        df_city_clicks = (
            reference_rows.groupby('platform').size().reset_index(
                name="n_interactions_per_plat"))

        final_df = pd.merge(df_item_clicks,
                            df_city_clicks,
                            how='left',
                            on=['platform']).fillna(0)

        final_df['percentage_of_total_plat_inter'] = 0.0
        for t in zip(final_df.index, final_df.n_interactions_per_item,
                     final_df.n_interactions_per_plat):
            percentage_of_total_plat_inter = round((t[1] * 100.0) / t[2], 2)
            final_df.at[
                t[0],
                'percentage_of_total_plat_inter'] = percentage_of_total_plat_inter

        feature = final_df[[
            'platform', 'item_id', 'percentage_of_total_plat_inter'
        ]]
        clk_expanded = expand_impressions(clickout_rows)
        feature = pd.merge(clk_expanded,
                           feature,
                           how='left',
                           on=['platform', 'item_id']).fillna(0)
        feature = feature[[
            'user_id', 'session_id', 'item_id',
            'percentage_of_total_plat_inter'
        ]]

        return feature
Example #8
def convert_and_add_pos(df):
    df_t = expand_impressions(df)
    df['index'] = df.index
    df = pd.merge(df_t,
                  df,
                  how='left',
                  on=['index', 'user_id', 'session_id', 'action_type'],
                  suffixes=('', '_y'))
    df = df.drop('time_per_impression_y', axis=1)
    df['item_pos'] = df.apply(
        lambda x: (x['impression_list'].index(str(x['item_id']))) + 1,
        axis=1)
    df = df.drop(['impression_list', 'index'], axis=1)
    return df
Example #9
def merge_features_tf_cv(mode, cluster, features_array):
    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df

    # retrieve the indices of the last clickouts
    print('find_last_click_idxs')
    last_click_idxs = find_last_clickout_indices(full_df)

    # filter on the found indices, obtaining only the rows of a last clickout
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()

    # expand the impression as rows
    print('expand the impression')
    click_df = expand_impressions(click_df)[['user_id', 'session_id', 'item_id', 'index']]
    click_df['dummy_step'] = np.arange(len(click_df))

    # do the join
    print('join with the features')
    print(f'train_shape: {click_df.shape}\n')
    context_features_id = []
    for f in features_array:
        if type(f) == tuple:
            feature = f[0](mode=mode, cluster='no_cluster').read_feature(one_hot=f[1])
        else:
            feature = f(mode=mode, cluster='no_cluster').read_feature(one_hot=True)
        print(f'columns of the feature:\n {feature.columns}')
        print(f'NaN values are: {feature.isnull().values.sum()}')
        # if there are any, fill them with 0
        feature.fillna(0, inplace=True)
        # check if it is a feature of the impression
        if 'item_id' not in feature.columns:
            for i in range(click_df.shape[1] - 6 + 1, click_df.shape[1] - 6 + 1 + feature.shape[1] - 2, 1):
                context_features_id.append(str(i))
        print(f'session features names:{context_features_id}')
        print(f'shape of feature: {feature.shape}')
        print(f'len of feature:{len(feature)}')
        click_df = click_df.merge(feature)
        print(f'train_shape: {click_df.shape}\n ')

    print('sorting by index and step...')
    # sort the dataframes
    click_df.sort_values(['index', 'dummy_step'], inplace=True)
    click_df.drop('dummy_step', axis=1, inplace=True)

    print('after join')
    return click_df, np.array(context_features_id)
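Note that click_df.merge(feature) with no explicit keys joins on every column the two frames share. A sketch of an explicit equivalent, assuming features are keyed by user/session plus item_id when present:

keys = [c for c in ('user_id', 'session_id', 'item_id') if c in feature.columns]
click_df = click_df.merge(feature, on=keys)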
Example #10
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # preprocessing needed
        df = df.sort_values(
            by=['user_id', 'session_id', 'timestamp', 'step']).reset_index(
                drop=True)
        df = remove_last_part_of_clk_sessions(df)
        # compute number of interactions per session
        df_int = df[df.action_type == 'interaction item image'][[
            'user_id', 'session_id', 'timestamp', 'step', 'action_type'
        ]]
        feature = (df_int.groupby(
            ['user_id',
             'session_id']).size().reset_index(name='num_inter_item_image'))
        # compute session length
        sess_size = (df.groupby(['user_id', 'session_id'
                                 ]).size().reset_index(name='session_length'))
        # get clk rows and expand
        clickout_rows = df.loc[
            find(df), ['user_id', 'session_id', 'action_type', 'impressions']][
                df.action_type == 'clickout item']
        clk_expanded = expand_impressions(clickout_rows).drop(
            ['index', 'action_type'], 1)
        # merge
        final_feature = pd.merge(clk_expanded,
                                 feature,
                                 how='left',
                                 on=['user_id', 'session_id']).fillna(0)
        final_feature.num_inter_item_image = final_feature.num_inter_item_image.astype(
            int)
        final_feature = pd.merge(final_feature,
                                 sess_size,
                                 how='left',
                                 on=['user_id', 'session_id']).fillna(0)
        final_feature.session_length = final_feature.session_length.astype(int)
        # compute the percentage
        perc = []
        for t in tqdm(
                zip(final_feature.num_inter_item_image,
                    final_feature.session_length)):
            perc.append((t[0] * 100) / t[1])
        final_feature['perc_inter_item_image'] = perc

        return final_feature[[
            'user_id', 'session_id', 'item_id', 'perc_inter_item_image'
        ]]
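The tqdm loop that computes the percentage can be replaced by a single vectorized expression with the same result:

final_feature['perc_inter_item_image'] = (
    final_feature.num_inter_item_image * 100 / final_feature.session_length
)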
Example #11
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # preprocessing needed
        df = df.sort_values(
            by=['user_id', 'session_id', 'timestamp', 'step']).reset_index(
                drop=True)
        df = remove_last_part_of_clk_sessions(df)

        sess_not_numeric_interactions = (
            df[df.reference.str.isnumeric() != True][[
                'user_id', 'session_id', 'timestamp', 'step'
            ]].groupby([
                'user_id', 'session_id'
            ]).size().reset_index(name='num_not_numeric_interactions'))

        sess_size = (df.groupby(['user_id', 'session_id'
                                 ]).size().reset_index(name='session_length'))

        clickout_rows = df.loc[
            find(df), ['user_id', 'session_id', 'action_type', 'impressions']][
                df.action_type == 'clickout item']
        clk_expanded = expand_impressions(clickout_rows).drop('index', 1)

        feature = pd.merge(clk_expanded,
                           sess_not_numeric_interactions,
                           how='left',
                           on=['user_id', 'session_id']).fillna(0)
        feature.num_not_numeric_interactions = feature.num_not_numeric_interactions.astype(
            int)
        feature = pd.merge(feature,
                           sess_size,
                           how='left',
                           on=['user_id', 'session_id']).fillna(0)
        feature.session_length = feature.session_length.astype(int)
        perc = []
        for t in tqdm(
                zip(feature.num_not_numeric_interactions,
                    feature.session_length)):
            perc.append((t[0] * 100) / t[1])
        feature['perc_not_numeric'] = perc

        return feature[[
            'user_id', 'session_id', 'item_id', 'perc_not_numeric'
        ]]
Example #12
    def extract_feature(self):

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        if self.mode in ['small', 'local']:
            print('reinserting clickout')
            test = test.groupby(['session_id',
                                 'user_id']).progress_apply(_reinsert_clickout)
        df = pd.concat([train, test])
        idxs_click = find_last_clickout_indices(df)
        df = df.loc[idxs_click][[
            'user_id', 'session_id', 'reference', 'impressions'
        ]]
        df = expand_impressions(df)
        df['label'] = (df['item_id'] == df['reference'].astype('float')) * 1
        df.drop(['index', 'reference'], axis=1, inplace=True)

        print(df)
        return df
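For example, a last clickout whose reference is '123' and whose impressions are '123|456|789' expands to three rows with labels [1, 0, 0].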
Example #13
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        last_clickout_indices = find(df)
        last_clk_removed_df = df.drop(last_clickout_indices)
        reference_rows = last_clk_removed_df[
            (last_clk_removed_df.reference.str.isnumeric() == True)
            & (last_clk_removed_df.action_type == 'clickout item')][[
                'user_id', 'session_id', 'reference', 'impressions'
            ]]

        clickout_rows = df.loc[last_clickout_indices,
                               ['user_id', 'session_id', 'impressions']]
        clk_expanded = expand_impressions(clickout_rows)

        impression_lists = reference_rows.impressions.str.split('|').tolist()
        big_list = [x for l in impression_lists
                    for x in l]  # flatten the multi-dim list into a 1-dim list :)
        c = dict(Counter(
            big_list))  # count occurrence of each item_id in the impressions

        df_times_in_impressions = pd.DataFrame.from_dict(
            c, orient='index', columns=['num_times_item_impressed'])
        df_times_in_impressions[
            'item_id'] = df_times_in_impressions.index.astype(int)
        df_times_in_impressions = df_times_in_impressions.reindex(
            columns=['item_id', 'num_times_item_impressed'])
        df_times_in_impressions = df_times_in_impressions.sort_values(
            by=['item_id']).reset_index(drop=True)

        feature = pd.merge(clk_expanded,
                           df_times_in_impressions,
                           how='left',
                           on=['item_id']).fillna(0)
        feature.num_times_item_impressed = feature.num_times_item_impressed.astype(
            int)

        return feature[[
            'user_id', 'session_id', 'item_id', 'num_times_item_impressed'
        ]]
Example #14
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # get clickout rows
        clickout_rows = df.loc[find(df),
                               ['user_id', 'session_id', 'impressions']][
                                   df.action_type == 'clickout item']
        clk_expanded = expand_impressions(clickout_rows).drop(['index'], 1)
        # get position
        new_col = []
        curr_u = clk_expanded.loc[0, 'user_id']
        curr_s = clk_expanded.loc[0, 'session_id']
        pos = 0
        for t in tqdm(zip(clk_expanded.user_id, clk_expanded.session_id)):
            if t[0] == curr_u and t[1] == curr_s:
                pos += 1
            else:
                pos = 1
                curr_u = t[0]
                curr_s = t[1]
            new_col.append(pos)
        clk_expanded['position'] = new_col
        # get impression count for each session
        imp_count = (clk_expanded.groupby(
            ['user_id',
             'session_id']).size().reset_index(name='num_impressions'))
        # merge and compute percentage
        feature = pd.merge(clk_expanded,
                           imp_count,
                           how='left',
                           on=['user_id', 'session_id']).fillna(0)
        pos_perc = []
        for t in tqdm(zip(feature.position, feature.num_impressions)):
            pos_perc.append((t[0] * 100) / t[1])
        feature['impression_position_in_percentage'] = pos_perc

        return feature[[
            'user_id', 'session_id', 'item_id',
            'impression_position_in_percentage'
        ]]
Example #15
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        last_clickout_indices = find(df)
        clickout_rows = df.loc[last_clickout_indices,
                               ['user_id', 'session_id', 'impressions']]
        clk_expanded = expand_impressions(clickout_rows)

        o = ImpressionFeature(mode=self.mode)
        f = o.read_feature(True)  # get the accomodation's df
        feature_stars = f[[
            'item_id', 'properties1 Star', 'properties2 Star',
            'properties3 Star', 'properties4 Star', 'properties5 Star'
        ]]
        # remap the name
        feature_stars = feature_stars.rename(
            columns={
                'properties1 Star': '1',
                'properties2 Star': '2',
                'properties3 Star': '3',
                'properties4 Star': '4',
                'properties5 Star': '5'
            })
        # set default 0 Stars for those for which the feature is missing
        feature_stars['0'] = pd.Series(np.ones(len(feature_stars),
                                               dtype=np.uint8),
                                       index=feature_stars.index)
        feature_stars['stars'] = feature_stars[['5', '4', '3', '2', '1',
                                                '0']].idxmax(axis=1)
        feature_stars_restricted = feature_stars[['item_id', 'stars']]
        final_feature = pd.merge(clk_expanded,
                                 feature_stars_restricted,
                                 how='left',
                                 on=['item_id']).fillna(1)
        final_feature['stars'] = final_feature['stars'].astype(int)
        final_feature['stars'] = final_feature['stars'].replace(0, -1)
        return final_feature[['user_id', 'session_id', 'item_id', 'stars']]
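The idxmax trick relies on column order: the columns are scanned from '5' down to '0', so an item flagged '4 Star' resolves to '4' before the always-one '0' column is reached, while an item with no star property at all falls through to '0' (remapped to -1 right before returning).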
Example #16
    def extract_feature(self):

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        idxs_click = find_last_clickout_indices(df)
        df = df.loc[idxs_click][['user_id', 'session_id', 'impressions']]
        df = expand_impressions(df)
        # initialize the session id
        session_id = ''
        count = 1
        impression_position = []
        for i in tqdm(df.index):
            c_session = df.at[i, 'session_id']
            if c_session != session_id:
                session_id = c_session
                count = 1
            impression_position.append(count)
            count += 1
        df['impression_position'] = impression_position
        df['impression_position'] = pd.to_numeric(df['impression_position'])
        df.drop('index', axis=1, inplace=True)

        return df
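Since expand_impressions emits the impressions of each last clickout as consecutive rows, and each session has at most one last clickout, the counting loop is equivalent to a groupby cumcount:

df['impression_position'] = df.groupby('session_id').cumcount() + 1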
Example #17
def merge_features(mode,
                   cluster,
                   features_array,
                   onehot=True,
                   merge_kind='inner',
                   create_not_existing_features=True,
                   multithread=False):
    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df

    # retrieve the indices of the last clickouts
    print('find_last_click_idxs')
    last_click_idxs = find_last_clickout_indices(full_df)
    last_click_idxs = sorted(last_click_idxs)

    # filter on the found indices, obtaining only the rows of a last clickout
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()

    print('retrieve vali_idxs')
    # if the mode is full we don't have validation; if the mode is small or local,
    # validation is performed on the target indices

    vali_test_idxs = data.target_indices(mode, cluster)

    # construct the validation train and test df_base
    print('construct test and vali df')
    validation_test_df = click_df.loc[vali_test_idxs]

    all_idxs = click_df.index.values

    # find the differences
    print('construct train df')
    train_idxs = np.setdiff1d(all_idxs, vali_test_idxs, assume_unique=True)
    train_df = click_df.loc[train_idxs]

    # expand the impression as rows
    print('expand the impression')
    train_df = expand_impressions(train_df)[[
        'user_id', 'session_id', 'item_id', 'index'
    ]]
    train_df['dummy_step'] = np.arange(len(train_df))
    validation_test_df = expand_impressions(validation_test_df)[[
        'user_id', 'session_id', 'item_id', 'index'
    ]]
    validation_test_df['dummy_step'] = np.arange(len(validation_test_df))

    if not multithread:
        train_df, validation_test_df = actual_merge_one_thread(train_df, validation_test_df, features_array, \
                                                                    mode, cluster, create_not_existing_features, merge_kind, onehot)
    else:
        train_df, validation_test_df = actual_merge_multithread(train_df, validation_test_df, features_array, \
                                                                    mode, cluster, create_not_existing_features, merge_kind, onehot)

    print('sorting by index and step...')
    # sort the dataframes
    train_df.sort_values(['index', 'dummy_step'], inplace=True)
    train_df.drop('dummy_step', axis=1, inplace=True)
    validation_test_df.sort_values(['index', 'dummy_step'], inplace=True)
    validation_test_df.drop('dummy_step', axis=1, inplace=True)

    print('after join')
    return train_df, validation_test_df, train_idxs, vali_test_idxs
Example #18
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        # get clk rows
        last_clickout_indices = find(df)
        clickout_rows = df.loc[
            last_clickout_indices,
            ['user_id', 'session_id', 'impressions', 'prices']]
        clk_expanded = expand_impressions(clickout_rows).drop('index', 1)

        # open item metadata in one hot
        o = ImpressionFeature(mode=self.mode)
        df_accomodations = o.read_feature(True)

        # get the stars
        feature_stars = df_accomodations[[
            'item_id', 'properties1 Star', 'properties2 Star',
            'properties3 Star', 'properties4 Star', 'properties5 Star'
        ]]
        # remap the name
        feature_stars = feature_stars.rename(
            columns={
                'properties1 Star': '1',
                'properties2 Star': '2',
                'properties3 Star': '3',
                'properties4 Star': '4',
                'properties5 Star': '5'
            })
        # set default 0 Stars for those for which the feature is missing
        feature_stars['0'] = pd.Series(np.ones(len(feature_stars),
                                               dtype=np.uint8),
                                       index=feature_stars.index)
        feature_stars['stars'] = feature_stars[['5', '4', '3', '2', '1',
                                                '0']].idxmax(axis=1)
        feature_stars_restricted = feature_stars[['item_id', 'stars']]
        f_stars = pd.merge(clk_expanded,
                           feature_stars_restricted,
                           how='left',
                           on=['item_id'])
        f_stars['stars'] = f_stars['stars'].astype(int)

        # get the ratings
        f_ratings = df_accomodations[[
            'item_id',
            'propertiesExcellent Rating',
            'propertiesVery Good Rating',
            'propertiesGood Rating',
            'propertiesSatisfactory Rating',
        ]]
        f_ratings['propertiesNo Rating'] = pd.Series(np.ones(len(f_ratings),
                                                             dtype=np.uint8),
                                                     index=f_ratings.index)
        df = f_ratings.iloc[:, 1:]
        df['fake'] = pd.Series(np.zeros(len(df), dtype=np.uint8),
                               index=df.index)
        cols = df.columns.tolist()
        cols = [cols[-1]] + cols[:-1]
        df = df.reindex(columns=cols)
        dff = df.diff(axis=1).drop(['fake'], axis=1)
        dff = dff.astype(int)
        dff.columns = [5, 4, 3, 2, 1]
        f_ratings = f_ratings.drop(f_ratings.columns[1:], axis=1)
        f_ratings['rating'] = dff.idxmax(axis=1)
        f_ratings = pd.merge(f_ratings,
                             feature_stars_restricted,
                             how='left',
                             on=['item_id'])
        df_clk_rat_star = pd.merge(clk_expanded,
                                   f_ratings,
                                   how='left',
                                   on='item_id')

        # expand prices
        df_clk_rat_star.prices = df_clk_rat_star.prices.str.split('|')
        curr_user = '******'
        curr_sess = '_'
        pos = 0
        price_expanded = []
        for t in tqdm(
                zip(df_clk_rat_star.user_id, df_clk_rat_star.session_id,
                    df_clk_rat_star.prices)):
            # check whether we are still in the same session
            if curr_user != t[0] or curr_sess != t[1]:
                pos = 0
                curr_user = t[0]
                curr_sess = t[1]
            else:
                pos += 1
            price_expanded.append(t[2][pos])
        df_clk_rat_star['price'] = price_expanded
        df_clk_rat_star = df_clk_rat_star.drop(['prices'], 1)
        df_clk_rat_star.stars = df_clk_rat_star.stars.astype(int)

        # fill missing stars values with the session mean
        avg = df_clk_rat_star[['user_id', 'session_id', 'stars']]
        avg = avg.loc[avg.stars != 0]  # the mean must be computed only over non-zero star values
        avg = pd.DataFrame(
            avg.groupby(['user_id', 'session_id'])['stars'].progress_apply(
                lambda x: int(x.sum() / x.size))).fillna(0)
        avg = avg.rename(columns={'stars': 'stars_avg'})
        avg['stars_avg'] = avg['stars_avg'].astype(int)
        no_stars = df_clk_rat_star.loc[df_clk_rat_star.stars == 0,
                                       ['user_id', 'session_id', 'item_id']]
        stars_filled = pd.merge(no_stars,
                                avg,
                                how='left',
                                on=['user_id', 'session_id']).fillna(0)
        stars_filled['stars_avg'] = stars_filled['stars_avg'].astype(int)
        df_clk_rat_star_filled = pd.merge(
            df_clk_rat_star,
            stars_filled,
            how='left',
            on=['user_id', 'session_id', 'item_id'])
        for t in zip(df_clk_rat_star_filled.stars,
                     df_clk_rat_star_filled.stars_avg,
                     df_clk_rat_star_filled.index):
            if t[0] == 0:
                df_clk_rat_star_filled.at[t[2], 'stars'] = t[1]
        df_clk_rat_star_filled = df_clk_rat_star_filled.drop('stars_avg', 1)

        # now fill missing values for rating
        avg = df_clk_rat_star_filled[['user_id', 'session_id', 'rating']]
        avg.rating = avg.rating.astype(int)
        avg = avg.loc[avg.rating != 1]  # the mean must be computed only over actually-rated items
        avg = pd.DataFrame(
            avg.groupby(['user_id', 'session_id'])['rating'].progress_apply(
                lambda x: int(x.sum() / x.size))).fillna(0)
        avg = avg.rename(columns={'rating': 'rating_avg'})
        avg['rating_avg'] = avg['rating_avg'].astype(int)
        no_rat = df_clk_rat_star.loc[df_clk_rat_star.rating == 1,
                                     ['user_id', 'session_id', 'item_id']]
        rat_filled = pd.merge(no_rat,
                              avg,
                              how='left',
                              on=['user_id', 'session_id']).fillna(0)
        rat_filled['rating_avg'] = rat_filled['rating_avg'].astype(int)
        df_clk_rat_star_rat_filled = pd.merge(
            df_clk_rat_star_filled,
            rat_filled,
            how='left',
            on=['user_id', 'session_id', 'item_id'])
        for t in zip(df_clk_rat_star_rat_filled.rating,
                     df_clk_rat_star_rat_filled.rating_avg,
                     df_clk_rat_star_rat_filled.index):
            if t[0] == 1:
                df_clk_rat_star_rat_filled.at[t[2], 'rating'] = t[1]
        df_clk_rat_star_rat_filled = df_clk_rat_star_rat_filled.drop(
            'rating_avg', 1)

        # add feature column
        new_col = []
        df_clk_rat_star_rat_filled.rating = df_clk_rat_star_rat_filled.rating.astype(
            int)
        df_clk_rat_star_rat_filled.stars = df_clk_rat_star_rat_filled.stars.astype(
            int)
        df_clk_rat_star_rat_filled.price = df_clk_rat_star_rat_filled.price.astype(
            int)

        for t in tqdm(
                zip(df_clk_rat_star_rat_filled.rating,
                    df_clk_rat_star_rat_filled.stars,
                    df_clk_rat_star_rat_filled.price)):
            new_col.append((1.5 * t[0] + t[1]) / t[2])
        df_clk_rat_star_rat_filled['price_quality'] = new_col
        final_feature = df_clk_rat_star_rat_filled[[
            'user_id', 'session_id', 'item_id', 'price_quality'
        ]]

        return final_feature
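The price/quality score is (1.5 * rating + stars) / price, so, for example, an item with rating 4, 3 stars and price 70 scores (1.5*4 + 3)/70 ≈ 0.129; the cheaper of two equally rated items always scores higher.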
Example #19
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        # get only non-last-clickout clickout rows
        last_clickout_indices = find(df)
        last_clk_removed_df = df.drop(last_clickout_indices)
        reference_rows = last_clk_removed_df[
            (last_clk_removed_df.reference.str.isnumeric() == True)
            & (last_clk_removed_df.action_type == 'clickout item')][[
                'user_id', 'session_id', 'reference', 'impressions'
            ]]

        # get the impressions
        impression_lists = reference_rows.impressions.str.split('|').tolist()
        big_list = [x for l in impression_lists
                    for x in l]  # flatten the multi-dim list into a 1-dim list
        c = dict(
            Counter(big_list)
        )  # count the occurrences of each accommodation in the impression lists

        # create df from dictionary: for each accommodation, the number of times it appears in impressions
        df_times_in_impressions = pd.DataFrame.from_dict(
            c, orient='index', columns=['number_of_times_in_impr'])
        df_times_in_impressions[
            'item_id'] = df_times_in_impressions.index.astype(int)
        df_times_in_impressions = df_times_in_impressions.reindex(
            columns=['item_id', 'number_of_times_in_impr'])

        # get number of times an accomodation has been clicked
        df_item_clicks = (reference_rows.groupby(
            ["reference"]).size().reset_index(name="n_clickouts"))
        df_item_clicks = df_item_clicks.rename(
            columns={'reference': 'item_id'})
        df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)

        # merge the two df
        merged = pd.merge(df_times_in_impressions,
                          df_item_clicks,
                          how='left',
                          on=['item_id']).fillna(0)
        merged.n_clickouts = merged.n_clickouts.astype(int)
        merged['perc_click_appeared'] = round(
            (merged.n_clickouts * 100) / (merged.number_of_times_in_impr), 2)

        # create the feature for each item
        feature_per_item = merged[['item_id', 'perc_click_appeared']]

        # use the feature for each last clickout
        clickout_rows = df.loc[last_clickout_indices,
                               ['user_id', 'session_id', 'impressions']]
        clk_expanded = expand_impressions(clickout_rows)
        final_feature = pd.merge(clk_expanded,
                                 feature_per_item,
                                 how='left',
                                 on=['item_id']).fillna(0)
        final_feature = final_feature[[
            'user_id', 'session_id', 'item_id', 'perc_click_appeared'
        ]]

        return final_feature
Example #20
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # get last clickout rows
        last_clickout_indices = find(df)
        clickout_rows = df.loc[
            last_clickout_indices,
            ['user_id', 'session_id', 'city', 'reference', 'impressions']][
                df.action_type == 'clickout item']
        # get reference rows WITH last clickout
        reference_rows = df[(df.reference.str.isnumeric() == True)
                            & (df.action_type == 'clickout item')]
        # compute popularity WITH last clickout
        df_item_clicks = (reference_rows.groupby(
            ["reference",
             "city"]).size().reset_index(name="n_interactions_per_item"))
        df_item_clicks = df_item_clicks.rename(
            columns={'reference': 'item_id'})
        df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)

        df_city_clicks = (reference_rows.groupby('city').size().reset_index(
            name="n_interactions_per_city"))
        # merge clickout rows expanded with the popularity dataframes
        merged_df = pd.merge(df_item_clicks,
                             df_city_clicks,
                             how='left',
                             on=['city']).fillna(0)
        clk_expanded = expand_impressions(clickout_rows)
        feature = pd.merge(clk_expanded,
                           merged_df,
                           how='left',
                           on=['item_id', 'city']).fillna(0)
        # compute the adjusted percentage of clicks per city
        new_col = []
        feature.reference = feature.reference.astype(int)
        feature.item_id = feature.item_id.astype(int)
        for t in tqdm(
                zip(feature.reference, feature.item_id,
                    feature.n_interactions_per_item,
                    feature.n_interactions_per_city)):
            if t[0] == t[1]:  # this is the clicked item
                if t[3] != 1:
                    percentage_of_total_city_clk = round(
                        ((t[2] - 1) * 100.0) / (t[3] - 1), 5)
                else:
                    percentage_of_total_city_clk = 0
            else:  # this is not the clicked item
                if 0 not in [t[2], t[3]] and t[3] != 1:
                    percentage_of_total_city_clk = round(
                        (t[2] * 100.0) / (t[3] - 1),
                        5)  # still subtract the current click from the city total
                else:
                    percentage_of_total_city_clk = 0
            new_col.append(percentage_of_total_city_clk)
        feature['adj_percentage_of_total_city_clk'] = new_col
        feature.adj_percentage_of_total_city_clk = feature.adj_percentage_of_total_city_clk.astype(
            float)
        final_feature_reduced = feature[[
            'user_id', 'session_id', 'item_id',
            'adj_percentage_of_total_city_clk'
        ]]

        return final_feature_reduced
Example #21
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # first step: get all the platforms
        platforms = sorted(df.platform.unique().tolist())
        # create df that for each plat will hold the feature vector
        df_plat_feature = pd.DataFrame(
            columns=['platform', 'properties_array'])
        df_plat_feature['platform'] = platforms
        # get the last clickout indices and all the numeric clickout rows
        last_indices = find(df)
        df_clickout = df[(df.reference.str.isnumeric() == True)
                         & (df['action_type'] == 'clickout item')][[
                             'reference', 'platform'
                         ]]
        df_clickout = df_clickout.rename(columns={'reference': 'item_id'})
        df_clickout.item_id = df_clickout.item_id.astype(int)
        # get the item metadata in one hot
        o = ImpressionFeature(mode=self.mode)
        df_accomodations = o.read_feature(True)
        df_accomodations = df_accomodations.drop([
            'properties1 Star', 'properties2 Star', 'properties3 Star',
            'properties4 Star', 'properties5 Star'
        ], 1)
        # merge clickouts dataframe with the metadata
        df_clicks_properties = pd.merge(df_clickout,
                                        df_accomodations,
                                        how='left',
                                        on=['item_id'])
        # extract the one-hot encoded features into a numpy array
        array = df_accomodations.drop(['item_id'], axis=1).values
        # for each item append the features as numpy array
        df_item_features = pd.DataFrame(columns=['item_id', 'features_array'])
        df_item_features['item_id'] = df_accomodations['item_id'].values
        df_item_features['features_array'] = list(array)
        # for each column compute the sum of all the clickout-rows' features
        new_col = []  # which will hold the platform feature vector
        for p in tqdm(platforms):
            df_clicks_properties_per_plat = df_clicks_properties[
                df_clicks_properties.platform == p]
            df_clicks_properties_per_plat = df_clicks_properties_per_plat.drop(
                ['item_id', 'platform'], axis=1)
            df_sum = df_clicks_properties_per_plat.sum()
            # this check is needed because some platforms never appear in clickouts;
            # for those, use a vector of zeros
            if df_clicks_properties_per_plat.shape[0] != 0:
                df_sum = df_sum.apply(
                    lambda x: x / df_clicks_properties_per_plat.shape[0])
                plat_feature = df_sum.values
            else:
                plat_feature = np.asarray(
                    [0] * df_clicks_properties_per_plat.shape[1])
            new_col.append(plat_feature)
        df_plat_feature['properties_array'] = new_col

        # now take the last clickout rows and expand on the impression list
        clickout_rows = df.loc[last_indices, [
            'user_id', 'session_id', 'platform', 'action_type', 'impressions'
        ]][df.action_type == 'clickout item']
        clk_expanded = expand_impressions(clickout_rows)
        clk_expanded = clk_expanded.drop(['index', 'action_type'], axis=1)
        # for each impression, add the feature vector of the platform and the feature vector of the impression
        clk_expanded_wt_plat_feat = pd.merge(clk_expanded,
                                             df_plat_feature,
                                             how='left',
                                             on=['platform'])
        final_feature = pd.merge(clk_expanded_wt_plat_feat,
                                 df_item_features,
                                 how='left',
                                 on=['item_id'])
        # compute the similarity between the impression's feature vector and the plat feature vector
        new_col = []
        if self.metric == 'cosine':
            shrink = 5  # TRY ME
            for t in tqdm(
                    zip(final_feature.properties_array,
                        final_feature.features_array)):
                new_col.append(
                    cosine_similarity(t[0].astype(np.double),
                                      t[1].astype(np.double), shrink))

        final_feature = final_feature[['user_id', 'session_id', 'item_id']].copy()
        final_feature['adj_platform_features_similarity'] = new_col
        return final_feature
Example #22
def merge_features_tf(mode, cluster, features_array, stacking_scores_path):

    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df

    # retrieve the indices of the last clickouts
    print('find_last_click_idxs')
    last_click_idxs=find_last_clickout_indices(full_df)

    # filter on the found indices, obtaining only the rows of a last clickout
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()

    print('retrieve vali_idxs')
    # if the mode is full we don't have validation; if the mode is small or local,
    # validation is performed on the target indices

    vali_test_idxs = data.target_indices(mode, cluster)


    # construct the validation train and test df_base
    print('construct test and vali df')
    validation_test_df = click_df.loc[vali_test_idxs]

    all_idxs = click_df.index.values

    # find the differences
    print('construct train df')
    train_idxs = np.setdiff1d(all_idxs, vali_test_idxs, assume_unique=True)
    train_df = click_df.loc[train_idxs]

    # expand the impression as rows
    print('expand the impression')
    train_df = expand_impressions(train_df)[['user_id', 'session_id', 'item_id', 'index']]
    train_df['dummy_step']=np.arange(len(train_df))
    validation_test_df = expand_impressions(validation_test_df)[['user_id', 'session_id', 'item_id', 'index']]
    validation_test_df['dummy_step'] = np.arange(len(validation_test_df))

    # do the join
    print('join with the features')
    print(f'train_shape: {train_df.shape}\n vali_test_shape: {validation_test_df.shape}')
    context_features_id = []
    for f in features_array:
        if type(f) == tuple:
            feature = f[0](mode=mode, cluster='no_cluster').read_feature(one_hot=f[1])
        else:
            feature = f(mode=mode, cluster='no_cluster').read_feature(one_hot=True)
        print(f'columns of the feature:\n {feature.columns}')
        print(f'NaN values are: {feature.isnull().values.sum()}')
        # if there are any, fill them with -1
        feature.fillna(-1, inplace=True)
        # check if it is a feature of the impression
        if 'item_id' not in feature.columns:
            for i in range(train_df.shape[1]-6+1, train_df.shape[1]-6+1+feature.shape[1]-2, 1):
                context_features_id.append(str(i))
        print(f'session features names:{context_features_id}')
        print(f'shape of feature: {feature.shape}')
        print(f'len of feature:{len(feature)}')
        train_df = train_df.merge(feature)
        validation_test_df = validation_test_df.merge(feature)
        print(f'train_shape: {train_df.shape}\n vali_shape: {validation_test_df.shape}')

    if len(stacking_scores_path)>1:
        for path in stacking_scores_path:
            score = pd.read_csv(path)
            cols = [c for c in score.columns if c in ['user_id', 'session_id', 'item_id'] or 'score' in c]
            score = score[cols]
            #if 'rnn' in path:
            score = score.groupby(['user_id', 'session_id', 'item_id'], as_index=False).last()
            train_df = train_df.merge(score, on=['user_id', 'session_id', 'item_id'], how='left')
            validation_test_df = validation_test_df.merge(score, on=['user_id', 'session_id', 'item_id'], how='left')
            print(f'train_shape: {train_df.shape}\n vali_shape: {validation_test_df.shape}')

    train_df.fillna(0, inplace=True)
    validation_test_df.fillna(0, inplace=True)

    print('sorting by index and step...')
    # sort the dataframes
    train_df.sort_values(['index', 'dummy_step'], inplace=True)
    train_df.drop('dummy_step', axis=1, inplace=True)
    validation_test_df.sort_values(['index', 'dummy_step'], inplace=True)
    validation_test_df.drop('dummy_step', axis=1, inplace=True)

    print('after join')
    return train_df, validation_test_df, np.array(context_features_id)
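A hypothetical invocation of merge_features_tf, assuming feature classes that expose the constructor and read_feature interface used in the loop above (the class names below are placeholders, not real classes of this project):

features = [SomeSessionFeature, (SomeImpressionFeature, False)]  # placeholder feature classes
train_df, validation_test_df, context_ids = merge_features_tf(
    mode='local', cluster='no_cluster',
    features_array=features, stacking_scores_path=[])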
Example #23
    def extract_feature(self):
        def get_pos(item, rec):
            res = np.empty(item.shape)
            for i in tqdm(range(len(item))):
                if str(item[i]) in rec[i]:
                    res[i] = rec[i].index(str(item[i])) + 1
                else:
                    res[i] = -1
            return res.astype(int)
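        # e.g. with item = np.array([23, 99]) and rec an object array holding
        # the lists ['11', '23'] and ['11', '23'], get_pos returns [2, -1]:
        # '23' is the 2nd impression, '99' never appears (illustrative values)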

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # get ALL the clk rows with also last clickouts
        all_clk_rows = df[(df.reference.str.isnumeric() == True)
                          & (df.action_type == 'clickout item')][[
                              'user_id', 'session_id', 'reference',
                              'impressions'
                          ]]
        all_clk_rows.impressions = all_clk_rows.impressions.str.split('|')
        # add the position
        pos_col = get_pos(all_clk_rows.reference.values,
                          all_clk_rows.impressions.values)
        all_clk_rows['position'] = pos_col
        all_clk_rows = all_clk_rows.drop('impressions', axis=1)
        # compute the click popularity for each position bucket
        df_clicks_1_pos = (all_clk_rows[all_clk_rows.position == 1].groupby(
            "reference").size().reset_index(name="pop_1_pos"))
        df_clicks_1_pos.reference = df_clicks_1_pos.reference.astype(int)
        df_clicks_1_pos = df_clicks_1_pos.rename(
            columns={'reference': 'item_id'})
        # pos 2 to 5
        df_clicks_2to5_pos = (all_clk_rows[(all_clk_rows.position > 1)
                                           & (all_clk_rows.position <= 5)].
                              groupby("reference").size().reset_index(
                                  name="pop_2to5_pos"))
        df_clicks_2to5_pos.reference = df_clicks_2to5_pos.reference.astype(int)
        df_clicks_2to5_pos = df_clicks_2to5_pos.rename(
            columns={'reference': 'item_id'})
        # pos 6 to 10
        df_clicks_6to10_pos = (all_clk_rows[(all_clk_rows.position > 5)
                                            & (all_clk_rows.position <= 10)].
                               groupby("reference").size().reset_index(
                                   name="pop_6to10_pos"))
        df_clicks_6to10_pos.reference = df_clicks_6to10_pos.reference.astype(
            int)
        df_clicks_6to10_pos = df_clicks_6to10_pos.rename(
            columns={'reference': 'item_id'})
        # pos 11 to 15
        df_clicks_11to15_pos = (all_clk_rows[(all_clk_rows.position > 10)
                                             & (all_clk_rows.position <= 15)].
                                groupby("reference").size().reset_index(
                                    name="pop_11to15_pos"))
        df_clicks_11to15_pos.reference = df_clicks_11to15_pos.reference.astype(
            int)
        df_clicks_11to15_pos = df_clicks_11to15_pos.rename(
            columns={'reference': 'item_id'})
        # pos 16 to 25
        df_clicks_16to25_pos = (all_clk_rows[(all_clk_rows.position > 15)
                                             & (all_clk_rows.position <= 25)].
                                groupby("reference").size().reset_index(
                                    name="pop_16to25_pos"))
        df_clicks_16to25_pos.reference = df_clicks_16to25_pos.reference.astype(
            int)
        df_clicks_16to25_pos = df_clicks_16to25_pos.rename(
            columns={'reference': 'item_id'})
        # now merge with the last clickouts expanded
        last_clickout_indices = find(df)
        last_clk_rows = df.loc[
            last_clickout_indices,
            ['user_id', 'session_id', 'reference', 'impressions']]
        last_clk_rows['imp_list'] = last_clk_rows.impressions.str.split('|')
        clk_expanded = expand_impressions(last_clk_rows)
        clk_expanded = clk_expanded.drop('index', axis=1)
        # add position
        pos_col = get_pos(clk_expanded.item_id.values,
                          clk_expanded.imp_list.values)
        clk_expanded['position'] = pos_col
        clk_expanded = clk_expanded.drop('imp_list', axis=1)
        # merge :)
        merged = pd.merge(clk_expanded,
                          df_clicks_1_pos,
                          how='left',
                          on='item_id').fillna(0)
        merged = pd.merge(merged, df_clicks_2to5_pos, how='left',
                          on='item_id').fillna(0)
        merged = pd.merge(merged,
                          df_clicks_6to10_pos,
                          how='left',
                          on='item_id').fillna(0)
        merged = pd.merge(merged,
                          df_clicks_11to15_pos,
                          how='left',
                          on='item_id').fillna(0)
        merged = pd.merge(merged,
                          df_clicks_16to25_pos,
                          how='left',
                          on='item_id').fillna(0)
        # add column of popularity per position bucket
        new_col = []
        for t in tqdm(
                zip(merged.position, merged.pop_1_pos, merged.pop_2to5_pos,
                    merged.pop_6to10_pos, merged.pop_11to15_pos,
                    merged.pop_16to25_pos)):
            if t[0] == 1:
                new_col.append(t[1])
            elif 1 < t[0] <= 5:
                new_col.append(t[2])
            elif 5 < t[0] <= 10:
                new_col.append(t[3])
            elif 10 < t[0] <= 15:
                new_col.append(t[4])
            elif 15 < t[0] <= 25:
                new_col.append(t[5])
        merged['pop_per_pos'] = new_col
        merged = merged.drop([
            'pop_1_pos', 'pop_2to5_pos', 'pop_6to10_pos', 'pop_11to15_pos',
            'pop_16to25_pos'
        ],
                             axis=1)
        # now count the number of times each item is shown in each position bucket
        all_clks = df[(df.reference.str.isnumeric() == True)
                      & (df.action_type == 'clickout item')][[
                          'user_id', 'session_id', 'impressions'
                      ]]
        all_clks['imp_list'] = all_clks.impressions.str.split('|')
        all_clk_rows_expanded = expand_impressions(all_clks)
        pos_col = get_pos(all_clk_rows_expanded.item_id.values,
                          all_clk_rows_expanded.imp_list.values)
        all_clk_rows_expanded['position'] = pos_col
        # first pos
        all_clk_rows_expanded = all_clk_rows_expanded[[
            'user_id', 'session_id', 'item_id', 'position'
        ]]
        df_impressions_1 = (
            all_clk_rows_expanded[all_clk_rows_expanded.position == 1].groupby(
                ["item_id"]).size().reset_index(name="n_times_in_position_1"))
        df_impressions_1.item_id = df_impressions_1.item_id.astype(int)
        # pos 2 to 5
        df_impressions_2to5 = (all_clk_rows_expanded[
            (all_clk_rows_expanded.position > 1)
            & (all_clk_rows_expanded.position <= 5)].groupby([
                "item_id"
            ]).size().reset_index(name="n_times_in_position_2to5"))
        df_impressions_2to5.item_id = df_impressions_2to5.item_id.astype(int)
        # pos 6 to 10
        df_impressions_6to10 = (all_clk_rows_expanded[
            (all_clk_rows_expanded.position > 5)
            & (all_clk_rows_expanded.position <= 10)].groupby([
                "item_id"
            ]).size().reset_index(name="n_times_in_position_6to10"))
        df_impressions_6to10.item_id = df_impressions_6to10.item_id.astype(int)
        # pos 11 to 15
        df_impressions_11to15 = (all_clk_rows_expanded[
            (all_clk_rows_expanded.position > 10)
            & (all_clk_rows_expanded.position <= 15)].groupby([
                "item_id"
            ]).size().reset_index(name="n_times_in_position_11to15"))
        df_impressions_11to15.item_id = df_impressions_11to15.item_id.astype(int)
        # pos 16 to 25
        df_impressions_16to25 = (all_clk_rows_expanded[
            (all_clk_rows_expanded.position > 15)
            & (all_clk_rows_expanded.position <= 25)].groupby([
                "item_id"
            ]).size().reset_index(name="n_times_in_position_16to25"))
        df_impressions_16to25.item_id = df_impressions_16to25.item_id.astype(int)
        # merge with the expanded last clickouts
        merged = pd.merge(merged, df_impressions_1, how='left',
                          on='item_id').fillna(0)
        merged = pd.merge(merged,
                          df_impressions_2to5,
                          how='left',
                          on='item_id').fillna(0)
        merged = pd.merge(merged,
                          df_impressions_6to10,
                          how='left',
                          on='item_id').fillna(0)
        merged = pd.merge(merged,
                          df_impressions_11to15,
                          how='left',
                          on='item_id').fillna(0)
        merged = pd.merge(merged,
                          df_impressions_16to25,
                          how='left',
                          on='item_id').fillna(0)
        # add the new column
        new_col = []
        for t in tqdm(
                zip(merged.position, merged.n_times_in_position_1,
                    merged.n_times_in_position_2to5,
                    merged.n_times_in_position_6to10,
                    merged.n_times_in_position_11to15,
                    merged.n_times_in_position_16to25)):
            if t[0] == 1:
                new_col.append(t[1])
            elif 1 < t[0] <= 5:
                new_col.append(t[2])
            elif 5 < t[0] <= 10:
                new_col.append(t[3])
            elif 10 < t[0] <= 15:
                new_col.append(t[4])
            elif 15 < t[0] <= 25:
                new_col.append(t[5])
        merged['n_times_impr'] = new_col
        merged = merged.drop([
            'n_times_in_position_1', 'n_times_in_position_2to5',
            'n_times_in_position_6to10', 'n_times_in_position_11to15',
            'n_times_in_position_16to25'
        ], axis=1)
        # now compute the feature, remembering to:
        # - subtract 1 from the popularity of the clicked item
        # - subtract 1 from each impression count (because the counts are
        #   computed on the whole dataset, including the clickout to predict)
        new_col = []
        merged.reference = merged.reference.astype(int)
        merged.item_id = merged.item_id.astype(int)
        for t in tqdm(
                zip(merged.reference, merged.item_id, merged.pop_per_pos,
                    merged.n_times_impr)):
            if t[3] > 1:
                if t[0] == t[1]:
                    new_col.append(((t[2] - 1) * 100) / (t[3] - 1))
                else:
                    new_col.append(((t[2]) * 100) / (t[3] - 1))
            else:
                new_col.append(0)
        merged['perc_click_per_pos'] = new_col

        return merged[[
            'user_id', 'session_id', 'item_id', 'perc_click_per_pos'
        ]]
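
An illustrative computation of perc_click_per_pos for a single impression,
with made-up counts; note the minus-1 corrections described in the comments
above:

# made-up counts for one impression
pop_per_pos = 12    # clicks received by the item in its position bucket (whole dataset)
n_times_impr = 40   # times the item was shown in that bucket (whole dataset)
clicked = True      # the clickout being predicted landed on this item

if n_times_impr > 1:
    clicks = pop_per_pos - 1 if clicked else pop_per_pos
    perc_click_per_pos = clicks * 100 / (n_times_impr - 1)
else:
    perc_click_per_pos = 0
print(perc_click_per_pos)  # ~28.2
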
    def extract_feature(self):
        """
        Computes all user features.
        Must distinsuish between past sessions and future sessions, and for each compute same features.
        This will help understand the moves of the user through the impressions
        """
        train_df = data.train_df(mode=self.mode, cluster=self.cluster)
        test_df = data.test_df(mode=self.mode, cluster=self.cluster)
        test_df = test_df.fillna(0)
        df = pd.concat([train_df, test_df])

        print('Adjusting malformed sessions (session_bastarde) ...')
        df_to_correct = df[df.session_id.isin(session_bastarde)]
        df = df[~df.session_id.isin(session_bastarde)]

        for i in tqdm(df_to_correct.index):
            if df_to_correct.at[i, 'step'] > dict_sess_bast[df_to_correct.at[i, 'session_id']]:
                df_to_correct = df_to_correct.drop(i, axis=0)

        df = pd.concat([df, df_to_correct])

        df.sort_values(by=['user_id', 'session_id', 'timestamp'], inplace=True)
        df = df.reset_index(drop=True)

        i = 0

        idxs_click = find_last_clickout(df)

        users = df.user_id.values

        pbar = tqdm(total=len(idxs_click))

        # I will need a copy when iterating later
        idx_to_compute = idxs_click.copy()

        while i < idxs_click[-1]:
            initial_i = i

            user = df.at[i, 'user_id']

            # Get all user sessions indices
            for u in users[i:]:
                if u != user:
                    break
                i += 1

            # Now start creating the features for every session

            sessions_user_idxs = []
            while len(idx_to_compute) > 0 and idx_to_compute[0] < i:
                sessions_user_idxs += [idx_to_compute.pop(0)]
            sessions_count = len(sessions_user_idxs)

            if sessions_count > 1:
                # Start computing features: keep the last clickouts over which to iterate
                user_sessions_df = df.iloc[sessions_user_idxs, :]

                df_only_user = df.iloc[initial_i:i, :]

                df_only_user = df_only_user.reset_index(drop=True)

                # Iterating over clickouts to predict of session: computing features
                for idx, row in user_sessions_df.iterrows():
                    curr_session = row.session_id

                    # Get a session, get the impressions
                    impressions = list(map(str, row.impressions.split('|')))

                    df_samecity = df_only_user  # [df_only_user.city == row.city]
                    idx = list(df_samecity.session_id.values).index(curr_session)


                    # Get the indices where the considered session starts and ends
                    idx_session_initial = idx
                    idx_session_final = len(df_samecity) - list(df_samecity.session_id.values)[::-1].index(curr_session)

                    if df_samecity.index.values[0] < idx_session_initial:
                        temp_df = df_samecity.iloc[0:idx_session_initial, :]
                        self.compute_past_sessions_feat(
                            temp_df[temp_df.city == row.city], impressions,
                            int(df_only_user.at[idx_session_initial, 'timestamp']))
                    else:
                        self.add_empty_features(impressions, 'past')

                    tm_clk = int(row['timestamp'])
                    df_samecity = df_samecity.iloc[idx_session_final:len(df_samecity), :]

                    df_samecity = df_samecity[df_samecity.city == row.city]
                    if len(df_samecity) > 0:
                        self.compute_future_sessions_feat(df_samecity, impressions,
                                                          tm_clk)
                    else:
                        self.add_empty_features(impressions, 'future')

            else:
                # Return all features as -1, if at least one session exists
                if sessions_count == 1:
                    # Case one session for one user:
                    clk_idx = sessions_user_idxs[0]
                    impressions = df.at[clk_idx, 'impressions'].split('|')
                    self.add_empty_features(impressions, 'both')

            pbar.update(sessions_count)

        pbar.close()

        df = expand_impressions(df.iloc[idxs_click, :][['user_id', 'session_id', 'reference', 'impressions']])

        for key in self.features.keys():
            print(key, len(self.features[key]))
            df[key] = self.features[key]

        print('Correcting feature: add duplicate sessions with underscore...')
        label_feat = pd.read_csv('dataset/preprocessed/{}/{}/feature/impression_label/features.csv'.format(self.cluster, self.mode))
        df = self.adjust_features(df, label_feat)

        df.drop(['index', 'reference'], axis=1, inplace=True)
        return df
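
A toy illustration of the past/future split the method relies on: the
interactions of one user are partitioned around the timestamp of the clickout
to predict (illustrative data; the real code also filters by city):

import pandas as pd

# toy interactions for one user; the clickout to predict happens at t = 100
user_df = pd.DataFrame({
    'session_id': ['s1', 's1', 's2', 's3'],
    'timestamp': [40, 60, 100, 130],
    'action_type': ['interaction item image', 'clickout item',
                    'clickout item', 'interaction item image'],
})
clk_ts = 100

past = user_df[user_df.timestamp < clk_ts]    # sessions before the clickout
future = user_df[user_df.timestamp > clk_ts]  # sessions after the clickout
print(len(past), len(future))  # 2 1
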
Example #25
0
    def extract_feature(self):

        ######### READING DATA
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])


        ######## SOME PREPROCESSING + SECONDARY DATA STRUCTURES TO SPEED UP PERFORMANCE
        clickout_indices = find(df)
        clickout_df = df.loc[clickout_indices]
        clickout_sessions = list(clickout_df.session_id)
        session_to_impressions = dict()
        user_to_sessions = dict()
        session_to_timestamp = dict()
        for t in tqdm(zip(clickout_df.session_id, clickout_df.impressions, clickout_df.user_id, clickout_df.timestamp)):
            if t[2] not in user_to_sessions:
                user_to_sessions[t[2]] = list()
            user_to_sessions[t[2]] += [t[0]]
            session_to_impressions[t[0]] = list(map(int, t[1].split("|")))
            session_to_timestamp[t[0]] = t[3]
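        # e.g. after this loop (illustrative values):
        #   user_to_sessions       = {'u1': ['s1', 's2']}
        #   session_to_impressions = {'s1': [101, 102, 103]}
        #   session_to_timestamp   = {'s1': 1541000000}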

        # Clean df of clickout sessions, of non-numeric references (e.g. 'Castellammare di Stabia, NA')
        # and of users which are not in the test set
        clean_df = df[(~df.session_id.isin(clickout_sessions)) & (df.reference.apply(lambda x: type(x) == str and x.isdigit())) \
                        & (df.user_id.isin(user_to_sessions.keys()))].copy()
        clean_df["reference"] = pd.to_numeric(clean_df["reference"])
        grouped = clean_df.groupby("user_id")
        session_to_df = dict()
        for name, group in tqdm(grouped, desc="Scanning users and create enriched sessions dataframe"):
            group = group.sort_values("timestamp")
            sessions = user_to_sessions[name]
            # Attach to each session a small df containing only the rows useful for the computation of the feature
            for s in sessions:
                imps = session_to_impressions[s]
                temp = group[group.reference.isin(imps)]
                session_to_df[s] = temp

        print(len(session_to_df))

        #### FEATURE KERNEL

        # Action mapping on indices
        # 0 -> time_from_last_interaction
        # Action <-> index of the array
        time_last_interaction_past = 0
        action_dict_past = {
            "search for item" : 1,
            "interaction item image" : 2,
            "interaction item info" : 3,
            "interaction item deals" : 4,
            "interaction item rating" : 5,
            "clickout item" : 6}
        time_first_interaction_future = 7
        action_dict_future = {
            "search for item" : 8,
            "interaction item image" : 9,
            "interaction item info" : 10,
            "interaction item deals" :11,
            "interaction item rating" : 12,
            "clickout item" : 13}
        imp_to_actions = dict()
        session_to_feature = dict()
        for k, v in tqdm(session_to_timestamp.items(), desc="Scanning sessions to generate feature"):
            # if not, we have no past or future information from that user for those impressions
            if k in session_to_df:
                temp = session_to_df[k]
                past = temp[temp.timestamp <= v].sort_values("timestamp")
                future = temp[temp.timestamp > v].sort_values("timestamp", ascending=False)
                imps = session_to_impressions[k]
                imp_to_actions = dict()
                for i in imps:
                    # + 2 due to "time_from_last_interaction", both past and future
                    imp_to_actions[i] = np.zeros(len(action_dict_past) + len(action_dict_future) + 2)
                    imp_to_actions[i][time_last_interaction_past] = -1
                    imp_to_actions[i][time_first_interaction_future] = -1
                for t in zip(past.reference, past.action_type, past.timestamp):
                    imp = t[0]
                    action_index = action_dict_past[t[1]]
                    imp_to_actions[imp][time_last_interaction_past] = v - t[2]
                    imp_to_actions[imp][action_index] += 1
                for t in zip(future.reference, future.action_type, future.timestamp):
                    imp = t[0]
                    action_index = action_dict_future[t[1]]
                    imp_to_actions[imp][time_first_interaction_future] = t[2] - v
                    imp_to_actions[imp][action_index] += 1
                session_to_feature[k] = imp_to_actions

        #### UNROLLING DICT TO DATAFRAME
        lines = list()
        for k, v in tqdm(session_to_feature.items(), desc="Dicts to dataframe"):
            for imp, feature in v.items():
                lines.append([k] + [imp] + list(feature))
        new_df = pd.DataFrame(lines, columns=["session_id", "item_id", "time_last_past_interaction", "search_for_item_past",
                                              "interaction_item_image_past", "interaction_item_info_past", "interaction_item_deals_past",
                                              "interaction_item_rating_past", "clickout_item_past", "time_first_future_interaction", "search_for_item_future",
                                               "interaction_item_image_future", "interaction_item_info_future", "interaction_item_deals_future",
                                              "interaction_item_rating_future", "clickout_item_future"])

        #### MERGING WITH MAIN DATAFRAME

        clickout_df = clickout_df[['user_id', 'session_id', 'impressions']]
        clk_expanded = expand_impressions(clickout_df)
        print("Temp feature (only rows not null) shape: {}".format(new_df.shape))
        print("Expanded dataframe shape: {}".format(clk_expanded.shape))
        feature = pd.merge(clk_expanded, new_df, how="left")
        feature[["search_for_item_past","interaction_item_image_past", "interaction_item_info_past", "interaction_item_deals_past",
                "interaction_item_rating_past", "clickout_item_past", "search_for_item_future",
                "interaction_item_image_future", "interaction_item_info_future", "interaction_item_deals_future",
                "interaction_item_rating_future", "clickout_item_future"]] = \
            feature[["search_for_item_past","interaction_item_image_past", "interaction_item_info_past", "interaction_item_deals_past",
                "interaction_item_rating_past", "clickout_item_past", "search_for_item_future",
                "interaction_item_image_future", "interaction_item_info_future", "interaction_item_deals_future",
                "interaction_item_rating_future", "clickout_item_future"]].fillna(value=0)
        feature = feature.replace(-1, np.nan)
        print("Final feature shape: {}".format(feature.shape))
        return feature
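
A toy sketch of the per-impression vector built in the kernel above: slot 0
holds the time from the last past interaction, slots 1-6 the past action
counts, slot 7 the time to the first future interaction, slots 8-13 the future
counts (values are illustrative):

import numpy as np

# one vector per impression: 2 time slots + 6 past + 6 future action counters
vec = np.zeros(14)
vec[0] = -1  # time_from_last_past_interaction, -1 = item never seen in the past
vec[7] = -1  # time_to_first_future_interaction, -1 = item never seen in the future

# a past 'interaction item image' 35 seconds before the clickout (index 2)
vec[0] = 35
vec[2] += 1
print(vec[:7])  # [35.  0.  1.  0.  0.  0.  0.]
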
def merge_features_lgb(mode, cluster, features_array):

    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df

    # retrieve the indices of the last clickouts
    print('find_last_click_idxs')
    last_click_idxs = find_last_clickout_indices(full_df)

    # filter on the found indices, keeping only the last-clickout rows
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()

    print('retrieve vali_idxs')
    # if the mode is full we don't have a validation set; if the mode is small
    # or local, the validation is performed on the target indices

    vali_test_idxs = data.target_indices(mode, cluster)

    # construct the validation train and test df_base
    print('construct test and vali df')
    validation_test_df = click_df.loc[vali_test_idxs]

    all_idxs = click_df.index.values

    # find the differences
    print('construct train df')
    train_idxs = np.setdiff1d(all_idxs, vali_test_idxs, assume_unique=True)
    train_df = click_df.loc[train_idxs]

    # expand the impression as rows
    print('expand the impression')
    train_df = expand_impressions(train_df)[[
        'user_id', 'session_id', 'item_id', 'index'
    ]]
    train_df['dummy_step'] = np.arange(len(train_df))
    validation_test_df = expand_impressions(validation_test_df)[[
        'user_id', 'session_id', 'item_id', 'index'
    ]]
    validation_test_df['dummy_step'] = np.arange(len(validation_test_df))

    # do the join
    print('join with the features')
    print(
        f'train_shape: {train_df.shape}\n vali_test_shape: {validation_test_df.shape}'
    )
    time_joins = 0
    for f in features_array:
        _feature = f(mode=mode, cluster='no_cluster')
        feature = _feature.read_feature(one_hot=False)

        print(f'shape of feature: {feature.shape}\n')
        print(f'len of feature:{len(feature)}\n')

        start = time()
        train_df = train_df.merge(feature)
        validation_test_df = validation_test_df.merge(feature)
        print(f'time to do the join: {time()-start}')
        time_joins += time() - start
        print(
            f'train_shape: {train_df.shape}\n vali_shape: {validation_test_df.shape}'
        )

    print(f'total time to do joins: {time_joins}')

    print('sorting by index and step...')
    # sort the dataframes
    train_df.sort_values(['index', 'dummy_step'], inplace=True)
    train_df.drop('dummy_step', axis=1, inplace=True)
    validation_test_df.sort_values(['index', 'dummy_step'], inplace=True)
    validation_test_df.drop('dummy_step', axis=1, inplace=True)

    print('after join')
    return train_df, validation_test_df
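
A hedged sketch of how the returned frames might feed a LightGBM ranker: each
last clickout (the 'index' column) forms one query group. The label vector
train_labels is assumed to exist elsewhere; it is not produced by this
function:

import lightgbm as lgb

# one ranking group per last clickout: rows are already sorted by 'index'
group_sizes = train_df.groupby('index', sort=False).size().values

X = train_df.drop(['user_id', 'session_id', 'item_id', 'index'], axis=1)
dtrain = lgb.Dataset(X, label=train_labels, group=group_sizes)  # train_labels: assumed 0/1 clicked flags
model = lgb.train({'objective': 'lambdarank', 'metric': 'ndcg'}, dtrain)
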
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        platforms = df['platform'].unique().tolist()
        df_plat_feature = pd.DataFrame(columns=['platform','properties_array'])
        df_plat_feature['platform'] = platforms
        last_indices = find(df)
        df_non_last_clk = df.drop(last_indices)
        df_clickout = df_non_last_clk[(df_non_last_clk['action_type']=='clickout item')][['reference','platform']]
        df_clickout = df_clickout.rename(columns={'reference':'item_id'})
        df_clickout = df_clickout.dropna() # remove NaNs
        df_clickout.item_id = df_clickout.item_id.astype(int)
        o = ImpressionFeature(mode=self.mode)
        df_accomodations = o.read_feature(True)
        df_accomodations = df_accomodations.drop(['properties1 Star', 'properties2 Star', 'properties3 Star', 'properties4 Star', 'properties5 Star'], axis=1)

        df_clicks_properties = pd.merge(df_clickout, df_accomodations, how='left', on=['item_id'])
        array = df_accomodations.drop(['item_id'],axis=1).values
        df_item_features = pd.DataFrame(columns=['item_id','features_array'])
        df_item_features['item_id'] = df_accomodations['item_id'].values
        df_item_features['features_array'] = list(array)

        new_col = []
        for p in tqdm(platforms):
            df_clicks_properties_per_plat = df_clicks_properties[df_clicks_properties.platform == p]
            df_clicks_properties_per_plat = df_clicks_properties_per_plat.drop(['item_id','platform'], axis=1)
            df_sum = df_clicks_properties_per_plat.sum()
            if df_clicks_properties_per_plat.shape[0] != 0:  # the platform appears at least once
                plat_feature = df_sum.values
            else:
                plat_feature = np.asarray([0]*df_clicks_properties_per_plat.shape[1])
            new_col.append(plat_feature)

        df_plat_feature['properties_array'] = new_col
        global_sum = df_clicks_properties.drop(['item_id','platform'], axis=1)
        global_sum = global_sum.sum().tolist()

        df_plat_feature['global_properties'] = df_plat_feature.apply(lambda x: global_sum, axis=1)
        properties_globally_normalized = []
        for t in tqdm(zip(df_plat_feature.properties_array, df_plat_feature.global_properties)):
            properties_globally_normalized.append(np.asarray([x/y for x,y in zip(t[0],t[1])]))

        df_plat_feature['properties_globally_normalized'] = properties_globally_normalized
        df_plat_feature = df_plat_feature.drop(['properties_array','global_properties'], axis=1)

        # now take the usual clickout dataframe
        last_clickout_indices = find(df)
        clickout_rows = df.loc[last_clickout_indices, ['user_id','session_id','platform','action_type','impressions']][df.action_type == 'clickout item']
        clk_expanded = expand_impressions(clickout_rows)

        clk_expanded = clk_expanded.drop(['index','action_type'], axis=1)
        clk_expanded_wt_plat_feat = pd.merge(clk_expanded, df_plat_feature, how='left', on=['platform']).astype(object)
        clk_expanded_wt_plat_feat.item_id = clk_expanded_wt_plat_feat.item_id.astype(int)

        final_feature = pd.merge(clk_expanded_wt_plat_feat, df_item_features, how='left', on=['item_id'])
        new_col = []
        shrink = 0  # TRY ME: shrinkage term for the similarity, worth tuning
        for t in tqdm(zip(final_feature.properties_globally_normalized, final_feature.features_array)):
            new_col.append(cosine_similarity(t[0].astype(np.double), t[1].astype(np.double),shrink))

        new_feature = final_feature[['user_id','session_id','item_id']].copy()
        new_feature['platform_similarity_normalized'] = new_col

        return new_feature
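
The cosine_similarity helper used above is defined elsewhere in the
repository; a plausible implementation, assuming shrink acts as an additive
damping term in the denominator (shrink=0 gives the plain cosine):

import numpy as np

def cosine_similarity(a, b, shrink):
    # assumed behaviour: cosine of two vectors, damped by an additive shrink
    # term so that similarities backed by little evidence move towards 0
    num = np.dot(a, b)
    den = np.linalg.norm(a) * np.linalg.norm(b) + shrink
    return num / den if den != 0 else 0.0

print(cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 1.0]), 0))  # ~0.707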