def extract_feature(self):
        """Multi-hot encode the impressions-list length of each session's last clickout.

        Returns a dataframe indexed by the original row index with columns:
        user_id | session_id | impr_c0 ... impr_c24, where impr_c{i} is 1
        iff the clickout showed more than i impressions.
        """
        tqdm.pandas()

        df = data.full_df()
        # sort sessions chronologically; reset_index preserves the original
        # row index in an 'index' column for the final set_index
        df = df.sort_values(['user_id', 'session_id', 'timestamp',
                             'step']).reset_index()

        # find the last clickout rows
        last_clickout_idxs = find_last_clickout_indices(df)
        clickout_rows = df.loc[
            last_clickout_idxs,
            ['user_id', 'session_id', 'impressions', 'index']]
        # impressions is a '|'-separated string of item ids: count the items
        clickout_rows[
            'impressions_count'] = clickout_rows.impressions.str.split(
                '|').str.len()
        clickout_rows = clickout_rows.drop('impressions', axis=1)

        # multi-hot the counts: row i gets ones in its first c columns
        # (25 columns: impressions lists hold at most 25 items)
        one_hot_counts = np.zeros((clickout_rows.shape[0], 25), dtype=np.int8)
        for i, c in tqdm(enumerate(clickout_rows.impressions_count.values)):
            one_hot_counts[i, 0:c] = 1

        # add to the clickouts
        for i in range(25):
            clickout_rows['impr_c{}'.format(i)] = one_hot_counts[:, i]

        return clickout_rows.drop('impressions_count',
                                  axis=1).set_index('index')
        def get_label(df):
            """ Return a dataframe with: index | user_id | session_id | label

            The label is the 0-based position of the clicked reference inside
            the impressions list of the session's last clickout.
            NOTE(review): rows whose reference does not appear among the
            impressions silently keep the default label 0 (same behavior as
            the original implementation) — confirm this is intended upstream.
            """
            # find the last clickout rows
            idxs = find_last_clickout_indices(df)

            res_df = df[['user_id', 'session_id', 'reference',
                         'impressions']].loc[idxs]
            # remove the test sessions with reference NaN
            res_df = res_df.dropna(subset=['reference']).astype(
                {'reference': 'int'})
            # create impressions list
            res_df['impressions_list'] = res_df['impressions'].str.split(
                '|').apply(lambda x: list(map(int, x)))
            res_df.drop('impressions', axis=1, inplace=True)

            # positions fit in int8: impressions lists hold at most 25 items
            label_series = np.zeros(res_df.shape[0], dtype='int8')
            # enumerate instead of a manually incremented counter
            for k, (ref, impress) in enumerate(
                    tqdm(zip(res_df['reference'],
                             res_df['impressions_list']))):
                if ref in impress:
                    label_series[k] = impress.index(ref)
            # add the new column
            res_df['label'] = label_series

            return res_df.drop(['reference', 'impressions_list'], axis=1)
# Example #3
        def retrieve_pd_dataframe_score(df):
            """Score every impression of every last clickout against the session's
            user profile vector (the summed one-hot features of interacted items,
            from create_user_feature_dict).

            Three variants of the user vector are scored (raw, clipped to {0,1},
            thresholded at the mean of its positive entries), each with manhattan
            distance, cosine similarity and an un-normalized dot product
            ('jaccard_no_norm'): 9 score columns in total.

            Returns: user_id | session_id | item_id | 9 score columns.
            """

            icm = data.accomodations_one_hot().sort_index()

            sess_user_dict = create_user_feature_dict(df)
            idxs_clicks = find_last_clickout_indices(df)

            scores = []
            # iterate on the index of the target clicks and create for each iteration a tuple to be appended on the final list
            print('computing the distances...')
            for idx in tqdm(idxs_clicks):

                # retrieve the user sess and impressions of the click
                user = df.at[idx, 'user_id']
                sess = df.at[idx, 'session_id']
                impressions = list(map(int, df.at[idx, 'impressions'].split('|')))

                # retrieve the impression of the user-sess pair if it isn't in the dictionary means
                # that there weren't numeric actions in the session so initialize it with an empty vector
                us_tuple = (user, sess)

                if us_tuple in sess_user_dict:
                    user_feature_vec = sess_user_dict[(user, sess)]
                else:
                    user_feature_vec = np.zeros(icm.shape[1])

                # retrieve the features of the impression selected
                features_imp = icm.loc[impressions].values

                # create the various version of the user vector CLIPPED, TRESHOLDED
                clipped_user_feature_vec = np.clip(user_feature_vec,0,1)

                # zero out the entries below the mean of the positive entries
                tresholded_user_feature_vec = user_feature_vec.copy()
                if np.sum(user_feature_vec) > 0:
                    treshold_limit = np.mean(user_feature_vec[user_feature_vec > 0])
                    tresholded_user_feature_vec[tresholded_user_feature_vec<treshold_limit]=0

                # compute the distance between the two vectors
                _scores_manhattan = manhattan_distances(user_feature_vec.reshape(1, -1), features_imp)
                _scores_cosine = cosine_similarity(user_feature_vec.reshape(1, -1), features_imp)
                _scores_jaccard_no_norm = np.dot(user_feature_vec, features_imp.T)

                _scores_manhattan_clip = manhattan_distances(clipped_user_feature_vec.reshape(1, -1), features_imp)
                _scores_cosine_clip = cosine_similarity(clipped_user_feature_vec.reshape(1, -1), features_imp)
                _scores_jaccard_no_norm_clip = np.dot(clipped_user_feature_vec, features_imp.T)

                _scores_manhattan_tr = manhattan_distances(tresholded_user_feature_vec.reshape(1, -1), features_imp)
                _scores_cosine_tr = cosine_similarity(tresholded_user_feature_vec.reshape(1, -1), features_imp)
                _scores_jaccard_no_norm_tr = np.dot(tresholded_user_feature_vec, features_imp.T)

                # create and append a tuple on the final list
                # (pairwise metrics return a (1, n) matrix, hence the [0][i];
                #  the 1-D dot products are indexed directly with [i])
                for i in range(len(impressions)):
                    scores.append((user, sess, impressions[i],
                                   _scores_cosine[0][i], _scores_manhattan[0][i],_scores_jaccard_no_norm[i],
                                   _scores_cosine_clip[0][i], _scores_manhattan_clip[0][i],_scores_jaccard_no_norm_clip[i],
                                   _scores_cosine_tr[0][i], _scores_manhattan_tr[0][i],_scores_jaccard_no_norm_tr[i]))
            # NOTE: 'manhatthan' misspelling is kept — downstream code may
            # reference these exact column names
            return pd.DataFrame(scores, columns=['user_id', 'session_id', 'item_id',
                                                 'scores_cosine', 'scores_manhatthan', 'scores_jaccard_no_norm',
                                                 'scores_cosine_clip', 'scores_manhatthan_clip', 'scores_jaccard_no_norm_clip',
                                                 'scores_cosine_tr', 'scores_manhatthan_tr', 'scores_jaccard_no_norm_tr'])
# Example #4
        def create_user_feature_dict(df):
            """Build a dict mapping (user_id, session_id) -> profile vector.

            The profile vector is the element-wise sum of the one-hot
            accommodation features of every distinct numeric-reference row of
            the session, excluding the last clickout rows (the targets).
            """
            # drop the last clickouts: those are the rows we predict on
            idxs_clicks = find_last_clickout_indices(df)
            df = df.drop(idxs_clicks)

            # retrieve the icm (item one-hot feature matrix)
            icm = data.accomodations_one_hot()

            # filter on the columns of interest
            temp_df = df[['user_id', 'session_id', 'reference']].dropna()

            # mantain only the rows with numeric reference (item interactions)
            temp_df = temp_df[temp_df['reference'].str.isnumeric()]
            temp_df = temp_df.drop_duplicates()

            # create a dict with ('user_id', 'session_id') tuples as keys and,
            # as value, the array representing the user as a house: the sum of
            # the features of the houses interacted with during the session.
            # Iterate the columns directly with zip instead of materializing
            # three separate to_dict('list') copies of the dataframe.
            user_session_dict = {}
            rows = zip(temp_df['user_id'], temp_df['session_id'],
                       temp_df['reference'])
            for user, session, reference in tqdm(rows, total=len(temp_df)):
                # .values yields a fresh array, so in-place += is safe
                item_features = icm.loc[int(reference)].values
                key = (user, session)
                if key in user_session_dict:
                    user_session_dict[key] += item_features
                else:
                    user_session_dict[key] = item_features

            return user_session_dict
def train_indices(mode='local', cluster='no_cluster'):
    """Return the set of last-clickout indices belonging to the train split:
    all last clickouts of train+test minus the target (prediction) indices."""
    full = pd.concat([data.train_df(mode=mode, cluster=cluster),
                      data.test_df(mode=mode, cluster=cluster)])
    all_last_clickouts = set(find_last_clickout_indices(full))
    target_idxs = set(data.target_indices(mode=mode, cluster=cluster))
    return all_last_clickouts - target_idxs
    def extract_feature(self):
        """Per-session timing features measured at the last clickout.

        Returns: user_id | session_id | elapsed_last_action_click |
        elapsed_last_action_click_log | variance_last_action | std_last_action
        """
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        idxs_click = find_last_clickout_indices(df)
        temp = df[['user_id', 'session_id', 'step', 'timestamp']]
        session_id_l = []
        length_step_l = []
        length_timestamp_l = []
        timestamp_last_action_l = []
        final_timestamp_l = []
        user_id_l = []
        for i in tqdm(idxs_click):
            user_id = temp.at[i, 'user_id']
            session_id = temp.at[i, 'session_id']
            step = temp.at[i, 'step']
            f_timestamp = temp.at[i, 'timestamp']
            # first timestamp of the session: step is 1-based, so the session
            # start is (step - 1) rows back.
            # NOTE(review): assumes each session occupies consecutive index
            # labels — confirm against how the dataframes are built
            i_timestamp = temp.at[i - (step - 1), 'timestamp']
            if step > 1:
                timestamp_last_action = temp.at[i - 1, 'timestamp']
            else:
                # session opens with the clickout: no previous action exists
                timestamp_last_action = f_timestamp

            user_id_l.append(user_id)
            session_id_l.append(session_id)
            length_step_l.append(int(step))
            length_timestamp_l.append(int(f_timestamp - i_timestamp))
            timestamp_last_action_l.append(int(timestamp_last_action))
            final_timestamp_l.append(int(f_timestamp))
        final_df = pd.DataFrame({
            'user_id': user_id_l,
            'session_id': session_id_l,
            'length_step': length_step_l,
            'length_timestamp': length_timestamp_l,
            'timestamp_last_action': timestamp_last_action_l,
            'final_timestamp': final_timestamp_l
        })
        # average seconds per action over the whole session
        final_df['mean_time_action'] = final_df['length_timestamp'] / final_df[
            'length_step']

        # seconds between the last non-clickout action and the clickout
        final_df['elapsed_last_action_click'] = final_df[
            'final_timestamp'] - final_df['timestamp_last_action']

        # +1 avoids log(0) when the clickout is the first action
        final_df['elapsed_last_action_click_log'] = np.log(
            final_df['elapsed_last_action_click'] + 1)

        # squared / absolute deviation of the final gap from the session mean
        final_df['variance_last_action'] = (
            final_df['elapsed_last_action_click'] -
            final_df['mean_time_action'])**2

        final_df['std_last_action'] = abs(
            final_df['elapsed_last_action_click'] -
            final_df['mean_time_action'])

        # drop the intermediate columns, keeping only the derived features
        final_df.drop(['timestamp_last_action', 'final_timestamp', 'mean_time_action', \
                       'length_step', 'length_timestamp', 'elapsed_last_action_click'], axis=1, inplace=True)
        return final_df
# Example #7
    def extract_feature(self):
        """For every impression of every last clickout, features of the user's
        interactions with that impression earlier in the same session:
        number of interactions, step/timestamp distance from the last
        interaction to the clickout, and the last action type.

        Impressions never interacted with get the sentinel values
        (0, -1, -1, 'None'). Only the timestamp feature is kept in the output.
        """

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        # fillna lets str.isnumeric run on rows with NaN reference
        temp = df.fillna('0')
        idxs_click = sorted(find_last_clickout_indices(temp))
        idxs_numeric_reference = temp[temp['reference'].str.isnumeric() == True].index

        count = 0
        last_click = idxs_click[0]

        # impr_features: per-item state accumulated within the current session
        # impr_feature: one dict per expanded (clickout, impression) row
        impr_features = {}
        impr_feature = []
        # single ordered sweep: interaction rows update state, clickout rows
        # flush it (NOTE(review): assumes sorted index order == session order)
        for i in tqdm(sorted(idxs_numeric_reference)):
            if i == last_click:
                impressions = list(map(int, temp.at[i, 'impressions'].split('|')))
                click_timestamp = temp.at[i, 'timestamp']
                click_step = temp.at[i, 'step']
                for impr in impressions:
                    if impr not in impr_features:
                        # never interacted with: sentinel feature values
                        impr_feature.append({'num_interactions_impr': 0, 'step_from_last_interaction': -1,
                                             'timestamp_from_last_interaction': -1,
                                             'last_action_type_with_impr': 'None'})
                    else:
                        # convert absolute step/timestamp into distance from the click
                        impr_features[impr]['timestamp_from_last_interaction'] = click_timestamp - impr_features[impr][
                            'timestamp_from_last_interaction']
                        impr_features[impr]['step_from_last_interaction'] = click_step - impr_features[impr][
                            'step_from_last_interaction']
                        impr_feature.append(impr_features[impr])
                # reset state for the next session and advance the click pointer
                impr_features = {}
                count += 1
                if count < len(idxs_click):
                    last_click = idxs_click[count]
                continue
            # interaction row: record/refresh the state for this item
            ref = int(temp.at[i, 'reference'])
            if ref in impr_features:
                impr_features[ref]['num_interactions_impr'] += 1
                impr_features[ref]['step_from_last_interaction'] = df.at[i, 'step']
                impr_features[ref]['timestamp_from_last_interaction'] = df.at[i, 'timestamp']
                impr_features[ref]['last_action_type_with_impr'] = df.at[i, 'action_type']
            else:
                impr_features[ref] = {'num_interactions_impr': 1, 'step_from_last_interaction': df.at[i, 'step'],
                                      'timestamp_from_last_interaction': df.at[i, 'timestamp'],
                                      'last_action_type_with_impr': df.at[i, 'action_type']}

        # one expanded row per (clickout, impression); the dicts in
        # impr_feature were appended in the same order
        final_df = expand_impressions(temp[['user_id', 'session_id', 'impressions']].loc[idxs_click])
        print(len(final_df))
        print(len(impr_feature))
        final_df['dict'] = impr_feature

        # unpack each dict into its own column
        features_df = pd.DataFrame(final_df.progress_apply(lambda x: tuple(x['dict'].values()), axis=1).tolist(),
                                   columns=list(final_df.iloc[0].dict.keys()))
        final_df_ = pd.concat([final_df, features_df], axis=1).drop('dict', axis=1)
        # keep only the timestamp-distance feature
        final_df_ = final_df_.drop(['num_interactions_impr', 'last_action_type_with_impr'], axis=1)
        return final_df_
    def extract_feature(self):
        """For each impression of every last clickout, its rank when the
        impressions are sorted by price ('impression_pos_price'), expanded to
        one row per (session, item).

        Returns: user_id | session_id | item_id | price | impression_pos_price
        """

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        idxs_click = find_last_clickout_indices(df)
        df = df.loc[idxs_click][[
            'user_id', 'session_id', 'impressions', 'prices'
        ]]

        impression_price_position_list = []
        fraction_pos_price_list = []
        for i in tqdm(df.index):
            impr = list(map(int, df.at[i, 'impressions'].split('|')))
            prices = list(map(int, df.at[i, 'prices'].split('|')))

            # 1-based display position of each impression
            impression_position = np.arange(len(impr)) + 1

            # sort the (price, position, item) triples by ascending price
            couples = zip(prices, impression_position, impr)
            couples = sorted(couples, key=lambda a: a[0])

            prices_ordered, position, impressions_ordered = zip(*couples)

            # invert the permutation: price_pos[k] = price rank of the
            # impression displayed at position k+1
            _, price_pos = list(
                zip(*sorted(list(zip(position, impression_position)),
                            key=lambda a: a[0])))
            # ratio of display position to price rank (computed per clickout
            # but not exported as a column below)
            fraction_pos_price = list(impression_position / price_pos)

            fraction_pos_price_list.append(np.array(fraction_pos_price))
            impression_price_position_list.append(np.array(price_pos))
        df['impression_pos_price'] = impression_price_position_list

        df['impressions'] = df['impressions'].str.split('|')
        df['prices'] = df['prices'].str.split('|')

        # explode the per-clickout lists into one row per impression:
        # repeat the scalar columns, then concatenate the list columns
        final_df = pd.DataFrame({
            col: np.repeat(df[col], df['impressions'].str.len())
            for col in df.columns.drop(['impressions', 'prices'])
        }).assign(
            **{
                'item_id':
                np.concatenate(df['impressions'].values),
                'price':
                np.concatenate(df['prices'].values),
                'impression_pos_price':
                np.concatenate(df['impression_pos_price'].values)
            })

        # item_id/price come from split strings: cast back to numbers
        final_df['item_id'] = pd.to_numeric(final_df['item_id'])
        final_df['impression_pos_price'] = pd.to_numeric(
            final_df['impression_pos_price'])
        final_df['price'] = pd.to_numeric(final_df['price'])

        return final_df
def get_class_to_sessions_dict(train):
    """Map each impression position (0..24) to the list of session ids whose
    last clickout clicked the item shown at that position.

    Positions with no sessions map to an empty list; clickouts whose
    reference is not among the impressions are skipped.
    """
    last_clicks = find_last_clickout_indices(train)
    clickouts = train.loc[last_clicks]
    class_to_sessions = {position: [] for position in range(25)}
    rows = zip(clickouts.session_id, clickouts.reference, clickouts.impressions)
    for session, ref, impressions_str in tqdm(rows):
        impression_ids = [int(item) for item in impressions_str.split("|")]
        reference = int(ref)
        if reference in impression_ids:
            clicked_position = impression_ids.index(reference)
            class_to_sessions[clicked_position].append(session)
    return class_to_sessions
# Example #10
 def extract_feature(self):
     """Return, for every session's last clickout, the device it came from.

     Output columns: user_id | session_id | session_device
     """
     train = data.train_df(mode=self.mode, cluster=self.cluster)
     test = data.test_df(mode=self.mode, cluster=self.cluster)
     full = pd.concat([train, test])
     # one record per last clickout
     records = [(full.at[idx, 'user_id'],
                 full.at[idx, 'session_id'],
                 full.at[idx, 'device'])
                for idx in find_last_clickout_indices(full)]
     return pd.DataFrame(
         records, columns=['user_id', 'session_id', 'session_device'])
# Example #11
def merge_features_tf_cv(mode, cluster, features_array):
    """Expand the last clickouts of train+test into one row per impression and
    join every feature in features_array onto them.

    features_array items are either feature classes or (class, one_hot_flag)
    tuples. Returns (click_df, context_features_id) where context_features_id
    collects the column ids of session-level (non item_id) features.
    """
    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df

    # retrieve the indeces of the last clikcouts
    print('find_last_click_idxs')
    last_click_idxs = find_last_clickout_indices(full_df)

    # filter on the found indeces obtaining only the rows of a last clickout
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()

    # expand the impression as rows
    print('expand the impression')
    click_df = expand_impressions(click_df)[['user_id', 'session_id', 'item_id', 'index']]
    # dummy_step preserves the original row order through the merges below
    click_df['dummy_step'] = np.arange(len(click_df))

    # do the join
    print('join with the features')
    print(f'train_shape: {click_df.shape}\n')
    context_features_id = []
    for f in features_array:
        if type(f) == tuple:
            # (feature_class, one_hot_flag) tuple
            feature = f[0](mode=mode, cluster='no_cluster').read_feature(one_hot=f[1])
        else:
            feature = f(mode=mode, cluster='no_cluster').read_feature(one_hot=True)
        print(f'columns of the feature:\n {feature.columns}')
        print(f'NaN values are: {feature.isnull().values.sum()}')
        # if there are none fill it with -1
        feature.fillna(0, inplace=True)
        # check if it is a feature of the impression
        if 'item_id' not in feature.columns:
            # session-level feature: record the numeric ids of the columns it
            # will occupy after the merge.
            # NOTE(review): the offset arithmetic (the magic 6) encodes the
            # number of non-feature columns assumed downstream — verify against
            # the consumer of context_features_id before touching it
            for i in range(click_df.shape[1] - 6 + 1, click_df.shape[1] - 6 + 1 + feature.shape[1] - 2, 1):
                context_features_id.append(str(i))
        print(f'session features names:{context_features_id}')
        print(f'shape of feature: {feature.shape}')
        print(f'len of feature:{len(feature)}')
        # merge on the shared key columns (pandas default: common columns)
        click_df = click_df.merge(feature)
        print(f'train_shape: {click_df.shape}\n ')

    print('sorting by index and step...')
    # sort the dataframes
    click_df.sort_values(['index', 'dummy_step'], inplace=True)
    click_df.drop('dummy_step', axis=1, inplace=True)

    print('after join')
    return click_df, np.array(context_features_id)
# Example #12
    def extract_feature(self):
        """Binary label per (session, impression) row: 1 iff the impression is
        the clicked reference of the session's last clickout."""
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        if self.mode in ['small', 'local']:
            print('reinserting clickout')
            test = test.groupby(['session_id',
                                 'user_id']).progress_apply(_reinsert_clickout)
        full = pd.concat([train, test])
        last_clicks = find_last_clickout_indices(full)
        clickouts = full.loc[last_clicks][[
            'user_id', 'session_id', 'reference', 'impressions'
        ]]
        expanded = expand_impressions(clickouts)
        # boolean match (item_id vs float-cast reference) turned into 0/1
        expanded['label'] = (expanded['item_id'] ==
                             expanded['reference'].astype('float')) * 1
        expanded.drop(['index', 'reference'], axis=1, inplace=True)

        print(expanded)
        return expanded
    def extract_feature(self):
        """Per-session binary label: 1 iff the last clickout's reference is the
        FIRST item of its impressions list, 0 otherwise, NaN when the
        reference itself is NaN (test rows).

        Returns: user_id | session_id | label
        """
        df = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        if self.mode in ['small', 'local']:
            print('reinserting clickout')
            test = test.groupby(['session_id',
                                 'user_id']).progress_apply(_reinsert_clickout)
        df = pd.concat([df, test])
        idxs_click = find_last_clickout_indices(df)

        #& (df.reference.notnull())]
        #df = df.drop_duplicates("session_id", keep="last")
        #df = df[(df.reference.notnull()) & (df.index.isin(idxs_click))]
        labels = list()
        df = df[df.index.isin(idxs_click)]
        for t in tqdm(zip(df.reference, df.impressions)):
            # NaN reference is a float (np.nan); valid references are strings
            if type(t[0]) != float:
                reference = int(t[0])
                impressions = list(map(int, t[1].split("|")))
                if reference in impressions and impressions.index(
                        reference) == 0:
                    labels.append(1)
                else:
                    labels.append(0)
            else:
                # test session without a reference: label unknown
                labels.append(np.nan)

        df = df[["user_id", "session_id"]]
        df["label"] = labels

        #add label for prediction on full_df
        # if self.mode == "full":
        #     test = data.test_df(mode=self.mode, cluster=self.cluster)
        #     print("Adding full test rows")
        #     test = test[test.index.isin(idxs_click)]
        #     test = test[["user_id", "session_id"]]
        #     df = pd.concat([df, test], sort=False)
        print(len(df))
        return df
# Example #14
    def extract_feature(self):
        """1-based display position of each impression inside its clickout list.

        Returns: user_id | session_id | item_id | impression_position
        """
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        idxs_click = find_last_clickout_indices(df)
        df = df.loc[idxs_click][['user_id', 'session_id', 'impressions']]
        df = expand_impressions(df)
        # expanded rows of one clickout are contiguous: restart the counter
        # every time the session id changes
        positions = []
        previous_session = ''
        position = 1
        for row_idx in tqdm(df.index):
            current_session = df.at[row_idx, 'session_id']
            if current_session != previous_session:
                previous_session = current_session
                position = 1
            positions.append(position)
            position += 1
        df['impression_position'] = positions
        df['impression_position'] = pd.to_numeric(df['impression_position'])
        df.drop('index', axis=1, inplace=True)

        return df
# Example #15
    def extract_feature(self):
        """Per-session price statistics computed at the last clickout.

        A single ordered sweep over the numeric-reference rows accumulates the
        prices of items the user interacted with (prices_interacted, looked up
        from clickout rows or from the global ImpressionsAveragePrice feature)
        and the prices of previous clickouts (prices_clickout_interacted).
        Each last-clickout row flushes one feature dict (min/max/mean/var,
        'rich'/'poor' user class vs the click's mean price, etc.), with -1 /
        'None' sentinels when there were no interactions.
        """

        # item_id -> {'prices_mean': ...} global average-price lookup
        price_dict_df = ImpressionsAveragePrice().read_feature().set_index(
            'item_id')
        price_dict = price_dict_df.to_dict('index')

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        # fillna lets str.isnumeric run on rows with NaN reference
        temp = df.fillna('0')
        idxs_click = sorted(find_last_clickout_indices(temp))
        idxs_numeric_reference = temp[temp['reference'].str.isnumeric() ==
                                      True].index

        count = 0
        last_click = idxs_click[0]

        features = []

        # per-session accumulators, reset after every last clickout
        prices_interacted = []
        impression_interacted = {}  # seen-set: item -> 1
        prices_clickout_interacted = []

        for i in tqdm(sorted(idxs_numeric_reference)):

            if i == last_click:
                # last clickout row: compute the feature dict and reset state

                prices_click = sorted(
                    list(map(int, temp.at[i, 'prices'].split('|'))))

                mean_price_click = np.mean(np.array(prices_click))

                # prices_click is sorted ascending
                max_price_click = prices_click[-1]
                min_price_click = prices_click[0]
                var_prices_click = np.var(np.array(prices_click))

                support_interaction = len(prices_interacted)

                if support_interaction == 0:
                    # no interactions in the session: sentinel values
                    last_price_interacted = -1
                    mean_price_interacted = -1
                    min_price_interacted = -1
                    max_price_interacted = -1
                    user_class = 'None'
                    var_price_interacted = -1
                    distance_max_price_from_mean = -1
                else:
                    # last price BEFORE sorting, then order for min/max
                    last_price_interacted = prices_interacted[-1]
                    prices_interacted = sorted(prices_interacted)
                    mean_price_interacted = np.mean(
                        np.array(prices_interacted))
                    min_price_interacted = prices_interacted[0]
                    max_price_interacted = prices_interacted[-1]
                    user_class = 'poor' if mean_price_interacted < mean_price_click else 'rich'
                    var_price_interacted = np.var(np.array(prices_interacted))
                    distance_max_price_from_mean = max_price_interacted - mean_price_click

                support_interaction_clickout = len(prices_clickout_interacted)

                if support_interaction_clickout == 0:
                    last_price_clickout_interacted = -1
                    prices_clickout_interacted = -1
                    mean_price_clickout_interacted = -1
                    min_price_clickout_interacted = -1
                    max_price_clickout_interacted = -1
                    user_click_class = 'None'
                    var_prices_click_interacted = -1
                    distance_max_price_clickout_from_mean = -1
                else:
                    last_price_clickout_interacted = prices_clickout_interacted[
                        -1]
                    prices_clickout_interacted = sorted(
                        prices_clickout_interacted)
                    mean_price_clickout_interacted = np.mean(
                        np.array(prices_clickout_interacted))
                    min_price_clickout_interacted = prices_clickout_interacted[
                        0]
                    max_price_clickout_interacted = prices_clickout_interacted[
                        -1]
                    user_click_class = 'poor' if mean_price_clickout_interacted < mean_price_click else 'rich'
                    var_prices_click_interacted = np.var(
                        np.array(prices_clickout_interacted))
                    distance_max_price_clickout_from_mean = max_price_clickout_interacted - mean_price_click

                features_dict = {
                    'max_price_click':
                    max_price_click,
                    'min_price_click':
                    min_price_click,
                    'var_prices_click':
                    var_prices_click,
                    'support_interaction':
                    support_interaction,
                    'last_price_interacted':
                    last_price_interacted,
                    'mean_price_interacted':
                    mean_price_interacted,
                    'min_price_interacted':
                    min_price_interacted,
                    'max_price_interacted':
                    max_price_interacted,
                    'user_class':
                    user_class,
                    'var_price_interacted':
                    var_price_interacted,
                    'distance_max_price_from_mean':
                    distance_max_price_from_mean,
                    'support_interaction_clickout':
                    support_interaction_clickout,
                    'last_price_clickout_interacted':
                    last_price_clickout_interacted,
                    'mean_price_clickout_interacted':
                    mean_price_clickout_interacted,
                    'min_price_clickout_interacted':
                    min_price_clickout_interacted,
                    'max_price_clickout_interacted':
                    max_price_clickout_interacted,
                    'user_click_class':
                    user_click_class,
                    'var_prices_click_interacted':
                    var_prices_click_interacted,
                    'distance_max_price_clickout_from_mean':
                    distance_max_price_clickout_from_mean
                }

                features.append(features_dict)

                # reset the accumulators and advance the click pointer
                count += 1
                prices_interacted = []
                impression_interacted = {}
                prices_clickout_interacted = []

                if count < len(idxs_click):
                    last_click = idxs_click[count]
                continue

            # interaction row (numeric reference, not a last clickout)
            ref = int(temp.at[i, 'reference'])

            action_type = temp.at[i, 'action_type']
            if action_type == 'clickout item':
                # a non-final clickout: the shown price of ref is known
                prices = list(map(int, temp.at[i, 'prices'].split('|')))
                impressions = list(
                    map(int, temp.at[i, 'impressions'].split('|')))
                idx = impressions.index(ref)
                prices_clickout_interacted.append(prices[idx])
                if ref not in impression_interacted:
                    impression_interacted[ref] = 1
                    prices_interacted.append(prices[idx])
            else:
                # other interaction: fall back to the item's global mean price
                if ref not in impression_interacted:
                    impression_interacted[ref] = 1
                    if ref in price_dict:
                        prices_interacted.append(
                            price_dict[ref]['prices_mean'])
                    else:
                        # item has no known average price: skip it
                        pass

        # one feature dict per last clickout, in the same sorted order
        final_df = temp[['user_id', 'session_id']].loc[idxs_click]
        final_df['dict'] = features

        # unpack each dict into its own column
        features_df = pd.DataFrame(final_df.progress_apply(
            lambda x: tuple(x['dict'].values()), axis=1).tolist(),
                                   columns=list(final_df.iloc[0].dict.keys()))
        final_df_ = pd.merge(
            final_df.drop('dict', axis=1).reset_index(drop=True).reset_index(),
            features_df.reset_index()).drop('index', axis=1)
        return final_df_
    def extract_feature(self):
        """Per-session statistics on WHERE (at which impression positions) the
        user's interacted items appear in the last clickout's impressions list.

        A single ordered sweep collects (step -> reference) pairs for the
        session; each last clickout converts them into position features.
        Only min_pos, first_pos, the interaction count and the percentage of
        interacted impressions are kept in the returned dataframe.
        """

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        # fillna lets str.isnumeric run on rows with NaN reference
        temp = df.fillna('0')
        idxs_click = sorted(find_last_clickout_indices(temp))
        idxs_numeric_reference = temp[temp['reference'].str.isnumeric() ==
                                      True].index

        count = 0
        last_click = idxs_click[0]

        # step -> interacted item id, for the session of the upcoming click
        sess_features_dict = {}
        sess_feature = []

        for i in tqdm(sorted(idxs_numeric_reference)):
            # user/session that own the upcoming last clickout
            n_user = df.at[last_click, 'user_id']
            n_sess = df.at[last_click, 'session_id']
            if i == last_click:
                impressions = list(
                    map(int, temp.at[i, 'impressions'].split('|')))

                impressions_len = len(impressions)

                num_interacted_impressions = 0
                impression_interacted = set()
                positions_interacted = []

                # interactions in chronological order (sorted by step)
                tuples = sorted(list(sess_features_dict.items()),
                                key=lambda t: t[0])

                # sentinels (26 > any valid 1-based position, max 25)
                min_pos = 26
                max_pos = -1
                first_pos = None
                last_pos = None
                mean_pos_interacted = -1

                for t in tuples:
                    try:
                        # 1-based position of the interacted item
                        index = impressions.index(t[1]) + 1
                        if first_pos is None:
                            first_pos = index
                        last_pos = index
                        if index < min_pos:
                            min_pos = index
                        if index > max_pos:
                            max_pos = index
                        num_interacted_impressions += 1
                        impression_interacted.add(t[1])
                        positions_interacted.append(index)
                    except ValueError:
                        # interacted item not among the impressions: ignore
                        pass
                if impressions_len > 0:
                    percentage_interacted_impression = len(
                        impression_interacted) / impressions_len
                else:
                    percentage_interacted_impression = 1

                if num_interacted_impressions == 0:
                    # no interacted item was shown: sentinel positions
                    min_pos = -1
                    max_pos = -1
                    first_pos = -1
                    last_pos = -1

                if num_interacted_impressions > 0:
                    mean_pos_interacted = sum(positions_interacted) / len(
                        positions_interacted)

                f_d = {
                    'mean_pos_interacted':
                    mean_pos_interacted,
                    'min_pos_interacted':
                    min_pos,
                    'max_pos_interacted':
                    max_pos,
                    'first_pos_interacted':
                    first_pos,
                    'last_pos_interacted':
                    last_pos,
                    'num_interacted_impressions':
                    num_interacted_impressions,
                    'percentage_interacted_impressions':
                    percentage_interacted_impression
                }
                sess_feature.append(f_d)
                # reset state and advance to the next last clickout
                sess_features_dict = {}
                count += 1
                if count < len(idxs_click):
                    last_click = idxs_click[count]
                continue

            # interaction row: record it only if it belongs to the same
            # user/session as the upcoming last clickout
            if (temp.at[i, 'user_id'] == n_user) and (temp.at[i, 'session_id']
                                                      == n_sess):
                ref = int(temp.at[i, 'reference'])
                step_interaction = temp.at[i, 'step']
                sess_features_dict[step_interaction] = ref

        # one feature dict per last clickout, same sorted order
        final_df = temp[['user_id', 'session_id']].loc[idxs_click]
        final_df['dict'] = sess_feature

        # unpack each dict into its own column
        features_df = pd.DataFrame(final_df.progress_apply(
            lambda x: tuple(x['dict'].values()), axis=1).tolist(),
                                   columns=list(final_df.iloc[0].dict.keys()))
        final_df_ = pd.merge(
            final_df.drop('dict', axis=1).reset_index(drop=True).reset_index(),
            features_df.reset_index()).drop('index', axis=1)
        # drop the redundant/unused position statistics
        return final_df_.drop([
            'mean_pos_interacted', 'max_pos_interacted', 'last_pos_interacted'
        ],
                              axis=1)
# Example #17
def merge_features_tf(mode, cluster, features_array, stacking_scores_path):
    """Build the train and validation/test dataframes for the TF ranking model.

    Keeps only the last-clickout row of every session, expands the
    impressions into one row per item, joins every feature of
    ``features_array`` and, optionally, per-item stacking scores.

    Args:
        mode: dataset mode ('full', 'local', 'small').
        cluster: cluster name used to load the train/test dataframes.
        features_array: list of feature classes, or (class, one_hot) tuples
            carrying an explicit one-hot flag.
        stacking_scores_path: list of csv paths with per-item scores to merge.

    Returns:
        (train_df, validation_test_df, context_features_id) where
        context_features_id is a numpy array of string column ids of the
        session-level (non item) features.
    """
    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df

    # retrieve the indices of the last clickouts
    print('find_last_click_idxs')
    last_click_idxs = find_last_clickout_indices(full_df)

    # filter on the found indices obtaining only the rows of a last clickout
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()

    print('retrieve vali_idxs')
    # if the mode is full we don't have the validation; if the mode is small
    # or local the validation is performed on the target indices
    vali_test_idxs = data.target_indices(mode, cluster)

    # construct the validation train and test df_base
    print('construct test and vali df')
    validation_test_df = click_df.loc[vali_test_idxs]

    all_idxs = click_df.index.values

    # the train rows are every last clickout that is not a target
    print('construct train df')
    train_idxs = np.setdiff1d(all_idxs, vali_test_idxs, assume_unique=True)
    train_df = click_df.loc[train_idxs]

    # expand the impressions as rows; dummy_step preserves the original
    # impression order across the merges below
    print('expand the impression')
    train_df = expand_impressions(train_df)[['user_id', 'session_id', 'item_id', 'index']]
    train_df['dummy_step'] = np.arange(len(train_df))
    validation_test_df = expand_impressions(validation_test_df)[['user_id', 'session_id', 'item_id', 'index']]
    validation_test_df['dummy_step'] = np.arange(len(validation_test_df))

    # join each feature
    print('join with the features')
    print(f'train_shape: {train_df.shape}\n vali_test_shape: {validation_test_df.shape}')
    context_features_id = []
    for f in features_array:
        # a tuple carries an explicit one_hot flag, otherwise default to True
        if type(f) == tuple:
            feature = f[0](mode=mode, cluster='no_cluster').read_feature(one_hot=f[1])
        else:
            feature = f(mode=mode, cluster='no_cluster').read_feature(one_hot=True)
        print(f'columns of the feature:\n {feature.columns}')
        print(f'NaN values are: {feature.isnull().values.sum()}')
        # if there are NaN values fill them with -1
        feature.fillna(-1, inplace=True)
        # a feature without 'item_id' is a session (context) feature: record
        # the column ids its columns will occupy after the merge
        if 'item_id' not in feature.columns:
            for i in range(train_df.shape[1]-6+1, train_df.shape[1]-6+1+feature.shape[1]-2, 1):
                context_features_id.append(str(i))
        print(f'session features names:{context_features_id}')
        print(f'shape of feature: {feature.shape}')
        print(f'len of feature:{len(feature)}')
        train_df = train_df.merge(feature)
        validation_test_df = validation_test_df.merge(feature)
        print(f'train_shape: {train_df.shape}\n vali_shape: {validation_test_df.shape}')

    # merge the stacking scores, if any. FIX: the previous guard `> 1`
    # silently skipped the merge when exactly one score file was passed
    if len(stacking_scores_path) > 0:
        for path in stacking_scores_path:
            score = pd.read_csv(path)
            cols = [c for c in score.columns if c in ['user_id', 'session_id', 'item_id'] or 'score' in c]
            score = score[cols]
            # keep a single score per (user, session, item)
            score = score.groupby(['user_id', 'session_id', 'item_id'], as_index=False).last()
            train_df = train_df.merge(score, on=['user_id', 'session_id', 'item_id'], how='left')
            validation_test_df = validation_test_df.merge(score, on=['user_id', 'session_id', 'item_id'], how='left')
            print(f'train_shape: {train_df.shape}\n vali_shape: {validation_test_df.shape}')

    train_df.fillna(0, inplace=True)
    validation_test_df.fillna(0, inplace=True)

    print('sorting by index and step...')
    # restore the original clickout/impression order
    train_df.sort_values(['index', 'dummy_step'], inplace=True)
    train_df.drop('dummy_step', axis=1, inplace=True)
    validation_test_df.sort_values(['index', 'dummy_step'], inplace=True)
    validation_test_df.drop('dummy_step', axis=1, inplace=True)

    print('after join')
    return train_df, validation_test_df, np.array(context_features_id)
    def extract_feature(self):
        """For every interaction with a numeric reference (excluding last
        clickouts), compute 'price_pos': the rank of the interacted item's
        price inside the ascending sorted price list of the following
        clickout of the same session.

        Returns:
            DataFrame indexed by the original row index with a single
            column 'price_pos' (-1 when the reference is not among the
            impressions of the next clickout, or no clickout follows).
        """
        tqdm.pandas()

        df = data.full_df()
        # reset index to correct access
        df = df.sort_values(['user_id', 'session_id', 'timestamp',
                             'step']).reset_index()

        # find the last clickout rows
        last_clickout_idxs = find_last_clickout_indices(df)
        clickout_rows = df.loc[
            last_clickout_idxs,
            ['user_id', 'session_id', 'action_type', 'impressions', 'prices']]
        # cast the impressions and the prices to lists of ints
        clickout_rows['impression_list'] = clickout_rows.impressions.str.split(
            '|').apply(lambda x: list(map(int, x)))
        clickout_rows['price_list'] = clickout_rows.prices.str.split(
            '|').apply(lambda x: list(map(int, x)))
        clickout_rows = clickout_rows.drop('impressions', axis=1)
        # order the prices lists (ascending)
        clickout_rows['sorted_price_list'] = clickout_rows.price_list.apply(
            lambda x: sorted(x))
        clickout_rows = clickout_rows.drop('prices', axis=1)

        # find the interactions with numeric reference and not last clickouts
        reference_rows = df[[
            'user_id', 'session_id', 'reference', 'action_type', 'index'
        ]]
        reference_rows = reference_rows[df.reference.str.isnumeric() ==
                                        True].astype({'reference': 'int'})
        # skip last clickouts
        reference_rows = reference_rows.loc[~reference_rows.index.
                                            isin(last_clickout_idxs)]
        reference_rows = reference_rows.drop('action_type', axis=1)
        # default -1: no position found
        ref_pos_series = np.ones(reference_rows.shape[0], dtype=int) * (-1)

        # iterate over the sorted reference_rows and clickout_rows
        j = 0
        clickout_indices = clickout_rows.index.values
        ckidx = clickout_indices[j]
        next_clickout_user_id = clickout_rows.at[ckidx, 'user_id']
        next_clickout_sess_id = clickout_rows.at[ckidx, 'session_id']
        # FIX: also initialize the lists of the FIRST clickout. They were
        # previously assigned only inside the advancing while-loop below, so
        # interactions preceding the first clickout hit an (and silently
        # swallowed) NameError and always got -1.
        next_clickout_impress = clickout_rows.at[ckidx, 'impression_list']
        next_clickout_prices = clickout_rows.at[ckidx, 'price_list']
        next_clickout_sortedprices = clickout_rows.at[ckidx,
                                                      'sorted_price_list']
        k = 0
        for row in tqdm(
                zip(reference_rows.index, reference_rows.user_id,
                    reference_rows.session_id, reference_rows.reference)):
            idx = row[0]
            # if the current index is over the last clickout, break
            if idx >= clickout_indices[-1]:
                break
            # advance to the next clickout index
            while idx > clickout_indices[j]:
                j += 1
                ckidx = clickout_indices[j]
                next_clickout_user_id = clickout_rows.at[ckidx, 'user_id']
                next_clickout_sess_id = clickout_rows.at[ckidx, 'session_id']
                next_clickout_impress = clickout_rows.at[ckidx,
                                                         'impression_list']
                next_clickout_prices = clickout_rows.at[ckidx, 'price_list']
                next_clickout_sortedprices = clickout_rows.at[
                    ckidx, 'sorted_price_list']

            # check if row and next_clickout are in the same session
            if row[1] == next_clickout_user_id and row[
                    2] == next_clickout_sess_id:
                try:
                    ref_idx = next_clickout_impress.index(row[3])
                    ref_price = int(next_clickout_prices[ref_idx])
                    ref_pos_series[k] = next_clickout_sortedprices.index(
                        ref_price)
                except ValueError:
                    # reference not among the impressions: keep the -1
                    pass
            k += 1

        reference_rows['price_pos'] = ref_pos_series
        return reference_rows.drop(['user_id', 'session_id', 'reference'],
                                   axis=1).set_index('index')
# Example #19
    def extract_feature(self):
        """For every last clickout, scan the session backwards and compute
        the mean price of the items the user interacted with and the mean
        position of those prices inside the ascending sorted price list of
        the clickout where the item appears (1-based).

        Returns:
            DataFrame: user_id | session_id | mean_cheap_pos_interacted |
            mean_price_interacted, one row per last clickout
            (-1 when no interacted item was found in the session).
        """
        train = data.train_df(self.mode, cluster=self.cluster)
        test = data.test_df(self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        idxs_click = sorted(find_last_clickout_indices(df))

        # for every last clickout index, retrieve the list
        # of all the clickouts for that session
        list_impres = []
        list_prices_impres_wise = []
        list_prices_orderd_wise = []
        for i in tqdm(idxs_click):
            a_user = df.at[i, 'user_id']
            a_sess = df.at[i, 'session_id']
            # one element per clickout of the session, most recent first
            impres = [list(map(int, df.at[i, 'impressions'].split('|')))]
            prices = list(map(int, df.at[i, 'prices'].split('|')))
            prices_impres_wise = [prices]
            prices_orderd_wise = [sorted(prices)]
            j = i - 1
            # walk backwards while still inside the same session
            while j >= 0:
                try:
                    n_user = df.at[j, 'user_id']
                    n_sess = df.at[j, 'session_id']
                    if a_sess == n_sess and a_user == n_user:
                        if df.at[j, 'action_type'] == 'clickout item':
                            impres.append(
                                list(
                                    map(int, df.at[j,
                                                   'impressions'].split('|'))))
                            prices = list(
                                map(int, df.at[j, 'prices'].split('|')))
                            prices_impres_wise.append(prices)
                            prices_orderd_wise.append(sorted(prices))
                    else:
                        break
                    j -= 1
                except:
                    # NOTE(review): bare except — presumably skips rows whose
                    # fields are missing/NaN, but it also hides real errors
                    j -= 1
            list_impres.append(impres)
            list_prices_impres_wise.append(prices_impres_wise)
            list_prices_orderd_wise.append(prices_orderd_wise)

        # then build the feature: walk each session backwards again and,
        # for every numeric-reference interaction, look up the item in the
        # session's clickouts collected above
        list_mean_prices_interacted = []
        list_mean_pos_interacted = []
        count = 0
        for i in tqdm(idxs_click):
            prices_interacted = []
            pos_interacted = []
            a_user = df.at[i, 'user_id']
            a_sess = df.at[i, 'session_id']
            impres = list_impres[count]
            prices_impres_wise = list_prices_impres_wise[count]
            prices_orderd_wise = list_prices_orderd_wise[count]
            j = i - 1
            while j >= 0:
                try:
                    n_user = df.at[j, 'user_id']
                    n_sess = df.at[j, 'session_id']
                    if a_sess == n_sess and a_user == n_user:
                        n_ref = df.at[j, 'reference']
                        if n_ref.isdigit():
                            n_ref = int(n_ref)
                            count_clickouts = 0
                            # search the clickout whose impressions contain
                            # the reference. NOTE(review): if no clickout
                            # contains it, impres[count_clickouts] raises
                            # IndexError, which the outer bare except
                            # swallows (moving on to the previous row)
                            while True:
                                elem_impres = impres[count_clickouts]
                                elem_prices_impres_wise = prices_impres_wise[
                                    count_clickouts]
                                elem_prices_orderd_wise = prices_orderd_wise[
                                    count_clickouts]
                                if n_ref in elem_impres:
                                    price_reference = elem_prices_impres_wise[
                                        elem_impres.index(n_ref)]
                                    prices_interacted.append(price_reference)
                                    # 1-based position in the sorted prices
                                    pos_interacted.append(
                                        elem_prices_orderd_wise.index(
                                            price_reference) + 1)
                                    break
                                else:
                                    count_clickouts += 1
                        j -= 1

                    else:
                        break
                except:
                    # NOTE(review): bare except doubles as control flow for
                    # the IndexError noted above — do not "fix" casually
                    j -= 1

            # mean price of the interacted items, -1 when none
            if len(prices_interacted) > 0:
                list_mean_prices_interacted.append(
                    sum(prices_interacted) / len(prices_interacted))
            else:
                list_mean_prices_interacted.append(-1)

            # mean sorted-price position of the interacted items, -1 when none
            if len(pos_interacted) > 0:
                list_mean_pos_interacted.append(
                    sum(pos_interacted) / len(pos_interacted))
            else:
                list_mean_pos_interacted.append(-1)

            count += 1

        final_df = df[['user_id', 'session_id']].loc[idxs_click]
        final_df['mean_cheap_pos_interacted'] = list_mean_pos_interacted
        final_df['mean_price_interacted'] = list_mean_prices_interacted
        return final_df.reset_index(drop=True)
def merge_features_lgb(mode, cluster, features_array):
    """Assemble the train and validation/test dataframes for LightGBM.

    Keeps only the last-clickout row of every session, expands the
    impressions into one row per item and joins every feature of
    ``features_array`` (read with one_hot=False).

    Args:
        mode: dataset mode ('full', 'local', 'small').
        cluster: cluster name used to load the train/test dataframes.
        features_array: list of feature classes to instantiate and join.

    Returns:
        (train_df, validation_test_df), both sorted by clickout index and
        original impression position.
    """
    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df

    # indices of the last clickout of every session
    print('find_last_click_idxs')
    last_click_idxs = find_last_clickout_indices(full_df)

    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()

    print('retrieve vali_idxs')
    # the validation/test rows are the target indices
    # (no validation when mode is 'full')
    vali_test_idxs = data.target_indices(mode, cluster)

    print('construct test and vali df')
    validation_test_df = click_df.loc[vali_test_idxs]

    # the train rows are every last clickout that is not a target
    print('construct train df')
    train_idxs = np.setdiff1d(click_df.index.values, vali_test_idxs,
                              assume_unique=True)
    train_df = click_df.loc[train_idxs]

    # one row per impression; dummy_step remembers the original order
    print('expand the impression')
    base_cols = ['user_id', 'session_id', 'item_id', 'index']
    train_df = expand_impressions(train_df)[base_cols]
    train_df['dummy_step'] = np.arange(len(train_df))
    validation_test_df = expand_impressions(validation_test_df)[base_cols]
    validation_test_df['dummy_step'] = np.arange(len(validation_test_df))

    # join each feature, timing the merges
    print('join with the features')
    print(
        f'train_shape: {train_df.shape}\n vali_test_shape: {validation_test_df.shape}'
    )
    time_joins = 0
    for f in features_array:
        feature = f(mode=mode, cluster='no_cluster').read_feature(one_hot=False)

        print(f'shape of feature: {feature.shape}\n')
        print(f'len of feature:{len(feature)}\n')

        start = time()
        train_df = train_df.merge(feature)
        validation_test_df = validation_test_df.merge(feature)
        print(f'time to do the join: {time()-start}')
        time_joins += time() - start
        print(
            f'train_shape: {train_df.shape}\n vali_shape: {validation_test_df.shape}'
        )

    print(f'total time to do joins: {time_joins}')

    print('sorting by index and step...')
    # restore the original clickout/impression order
    for frame in (train_df, validation_test_df):
        frame.sort_values(['index', 'dummy_step'], inplace=True)
        frame.drop('dummy_step', axis=1, inplace=True)

    print('after join')
    return train_df, validation_test_df
def merge_features(mode,
                   cluster,
                   features_array,
                   onehot=True,
                   merge_kind='inner',
                   create_not_existing_features=True,
                   multithread=False):
    """Build the expanded train and validation/test dataframes and join
    the given features, delegating the joins to the single- or
    multi-threaded merger.

    Args:
        mode: dataset mode ('full', 'local', 'small').
        cluster: cluster name used to load the train/test dataframes.
        features_array: features to join.
        onehot: forwarded to the merger (one-hot encode the features).
        merge_kind: join type forwarded to the merger.
        create_not_existing_features: forwarded to the merger.
        multithread: use the multithreaded merger when True.

    Returns:
        (train_df, validation_test_df, train_idxs, vali_test_idxs)
    """
    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df

    # indices of the last clickout of every session, in ascending order
    print('find_last_click_idxs')
    last_click_idxs = sorted(find_last_clickout_indices(full_df))

    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()

    print('retrieve vali_idxs')
    # the validation/test rows are the target indices
    # (no validation when mode is 'full')
    vali_test_idxs = data.target_indices(mode, cluster)

    print('construct test and vali df')
    validation_test_df = click_df.loc[vali_test_idxs]

    # the train rows are every last clickout that is not a target
    print('construct train df')
    train_idxs = np.setdiff1d(click_df.index.values, vali_test_idxs,
                              assume_unique=True)
    train_df = click_df.loc[train_idxs]

    # one row per impression; dummy_step remembers the original order
    print('expand the impression')
    base_cols = ['user_id', 'session_id', 'item_id', 'index']
    train_df = expand_impressions(train_df)[base_cols]
    train_df['dummy_step'] = np.arange(len(train_df))
    validation_test_df = expand_impressions(validation_test_df)[base_cols]
    validation_test_df['dummy_step'] = np.arange(len(validation_test_df))

    # dispatch to the chosen merger implementation
    merge_fn = actual_merge_multithread if multithread else actual_merge_one_thread
    train_df, validation_test_df = merge_fn(train_df, validation_test_df,
                                            features_array, mode, cluster,
                                            create_not_existing_features,
                                            merge_kind, onehot)

    print('sorting by index and step...')
    # restore the original clickout/impression order
    for frame in (train_df, validation_test_df):
        frame.sort_values(['index', 'dummy_step'], inplace=True)
        frame.drop('dummy_step', axis=1, inplace=True)

    print('after join')
    return train_df, validation_test_df, train_idxs, vali_test_idxs