def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # remove the last clickouts and the last part of each session
    new_df = remove_last_part_of_clk_sessions(df)
    new_df = new_df.drop(find(new_df))
    no_last_clks_numeric = new_df[new_df.reference.str.isnumeric() == True][[
        'user_id', 'session_id', 'action_type', 'reference'
    ]]
    # keep it fast and loop-free: drop duplicates, keeping the last
    # occurrence of each user-session-item tuple
    last_actions = no_last_clks_numeric.drop_duplicates(
        ['user_id', 'session_id', 'reference'], keep='last')
    last_actions = last_actions.rename(columns={
        'reference': 'item_id',
        'action_type': 'last_action_involving_impression'
    })
    last_actions.item_id = last_actions.item_id.astype(int)
    # get the last clickouts and expand them on the impression list
    last_clk = df.loc[find(df)]
    clk_expanded = expand_impressions(last_clk)[[
        'user_id', 'session_id', 'item_id'
    ]]
    # merge and fill NaNs with 'no_action', as in the original feature
    feature = pd.merge(clk_expanded, last_actions, how='left',
                       on=['user_id', 'session_id', 'item_id'])
    feature.last_action_involving_impression = \
        feature.last_action_involving_impression.astype(object).fillna('no_action')
    return feature
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'reference',
                            'action_type', 'impressions']]
    # count the clickouts received by each item over the whole dataset
    reference_rows = df[(df.reference.str.isnumeric() == True) &
                        (df.action_type == 'clickout item')]
    df_item_clicks = (
        reference_rows
        .groupby(['reference'])
        .size()
        .reset_index(name='n_interactions_per_item')
    )
    df_item_clicks = df_item_clicks.rename(columns={'reference': 'item_id'})
    df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)
    clk_expanded = expand_impressions(clickout_rows)
    final_feature = pd.merge(clk_expanded, df_item_clicks, how='left',
                             on=['item_id']).fillna(0)
    final_feature.n_interactions_per_item = \
        final_feature.n_interactions_per_item.astype(int)
    final_feature = final_feature.drop(['index'], axis=1)
    final_feature.reference = final_feature.reference.astype(int)
    # leave-one-out: discount the clicked item's own click so the
    # feature does not leak the label
    new_column = []
    for t in zip(final_feature.item_id, final_feature.reference,
                 final_feature.n_interactions_per_item):
        if t[0] == t[1]:
            new_column.append(int(t[2] - 1))
        else:
            new_column.append(int(t[2]))
    final_feature['personalized_popularity'] = new_column
    final_feature_reduced = final_feature[[
        'user_id', 'session_id', 'item_id', 'personalized_popularity'
    ]]
    return final_feature_reduced
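# A minimal, self-contained sketch of the leave-one-out adjustment used in
# extract_feature above, on toy data (the frame and its values are
# illustrative, not the repo's schema): the clicked item's own click is
# subtracted so the popularity feature does not leak the label.
import pandas as pd

toy = pd.DataFrame({
    'item_id':   [1, 2, 1, 3],
    'reference': [1, 1, 2, 3],    # the item actually clicked in each row's session
    'n_clicks':  [10, 7, 10, 4],  # global click counts per item
})
toy['personalized_popularity'] = [
    n - 1 if item == ref else n
    for item, ref, n in zip(toy.item_id, toy.reference, toy.n_clicks)]
print(toy.personalized_popularity.tolist())  # [9, 7, 10, 3]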
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    temp = df.fillna('0')
    idxs_click = sorted(find_last_clickout_indices(temp))
    idxs_numeric_reference = temp[temp['reference'].str.isnumeric() == True].index
    count = 0
    last_click = idxs_click[0]
    impr_features = {}
    impr_feature = []
    for i in tqdm(sorted(idxs_numeric_reference)):
        if i == last_click:
            # last clickout of the session: emit one feature dict per impression
            impressions = list(map(int, temp.at[i, 'impressions'].split('|')))
            click_timestamp = temp.at[i, 'timestamp']
            click_step = temp.at[i, 'step']
            for impr in impressions:
                if impr not in impr_features:
                    impr_feature.append({'num_interactions_impr': 0,
                                         'step_from_last_interaction': -1,
                                         'timestamp_from_last_interaction': -1,
                                         'last_action_type_with_impr': 'None'})
                else:
                    impr_features[impr]['timestamp_from_last_interaction'] = \
                        click_timestamp - impr_features[impr]['timestamp_from_last_interaction']
                    impr_features[impr]['step_from_last_interaction'] = \
                        click_step - impr_features[impr]['step_from_last_interaction']
                    impr_feature.append(impr_features[impr])
            impr_features = {}
            count += 1
            if count < len(idxs_click):
                last_click = idxs_click[count]
            continue
        # accumulate the interactions with each impressed item
        ref = int(temp.at[i, 'reference'])
        if ref in impr_features:
            impr_features[ref]['num_interactions_impr'] += 1
            impr_features[ref]['step_from_last_interaction'] = df.at[i, 'step']
            impr_features[ref]['timestamp_from_last_interaction'] = df.at[i, 'timestamp']
            impr_features[ref]['last_action_type_with_impr'] = df.at[i, 'action_type']
        else:
            impr_features[ref] = {'num_interactions_impr': 1,
                                  'step_from_last_interaction': df.at[i, 'step'],
                                  'timestamp_from_last_interaction': df.at[i, 'timestamp'],
                                  'last_action_type_with_impr': df.at[i, 'action_type']}
    final_df = expand_impressions(
        temp[['user_id', 'session_id', 'impressions']].loc[idxs_click])
    print(len(final_df))
    print(len(impr_feature))
    final_df['dict'] = impr_feature
    features_df = pd.DataFrame(
        final_df.progress_apply(lambda x: tuple(x['dict'].values()), axis=1).tolist(),
        columns=list(final_df.iloc[0].dict.keys()))
    final_df_ = pd.concat([final_df, features_df], axis=1).drop('dict', axis=1)
    final_df_ = final_df_.drop(['num_interactions_impr',
                                'last_action_type_with_impr'], axis=1)
    return final_df_
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # get all the cities
    cities = df['city'].unique().tolist()
    # get the clickout rows (WITHOUT the last clickouts)
    last_indices = find(df)
    df_non_last_clk = df.drop(last_indices)
    df_clickout = df_non_last_clk[
        df_non_last_clk['action_type'] == 'clickout item'][['reference', 'city']]
    df_clickout = df_clickout.rename(columns={'reference': 'item_id'})
    df_clickout = df_clickout.dropna()  # remove NaNs, which should not be there anyway
    df_clickout.item_id = df_clickout.item_id.astype(int)
    # open the impressions df
    o = ImpressionFeature(mode=self.mode)
    df_accomodations = o.read_feature(True)
    df_accomodations = df_accomodations.drop(
        ['properties1 Star', 'properties2 Star', 'properties3 Star',
         'properties4 Star', 'properties5 Star'], axis=1)
    # get the properties of all clicked items
    df_clicks_properties = pd.merge(df_clickout, df_accomodations,
                                    how='left', on=['item_id'])
    df_clicks_properties = df_clicks_properties.sort_values(by=['city'])
    df_clicks_properties = df_clicks_properties.drop('item_id', axis=1)
    # sum all the properties per city
    grouped_by_city = df_clicks_properties.groupby('city').sum()
    # create a df with city:array_of_features
    df_city_features = pd.DataFrame(columns=['city', 'properties_array'])
    df_city_features.city = grouped_by_city.index
    df_city_features.properties_array = grouped_by_city.values.tolist()
    # now take the last clickout df
    clickout_rows = df.loc[last_indices,
                           ['user_id', 'session_id', 'city', 'action_type',
                            'impressions']][df.action_type == 'clickout item']
    clk_expanded = expand_impressions(clickout_rows)
    clk_expanded_wt_city_feat = pd.merge(clk_expanded, df_city_features,
                                         how='left', on=['city'])
    # create a df with item_id:array_of_features
    array = df_accomodations.drop(['item_id'], axis=1).values
    df_item_features = pd.DataFrame(columns=['item_id', 'features_array'])
    df_item_features['item_id'] = df_accomodations['item_id'].values
    df_item_features['features_array'] = list(array)
    final_feature = pd.merge(clk_expanded_wt_city_feat, df_item_features,
                             how='left', on=['item_id'])
    for n in tqdm(final_feature[final_feature['properties_array'].isnull()].index.tolist()):
        final_feature.at[n, 'properties_array'] = [0] * 152
    # cast the lists to numpy arrays to use the cosine (it is written for doubles)
    final_feature.properties_array = \
        final_feature.properties_array.progress_apply(lambda x: np.asarray(x))
    # create the new column
    new_col = []
    if self.metric == 'cosine':
        shrink = 0  # tunable
        for t in tqdm(zip(final_feature.properties_array,
                          final_feature.features_array)):
            new_col.append(cosine_similarity(t[0].astype(np.double),
                                             t[1].astype(np.double), shrink))
    if self.metric == 'euclidean':
        for t in tqdm(zip(final_feature.properties_array,
                          final_feature.features_array)):
            new_col.append(np.linalg.norm(t[0] - t[1]))
    # final feature
    new_feature = final_feature[['user_id', 'session_id', 'item_id']].copy()
    new_feature['city_similarity'] = new_col
    return new_feature
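# The cosine_similarity helper called above is repo code; for reference, a
# plausible shrunk-cosine sketch is below (the shrink placement and the
# epsilon are assumptions, not necessarily the repo's exact definition).
import numpy as np

def shrunk_cosine(a, b, shrink=0.0, eps=1e-9):
    # cosine of the angle between a and b, damped by a shrink term that
    # penalizes similarities computed on sparse vectors
    num = float(np.dot(a, b))
    den = np.linalg.norm(a) * np.linalg.norm(b) + shrink + eps
    return num / den

print(shrunk_cosine(np.array([1.0, 0.0]), np.array([1.0, 1.0])))  # ~0.7071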
def extract_feature(self):
    def get_pos(item, rec):
        # 1-based position of each clicked item inside its impression list,
        # -1 when the item does not appear among the impressions
        res = np.empty(item.shape)
        for i in tqdm(range(len(item))):
            if str(item[i]) in rec[i]:
                res[i] = rec[i].index(str(item[i])) + 1
            else:
                res[i] = -1
        return res.astype(int)

    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_clickout_indices = find(df)
    all_clk_rows = df[(df.reference.str.isnumeric() == True) &
                      (df.action_type == 'clickout item')]
    all_clk_rows = all_clk_rows[['user_id', 'session_id', 'reference', 'impressions']]
    all_clk_rows.impressions = all_clk_rows.impressions.str.split('|')
    pos_col = get_pos(all_clk_rows.reference.values, all_clk_rows.impressions.values)
    all_clk_rows = all_clk_rows.drop('impressions', axis=1)
    all_clk_rows['position'] = pos_col
    # keep only the clicks that happened after the first impression position
    all_clk_rows_after_1 = all_clk_rows[all_clk_rows.position > 1]
    df_clicks_after_1 = (
        all_clk_rows_after_1
        .groupby(['reference'])
        .size()
        .reset_index(name='n_clicks_per_item')
    )
    df_clicks_after_1.reference = df_clicks_after_1.reference.astype(int)
    df_clicks_after_1 = df_clicks_after_1.rename(columns={'reference': 'item_id'})
    last_clk_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'reference', 'impressions']]
    last_clk_rows['imp_list'] = last_clk_rows.impressions.str.split('|')
    clk_expanded = expand_impressions(last_clk_rows)
    clk_expanded = clk_expanded.drop('index', axis=1)
    pos_col = get_pos(clk_expanded.item_id.values, clk_expanded.imp_list.values)
    clk_expanded['position'] = pos_col
    clk_expanded = clk_expanded.drop('imp_list', axis=1)
    merged = pd.merge(clk_expanded, df_clicks_after_1,
                      how='left', on='item_id').fillna(0)
    # leave-one-out: discount the current click when it happened after position 1
    new_col = []
    merged.item_id = merged.item_id.astype(int)
    merged.reference = merged.reference.astype(int)
    for t in tqdm(zip(merged.item_id, merged.reference,
                      merged.position, merged.n_clicks_per_item)):
        if t[0] == t[1] and t[2] > 1:
            new_col.append(int(t[3] - 1))
        else:
            new_col.append(int(t[3]))
    merged['n_clicks_after_first_pos'] = new_col
    feature = merged[['user_id', 'session_id', 'item_id', 'n_clicks_after_first_pos']]
    return feature
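# Standalone sketch of the inner get_pos helper above, with a toy check
# (same semantics: 1-based position of the item in its impression list,
# -1 when absent; the function name here is mine, not the repo's).
import numpy as np

def get_pos_sketch(items, imp_lists):
    res = np.full(len(items), -1, dtype=int)
    for i, (item, imps) in enumerate(zip(items, imp_lists)):
        if str(item) in imps:
            res[i] = imps.index(str(item)) + 1
    return res

print(get_pos_sketch([22, 99], [['11', '22', '33'], ['11', '22', '33']]))  # [ 2 -1]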
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # get ALL the clickouts
    reference_rows = df[(df.reference.str.isnumeric() == True) &
                        (df.action_type == 'clickout item')][[
                            'user_id', 'session_id', 'reference', 'impressions']]
    # get the last clickouts
    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'reference', 'impressions']]
    clk_expanded = expand_impressions(clickout_rows)
    # count how many times each item appears in the impressions
    impression_lists = reference_rows.impressions.str.split('|').tolist()
    big_list = [x for l in impression_lists for x in l]
    c = dict(Counter(big_list))
    df_times_in_impressions = pd.DataFrame.from_dict(
        c, orient='index', columns=['number_of_times_in_impr'])
    df_times_in_impressions['item_id'] = df_times_in_impressions.index.astype(int)
    df_times_in_impressions = df_times_in_impressions.reindex(
        columns=['item_id', 'number_of_times_in_impr'])
    feature_times_per_imp = pd.merge(clk_expanded, df_times_in_impressions,
                                     how='left', on=['item_id']).fillna(0)
    feature_times_per_imp.number_of_times_in_impr = \
        feature_times_per_imp.number_of_times_in_impr.astype(int)
    feature_times_per_imp = feature_times_per_imp[[
        'user_id', 'session_id', 'item_id', 'number_of_times_in_impr']]
    # count how many times each item has been clicked
    df_item_clicks = (
        reference_rows
        .groupby(['reference'])
        .size()
        .reset_index(name='n_clickouts')
    )
    df_item_clicks = df_item_clicks.rename(columns={'reference': 'item_id'})
    df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)
    merged = pd.merge(df_times_in_impressions, df_item_clicks,
                      how='left', on=['item_id']).fillna(0)
    merged.n_clickouts = merged.n_clickouts.astype(int)
    final_feature = pd.merge(clk_expanded, merged,
                             how='left', on=['item_id']).fillna(0)
    new_col = []
    final_feature.reference = final_feature.reference.astype(int)
    final_feature.item_id = final_feature.item_id.astype(int)
    for t in tqdm(zip(final_feature.reference, final_feature.item_id,
                      final_feature.number_of_times_in_impr,
                      final_feature.n_clickouts)):
        if t[0] == t[1]:
            # same reference as the clicked item: discount 1 from both
            # the click count and the impression count
            if t[2] != 1:
                new_col.append(round(((t[3] - 1) * 100) / (t[2] - 1), 5))
            else:
                new_col.append(0)
        else:
            if 0 not in [t[2], t[3]] and t[2] != 1:
                new_col.append(round((t[3] * 100) / (t[2] - 1), 5))
            else:
                new_col.append(0)
    final_feature['adj_perc_click_appeared'] = new_col
    final_feature = final_feature[[
        'user_id', 'session_id', 'item_id', 'adj_perc_click_appeared']]
    return final_feature
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'platform',
                            'action_type', 'impressions']]
    last_clk_removed_df = df.drop(last_clickout_indices)
    reference_rows = last_clk_removed_df[
        last_clk_removed_df.reference.str.isnumeric() == True]
    df_item_clicks = (reference_rows
                      .groupby(['reference', 'platform'])
                      .size()
                      .reset_index(name='n_interactions_per_item'))
    df_item_clicks = df_item_clicks.rename(columns={'reference': 'item_id'})
    df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)
    df_city_clicks = (reference_rows
                      .groupby('platform')
                      .size()
                      .reset_index(name='n_interactions_per_plat'))
    final_df = pd.merge(df_item_clicks, df_city_clicks,
                        how='left', on=['platform']).fillna(0)
    final_df['percentage_of_total_plat_inter'] = 0.0
    for t in zip(final_df.index, final_df.n_interactions_per_item,
                 final_df.n_interactions_per_plat):
        percentage_of_total_plat_inter = round((t[1] * 100.0) / t[2], 2)
        final_df.at[t[0], 'percentage_of_total_plat_inter'] = \
            percentage_of_total_plat_inter
    feature = final_df[['platform', 'item_id', 'percentage_of_total_plat_inter']]
    clk_expanded = expand_impressions(clickout_rows)
    feature = pd.merge(clk_expanded, feature, how='left',
                       on=['platform', 'item_id']).fillna(0)
    feature = feature[['user_id', 'session_id', 'item_id',
                       'percentage_of_total_plat_inter']]
    return feature
def convert_and_add_pos(df):
    df_t = expand_impressions(df)
    df['index'] = df.index
    df = pd.merge(df_t, df, how='left',
                  on=['index', 'user_id', 'session_id', 'action_type'],
                  suffixes=('', '_y'))
    df = df.drop('time_per_impression_y', axis=1)
    # 1-based position of the item inside the impression list
    df['item_pos'] = df.apply(
        lambda x: x['impression_list'].index(str(x['item_id'])) + 1, axis=1)
    df = df.drop(['impression_list', 'index'], axis=1)
    return df
def merge_features_tf_cv(mode, cluster, features_array):
    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df
    # retrieve the indices of the last clickouts
    print('find_last_click_idxs')
    last_click_idxs = find_last_clickout_indices(full_df)
    # filter on the found indices, obtaining only the rows of a last clickout
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()
    # expand the impressions as rows
    print('expand the impression')
    click_df = expand_impressions(click_df)[['user_id', 'session_id',
                                             'item_id', 'index']]
    click_df['dummy_step'] = np.arange(len(click_df))
    # do the join
    print('join with the features')
    print(f'train_shape: {click_df.shape}\n')
    context_features_id = []
    for f in features_array:
        if type(f) == tuple:
            feature = f[0](mode=mode, cluster='no_cluster').read_feature(one_hot=f[1])
        else:
            feature = f(mode=mode, cluster='no_cluster').read_feature(one_hot=True)
        print(f'columns of the feature:\n {feature.columns}')
        print(f'NaN values are: {feature.isnull().values.sum()}')
        # if there are any, fill them with 0
        feature.fillna(0, inplace=True)
        # check whether it is a feature of the impression
        if 'item_id' not in feature.columns:
            for i in range(click_df.shape[1] - 6 + 1,
                           click_df.shape[1] - 6 + 1 + feature.shape[1] - 2, 1):
                context_features_id.append(str(i))
            print(f'session features names:{context_features_id}')
        print(f'shape of feature: {feature.shape}')
        print(f'len of feature:{len(feature)}')
        click_df = click_df.merge(feature)
        print(f'train_shape: {click_df.shape}\n ')
    print('sorting by index and step...')
    # sort the dataframe
    click_df.sort_values(['index', 'dummy_step'], inplace=True)
    click_df.drop('dummy_step', axis=1, inplace=True)
    print('after join')
    return click_df, np.array(context_features_id)
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # preprocess needed
    df = df.sort_values(by=['user_id', 'session_id', 'timestamp',
                            'step']).reset_index(drop=True)
    df = remove_last_part_of_clk_sessions(df)
    # compute number of interactions per session
    df_int = df[df.action_type == 'interaction item image'][[
        'user_id', 'session_id', 'timestamp', 'step', 'action_type']]
    feature = (df_int.groupby(['user_id', 'session_id'])
               .size()
               .reset_index(name='num_inter_item_image'))
    # compute session length
    sess_size = (df.groupby(['user_id', 'session_id'])
                 .size()
                 .reset_index(name='session_length'))
    # get clk rows and expand
    clickout_rows = df.loc[find(df),
                           ['user_id', 'session_id', 'action_type',
                            'impressions']][df.action_type == 'clickout item']
    clk_expanded = expand_impressions(clickout_rows).drop(
        ['index', 'action_type'], axis=1)
    # merge
    final_feature = pd.merge(clk_expanded, feature, how='left',
                             on=['user_id', 'session_id']).fillna(0)
    final_feature.num_inter_item_image = \
        final_feature.num_inter_item_image.astype(int)
    final_feature = pd.merge(final_feature, sess_size, how='left',
                             on=['user_id', 'session_id']).fillna(0)
    final_feature.session_length = final_feature.session_length.astype(int)
    # compute the percentage
    perc = []
    for t in tqdm(zip(final_feature.num_inter_item_image,
                      final_feature.session_length)):
        perc.append((t[0] * 100) / t[1])
    final_feature['perc_inter_item_image'] = perc
    return final_feature[['user_id', 'session_id', 'item_id',
                          'perc_inter_item_image']]
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # preprocess needed
    df = df.sort_values(by=['user_id', 'session_id', 'timestamp',
                            'step']).reset_index(drop=True)
    df = remove_last_part_of_clk_sessions(df)
    sess_not_numeric_interactions = (
        df[df.reference.str.isnumeric() != True][[
            'user_id', 'session_id', 'timestamp', 'step']]
        .groupby(['user_id', 'session_id'])
        .size()
        .reset_index(name='num_not_numeric_interactions'))
    sess_size = (df.groupby(['user_id', 'session_id'])
                 .size()
                 .reset_index(name='session_length'))
    clickout_rows = df.loc[find(df),
                           ['user_id', 'session_id', 'action_type',
                            'impressions']][df.action_type == 'clickout item']
    clk_expanded = expand_impressions(clickout_rows).drop('index', axis=1)
    feature = pd.merge(clk_expanded, sess_not_numeric_interactions,
                       how='left', on=['user_id', 'session_id']).fillna(0)
    feature.num_not_numeric_interactions = \
        feature.num_not_numeric_interactions.astype(int)
    feature = pd.merge(feature, sess_size, how='left',
                       on=['user_id', 'session_id']).fillna(0)
    feature.session_length = feature.session_length.astype(int)
    perc = []
    for t in tqdm(zip(feature.num_not_numeric_interactions,
                      feature.session_length)):
        perc.append((t[0] * 100) / t[1])
    feature['perc_not_numeric'] = perc
    return feature[['user_id', 'session_id', 'item_id', 'perc_not_numeric']]
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    if self.mode in ['small', 'local']:
        print('reinserting clickout')
        test = test.groupby(['session_id', 'user_id']).progress_apply(_reinsert_clickout)
    df = pd.concat([train, test])
    idxs_click = find_last_clickout_indices(df)
    df = df.loc[idxs_click][['user_id', 'session_id', 'reference', 'impressions']]
    df = expand_impressions(df)
    # an impression row is labeled 1 when it is the clicked reference
    df['label'] = (df['item_id'] == df['reference'].astype('float')) * 1
    df.drop(['index', 'reference'], axis=1, inplace=True)
    print(df)
    return df
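# The label construction above, isolated on toy data (illustrative frame):
# an expanded impression row is positive exactly when its item_id matches
# the session's clicked reference.
import pandas as pd

toy = pd.DataFrame({'item_id': [11, 22, 33],
                    'reference': ['22', '22', '22']})  # reference is a string column
toy['label'] = (toy['item_id'] == toy['reference'].astype('float')) * 1
print(toy.label.tolist())  # [0, 1, 0]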
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_clickout_indices = find(df)
    last_clk_removed_df = df.drop(last_clickout_indices)
    reference_rows = last_clk_removed_df[
        (last_clk_removed_df.reference.str.isnumeric() == True) &
        (last_clk_removed_df.action_type == 'clickout item')][[
            'user_id', 'session_id', 'reference', 'impressions']]
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'impressions']]
    clk_expanded = expand_impressions(clickout_rows)
    # flatten the impression lists into a single 1-dim list
    impression_lists = reference_rows.impressions.str.split('|').tolist()
    big_list = [x for l in impression_lists for x in l]
    # count the occurrences of each item_id in the impressions
    c = dict(Counter(big_list))
    df_times_in_impressions = pd.DataFrame.from_dict(
        c, orient='index', columns=['num_times_item_impressed'])
    df_times_in_impressions['item_id'] = \
        df_times_in_impressions.index.astype(int)
    df_times_in_impressions = df_times_in_impressions.reindex(
        columns=['item_id', 'num_times_item_impressed'])
    df_times_in_impressions = df_times_in_impressions.sort_values(
        by=['item_id']).reset_index(drop=True)
    feature = pd.merge(clk_expanded, df_times_in_impressions,
                       how='left', on=['item_id']).fillna(0)
    feature.num_times_item_impressed = \
        feature.num_times_item_impressed.astype(int)
    return feature[['user_id', 'session_id', 'item_id',
                    'num_times_item_impressed']]
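# The Counter-based impression count above, on toy data (a sketch; the real
# impressions column comes from the concatenated train/test dataframe).
from collections import Counter
import pandas as pd

impressions = pd.Series(['1|2|3', '2|3', '3'])
flat = [x for l in impressions.str.split('|') for x in l]
counts = pd.DataFrame.from_dict(Counter(flat), orient='index',
                                columns=['num_times_item_impressed'])
print(counts.to_dict()['num_times_item_impressed'])  # {'1': 1, '2': 2, '3': 3}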
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # get clickout rows
    clickout_rows = df.loc[find(df), ['user_id', 'session_id', 'impressions']][
        df.action_type == 'clickout item']
    clk_expanded = expand_impressions(clickout_rows).drop(['index'], axis=1)
    # get position
    new_col = []
    curr_u = clk_expanded.loc[0, 'user_id']
    curr_s = clk_expanded.loc[0, 'session_id']
    pos = 0
    for t in tqdm(zip(clk_expanded.user_id, clk_expanded.session_id)):
        if t[0] == curr_u and t[1] == curr_s:
            pos += 1
        else:
            pos = 1
            curr_u = t[0]
            curr_s = t[1]
        new_col.append(pos)
    clk_expanded['position'] = new_col
    # get impression count for each session
    imp_count = (clk_expanded.groupby(['user_id', 'session_id'])
                 .size()
                 .reset_index(name='num_impressions'))
    # merge and compute percentage
    feature = pd.merge(clk_expanded, imp_count, how='left',
                       on=['user_id', 'session_id']).fillna(0)
    pos_perc = []
    for t in tqdm(zip(feature.position, feature.num_impressions)):
        pos_perc.append((t[0] * 100) / t[1])
    feature['impression_position_in_percentage'] = pos_perc
    return feature[['user_id', 'session_id', 'item_id',
                    'impression_position_in_percentage']]
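# The position loop above can also be written with groupby().cumcount(),
# since the expanded impressions are already ordered within each session;
# a toy equivalence sketch (not the repo's actual code path):
import pandas as pd

toy = pd.DataFrame({'user_id': ['a', 'a', 'a', 'b'],
                    'session_id': ['s1', 's1', 's1', 's2'],
                    'item_id': [1, 2, 3, 9]})
toy['position'] = toy.groupby(['user_id', 'session_id']).cumcount() + 1
print(toy.position.tolist())  # [1, 2, 3, 1]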
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'impressions']]
    clk_expanded = expand_impressions(clickout_rows)
    o = ImpressionFeature(mode=self.mode)
    f = o.read_feature(True)  # get the accommodations df
    feature_stars = f[['item_id', 'properties1 Star', 'properties2 Star',
                       'properties3 Star', 'properties4 Star',
                       'properties5 Star']]
    # remap the column names
    feature_stars = feature_stars.rename(columns={
        'properties1 Star': '1', 'properties2 Star': '2',
        'properties3 Star': '3', 'properties4 Star': '4',
        'properties5 Star': '5'})
    # default to 0 stars for the items missing this property
    feature_stars['0'] = pd.Series(np.ones(len(feature_stars), dtype=np.uint8),
                                   index=feature_stars.index)
    feature_stars['stars'] = feature_stars[['5', '4', '3', '2', '1',
                                            '0']].idxmax(axis=1)
    feature_stars_restricted = feature_stars[['item_id', 'stars']]
    final_feature = pd.merge(clk_expanded, feature_stars_restricted,
                             how='left', on=['item_id']).fillna(1)
    final_feature['stars'] = final_feature['stars'].astype(int)
    final_feature['stars'] = final_feature['stars'].replace(0, -1)
    return final_feature[['user_id', 'session_id', 'item_id', 'stars']]
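# How the idxmax trick above maps the one-hot star columns to a single
# label, on a toy one-hot frame (illustrative values): the '0' column is
# always 1, so listing the real star columns first makes idxmax prefer a
# real star when one is set and fall back to '0' otherwise.
import pandas as pd

stars = pd.DataFrame({'5': [0, 0], '4': [1, 0], '3': [0, 0],
                      '2': [0, 0], '1': [0, 0], '0': [1, 1]})
print(stars[['5', '4', '3', '2', '1', '0']].idxmax(axis=1).tolist())  # ['4', '0']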
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    idxs_click = find_last_clickout_indices(df)
    df = df.loc[idxs_click][['user_id', 'session_id', 'impressions']]
    df = expand_impressions(df)
    # initialize the session id
    session_id = ''
    count = 1
    impression_position = []
    for i in tqdm(df.index):
        c_session = df.at[i, 'session_id']
        if c_session != session_id:
            session_id = c_session
            count = 1
        impression_position.append(count)
        count += 1
    df['impression_position'] = impression_position
    df['impression_position'] = pd.to_numeric(df['impression_position'])
    df.drop('index', axis=1, inplace=True)
    return df
def merge_features(mode, cluster, features_array, onehot=True,
                   merge_kind='inner', create_not_existing_features=True,
                   multithread=False):
    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df
    # retrieve the indices of the last clickouts
    print('find_last_click_idxs')
    last_click_idxs = find_last_clickout_indices(full_df)
    last_click_idxs = sorted(last_click_idxs)
    # filter on the found indices, obtaining only the rows of a last clickout
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()
    print('retrieve vali_idxs')
    # if the mode is full we don't have the validation; if the mode is small
    # or local, the validation is performed on the target indices
    vali_test_idxs = data.target_indices(mode, cluster)
    # construct the validation train and test df_base
    print('construct test and vali df')
    validation_test_df = click_df.loc[vali_test_idxs]
    all_idxs = click_df.index.values
    # find the differences
    print('construct train df')
    train_idxs = np.setdiff1d(all_idxs, vali_test_idxs, assume_unique=True)
    train_df = click_df.loc[train_idxs]
    # expand the impressions as rows
    print('expand the impression')
    train_df = expand_impressions(train_df)[['user_id', 'session_id',
                                             'item_id', 'index']]
    train_df['dummy_step'] = np.arange(len(train_df))
    validation_test_df = expand_impressions(validation_test_df)[[
        'user_id', 'session_id', 'item_id', 'index']]
    validation_test_df['dummy_step'] = np.arange(len(validation_test_df))
    if not multithread:
        train_df, validation_test_df = actual_merge_one_thread(
            train_df, validation_test_df, features_array,
            mode, cluster, create_not_existing_features, merge_kind, onehot)
    else:
        train_df, validation_test_df = actual_merge_multithread(
            train_df, validation_test_df, features_array,
            mode, cluster, create_not_existing_features, merge_kind, onehot)
    print('sorting by index and step...')
    # sort the dataframes
    train_df.sort_values(['index', 'dummy_step'], inplace=True)
    train_df.drop('dummy_step', axis=1, inplace=True)
    validation_test_df.sort_values(['index', 'dummy_step'], inplace=True)
    validation_test_df.drop('dummy_step', axis=1, inplace=True)
    print('after join')
    return train_df, validation_test_df, train_idxs, vali_test_idxs
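# The train/validation split used above, isolated: the target indices are
# removed from the set of last-clickout indices via np.setdiff1d (the toy
# indices below are illustrative).
import numpy as np

all_idxs = np.array([3, 8, 15, 21, 42])   # last-clickout row indices
vali_test_idxs = np.array([8, 42])        # target (validation/test) indices
train_idxs = np.setdiff1d(all_idxs, vali_test_idxs, assume_unique=True)
print(train_idxs)  # [ 3 15 21]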
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # get the last clickout rows
    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'impressions', 'prices']]
    clk_expanded = expand_impressions(clickout_rows).drop('index', axis=1)
    # open the item metadata in one hot
    o = ImpressionFeature(mode=self.mode)
    df_accomodations = o.read_feature(True)
    # get the stars
    feature_stars = df_accomodations[['item_id', 'properties1 Star',
                                      'properties2 Star', 'properties3 Star',
                                      'properties4 Star', 'properties5 Star']]
    # remap the column names
    feature_stars = feature_stars.rename(columns={
        'properties1 Star': '1', 'properties2 Star': '2',
        'properties3 Star': '3', 'properties4 Star': '4',
        'properties5 Star': '5'})
    # default to 0 stars for the items missing this property
    feature_stars['0'] = pd.Series(np.ones(len(feature_stars), dtype=np.uint8),
                                   index=feature_stars.index)
    feature_stars['stars'] = feature_stars[['5', '4', '3', '2', '1',
                                            '0']].idxmax(axis=1)
    feature_stars_restricted = feature_stars[['item_id', 'stars']]
    f_stars = pd.merge(clk_expanded, feature_stars_restricted,
                       how='left', on=['item_id'])
    f_stars['stars'] = f_stars['stars'].astype(int)
    # get the ratings
    f_ratings = df_accomodations[['item_id', 'propertiesExcellent Rating',
                                  'propertiesVery Good Rating',
                                  'propertiesGood Rating',
                                  'propertiesSatisfactory Rating']]
    f_ratings['propertiesNo Rating'] = pd.Series(
        np.ones(len(f_ratings), dtype=np.uint8), index=f_ratings.index)
    # the rating one-hots are cumulative, so a column-wise diff isolates the
    # best rating band, and idxmax then extracts it as a number
    ratings = f_ratings.iloc[:, 1:]
    ratings['fake'] = pd.Series(np.zeros(len(ratings), dtype=np.uint8),
                                index=ratings.index)
    cols = ratings.columns.tolist()
    cols = [cols[-1]] + cols[:-1]
    ratings = ratings.reindex(columns=cols)
    dff = ratings.diff(axis=1).drop(['fake'], axis=1)
    dff = dff.astype(int)
    dff.columns = [5, 4, 3, 2, 1]
    f_ratings = f_ratings.drop(f_ratings.columns[1:], axis=1)
    f_ratings['rating'] = dff.idxmax(axis=1)
    f_ratings = pd.merge(f_ratings, feature_stars_restricted,
                         how='left', on=['item_id'])
    df_clk_rat_star = pd.merge(clk_expanded, f_ratings, how='left', on='item_id')
    # expand the prices, tracking the impression position inside the session
    df_clk_rat_star.prices = df_clk_rat_star.prices.str.split('|')
    curr_user = '******'
    curr_sess = '_'
    pos = 0
    price_expanded = []
    for t in tqdm(zip(df_clk_rat_star.user_id, df_clk_rat_star.session_id,
                      df_clk_rat_star.prices)):
        # check if we are still inside the same session
        if curr_user != t[0] or curr_sess != t[1]:
            pos = 0
            curr_user = t[0]
            curr_sess = t[1]
        else:
            pos += 1
        price_expanded.append(t[2][pos])
    df_clk_rat_star['price'] = price_expanded
    df_clk_rat_star = df_clk_rat_star.drop(['prices'], axis=1)
    df_clk_rat_star.stars = df_clk_rat_star.stars.astype(int)
    # fill the missing star values with the session mean,
    # computed only over the non-zero values
    avg = df_clk_rat_star[['user_id', 'session_id', 'stars']]
    avg = avg.loc[avg.stars != 0]
    avg = pd.DataFrame(
        avg.groupby(['user_id', 'session_id'])['stars'].progress_apply(
            lambda x: int(x.sum() / x.size))).fillna(0)
    avg = avg.rename(columns={'stars': 'stars_avg'})
    avg.stars_avg = avg.stars_avg.astype(int)
    no_stars = df_clk_rat_star.loc[df_clk_rat_star.stars == 0,
                                   ['user_id', 'session_id', 'item_id']]
    stars_filled = pd.merge(no_stars, avg, how='left',
                            on=['user_id', 'session_id']).fillna(0)
    stars_filled.stars_avg = stars_filled.stars_avg.astype(int)
    df_clk_rat_star_filled = pd.merge(df_clk_rat_star, stars_filled, how='left',
                                      on=['user_id', 'session_id', 'item_id'])
    for t in zip(df_clk_rat_star_filled.stars,
                 df_clk_rat_star_filled.stars_avg,
                 df_clk_rat_star_filled.index):
        if t[0] == 0:
            df_clk_rat_star_filled.at[t[2], 'stars'] = t[1]
    df_clk_rat_star_filled = df_clk_rat_star_filled.drop('stars_avg', axis=1)
    # now fill the missing values for the rating in the same way,
    # computing the mean only over the non-default (rating != 1) values
    avg = df_clk_rat_star_filled[['user_id', 'session_id', 'rating']]
    avg.rating = avg.rating.astype(int)
    avg = avg.loc[avg.rating != 1]
    avg = pd.DataFrame(
        avg.groupby(['user_id', 'session_id'])['rating'].progress_apply(
            lambda x: int(x.sum() / x.size))).fillna(0)
    avg = avg.rename(columns={'rating': 'rating_avg'})
    avg.rating_avg = avg.rating_avg.astype(int)
    no_rat = df_clk_rat_star.loc[df_clk_rat_star.rating == 1,
                                 ['user_id', 'session_id', 'item_id']]
    rat_filled = pd.merge(no_rat, avg, how='left',
                          on=['user_id', 'session_id']).fillna(0)
    rat_filled.rating_avg = rat_filled.rating_avg.astype(int)
    df_clk_rat_star_rat_filled = pd.merge(df_clk_rat_star_filled, rat_filled,
                                          how='left',
                                          on=['user_id', 'session_id', 'item_id'])
    for t in zip(df_clk_rat_star_rat_filled.rating,
                 df_clk_rat_star_rat_filled.rating_avg,
                 df_clk_rat_star_rat_filled.index):
        if t[0] == 1:
            df_clk_rat_star_rat_filled.at[t[2], 'rating'] = t[1]
    df_clk_rat_star_rat_filled = df_clk_rat_star_rat_filled.drop('rating_avg', axis=1)
    # add the feature column: a price-quality score
    new_col = []
    df_clk_rat_star_rat_filled.rating = df_clk_rat_star_rat_filled.rating.astype(int)
    df_clk_rat_star_rat_filled.stars = df_clk_rat_star_rat_filled.stars.astype(int)
    df_clk_rat_star_rat_filled.price = df_clk_rat_star_rat_filled.price.astype(int)
    for t in tqdm(zip(df_clk_rat_star_rat_filled.rating,
                      df_clk_rat_star_rat_filled.stars,
                      df_clk_rat_star_rat_filled.price)):
        new_col.append((1.5 * t[0] + t[1]) / t[2])
    df_clk_rat_star_rat_filled['price_quality'] = new_col
    final_feature = df_clk_rat_star_rat_filled[['user_id', 'session_id',
                                                'item_id', 'price_quality']]
    return final_feature
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # get only the non-last-clickout clickout rows
    last_clickout_indices = find(df)
    last_clk_removed_df = df.drop(last_clickout_indices)
    reference_rows = last_clk_removed_df[
        (last_clk_removed_df.reference.str.isnumeric() == True) &
        (last_clk_removed_df.action_type == 'clickout item')][[
            'user_id', 'session_id', 'reference', 'impressions']]
    # get the impressions
    impression_lists = reference_rows.impressions.str.split('|').tolist()
    big_list = [x for l in impression_lists for x in l]  # flatten the lists
    # count the occurrences of each accommodation in the impression lists
    c = dict(Counter(big_list))
    # create a df from the dictionary: for each accommodation, the number
    # of times it appears in the impressions
    df_times_in_impressions = pd.DataFrame.from_dict(
        c, orient='index', columns=['number_of_times_in_impr'])
    df_times_in_impressions['item_id'] = \
        df_times_in_impressions.index.astype(int)
    df_times_in_impressions = df_times_in_impressions.reindex(
        columns=['item_id', 'number_of_times_in_impr'])
    # get the number of times each accommodation has been clicked
    df_item_clicks = (reference_rows
                      .groupby(['reference'])
                      .size()
                      .reset_index(name='n_clickouts'))
    df_item_clicks = df_item_clicks.rename(columns={'reference': 'item_id'})
    df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)
    # merge the two dfs
    merged = pd.merge(df_times_in_impressions, df_item_clicks,
                      how='left', on=['item_id']).fillna(0)
    merged.n_clickouts = merged.n_clickouts.astype(int)
    merged['perc_click_appeared'] = round(
        (merged.n_clickouts * 100) / merged.number_of_times_in_impr, 2)
    # create the per-item feature
    feature_per_item = merged[['item_id', 'perc_click_appeared']]
    # attach the feature to each last clickout
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'impressions']]
    clk_expanded = expand_impressions(clickout_rows)
    final_feature = pd.merge(clk_expanded, feature_per_item,
                             how='left', on=['item_id']).fillna(0)
    final_feature = final_feature[['user_id', 'session_id', 'item_id',
                                   'perc_click_appeared']]
    return final_feature
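# The click-through percentage computed above, on toy counts (illustrative):
# clicks received divided by times impressed, as a rounded percentage.
import pandas as pd

merged = pd.DataFrame({'item_id': [1, 2],
                       'number_of_times_in_impr': [10, 4],
                       'n_clickouts': [3, 1]})
merged['perc_click_appeared'] = round(
    (merged.n_clickouts * 100) / merged.number_of_times_in_impr, 2)
print(merged.perc_click_appeared.tolist())  # [30.0, 25.0]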
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # get the last clickout rows
    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'city', 'reference',
                            'impressions']][df.action_type == 'clickout item']
    # get the reference rows, INCLUDING the last clickouts
    reference_rows = df[(df.reference.str.isnumeric() == True) &
                        (df.action_type == 'clickout item')]
    # compute the popularity, INCLUDING the last clickouts
    df_item_clicks = (reference_rows
                      .groupby(['reference', 'city'])
                      .size()
                      .reset_index(name='n_interactions_per_item'))
    df_item_clicks = df_item_clicks.rename(columns={'reference': 'item_id'})
    df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)
    df_city_clicks = (reference_rows
                      .groupby('city')
                      .size()
                      .reset_index(name='n_interactions_per_city'))
    # merge the expanded clickout rows with the popularity dataframes
    merged_df = pd.merge(df_item_clicks, df_city_clicks,
                         how='left', on=['city']).fillna(0)
    clk_expanded = expand_impressions(clickout_rows)
    feature = pd.merge(clk_expanded, merged_df, how='left',
                       on=['item_id', 'city']).fillna(0)
    # compute the percentage of clicks per city
    new_col = []
    feature.reference = feature.reference.astype(int)
    feature.item_id = feature.item_id.astype(int)
    for t in tqdm(zip(feature.reference, feature.item_id,
                      feature.n_interactions_per_item,
                      feature.n_interactions_per_city)):
        if t[0] == t[1]:
            # this is the clicked item: discount its own click
            if t[3] != 1:
                percentage_of_total_city_clk = round(
                    ((t[2] - 1) * 100.0) / (t[3] - 1), 5)
            else:
                percentage_of_total_city_clk = 0
        else:
            # not the clicked item: still discount the session's click
            # from the city total
            if 0 not in [t[2], t[3]] and t[3] != 1:
                percentage_of_total_city_clk = round(
                    (t[2] * 100.0) / (t[3] - 1), 5)
            else:
                percentage_of_total_city_clk = 0
        new_col.append(percentage_of_total_city_clk)
    feature['adj_percentage_of_total_city_clk'] = new_col
    feature.adj_percentage_of_total_city_clk = \
        feature.adj_percentage_of_total_city_clk.astype(float)
    final_feature_reduced = feature[['user_id', 'session_id', 'item_id',
                                     'adj_percentage_of_total_city_clk']]
    return final_feature_reduced
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # first step: get all the platforms
    platforms = sorted(df.platform.unique().tolist())
    # create a df that for each platform will hold the feature vector
    df_plat_feature = pd.DataFrame(columns=['platform', 'properties_array'])
    df_plat_feature['platform'] = platforms
    # remove the last clickouts and do some preprocessing
    last_indices = find(df)
    df_clickout = df[(df.reference.str.isnumeric() == True) &
                     (df['action_type'] == 'clickout item')][[
                         'reference', 'platform']]
    df_clickout = df_clickout.rename(columns={'reference': 'item_id'})
    df_clickout.item_id = df_clickout.item_id.astype(int)
    # get the item metadata in one hot
    o = ImpressionFeature(mode=self.mode)
    df_accomodations = o.read_feature(True)
    df_accomodations = df_accomodations.drop(
        ['properties1 Star', 'properties2 Star', 'properties3 Star',
         'properties4 Star', 'properties5 Star'], axis=1)
    # merge the clickouts dataframe with the metadata
    df_clicks_properties = pd.merge(df_clickout, df_accomodations,
                                    how='left', on=['item_id'])
    # extract the one-hot encoded features into a numpy array
    array = df_accomodations.drop(['item_id'], axis=1).values
    # for each item, append its features as a numpy array
    df_item_features = pd.DataFrame(columns=['item_id', 'features_array'])
    df_item_features['item_id'] = df_accomodations['item_id'].values
    df_item_features['features_array'] = list(array)
    # for each platform, average the feature rows of its clickouts
    new_col = []  # will hold the platform feature vectors
    for p in tqdm(platforms):
        df_clicks_properties_per_plat = df_clicks_properties[
            df_clicks_properties.platform == p]
        df_clicks_properties_per_plat = df_clicks_properties_per_plat.drop(
            ['item_id', 'platform'], axis=1)
        df_sum = df_clicks_properties_per_plat.sum()
        # this if is needed because some platforms never appear in the
        # clickouts: for those, use a vector of zeros
        if df_clicks_properties_per_plat.shape[0] != 0:
            df_sum = df_sum.apply(
                lambda x: x / df_clicks_properties_per_plat.shape[0])
            plat_feature = df_sum.values
        else:
            plat_feature = np.asarray(
                [0] * df_clicks_properties_per_plat.shape[1])
        new_col.append(plat_feature)
    df_plat_feature['properties_array'] = new_col
    # now take the last clickout rows and expand them on the impression list
    clickout_rows = df.loc[last_indices,
                           ['user_id', 'session_id', 'platform', 'action_type',
                            'impressions']][df.action_type == 'clickout item']
    clk_expanded = expand_impressions(clickout_rows)
    clk_expanded = clk_expanded.drop(['index', 'action_type'], axis=1)
    # for each impression, add the platform's and the impression's feature vectors
    clk_expanded_wt_plat_feat = pd.merge(clk_expanded, df_plat_feature,
                                         how='left', on=['platform'])
    final_feature = pd.merge(clk_expanded_wt_plat_feat, df_item_features,
                             how='left', on=['item_id'])
    # compute the similarity between the impression's feature vector
    # and the platform's feature vector
    new_col = []
    if self.metric == 'cosine':
        shrink = 5  # tunable
        for t in tqdm(zip(final_feature.properties_array,
                          final_feature.features_array)):
            new_col.append(cosine_similarity(t[0].astype(np.double),
                                             t[1].astype(np.double), shrink))
    final_feature = final_feature[['user_id', 'session_id', 'item_id']]
    final_feature['adj_platform_features_similarity'] = new_col
    return final_feature
def merge_features_tf(mode, cluster, features_array, stacking_scores_path):
    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df
    # retrieve the indices of the last clickouts
    print('find_last_click_idxs')
    last_click_idxs = find_last_clickout_indices(full_df)
    # filter on the found indices, obtaining only the rows of a last clickout
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()
    print('retrieve vali_idxs')
    # if the mode is full we don't have the validation; if the mode is small
    # or local, the validation is performed on the target indices
    vali_test_idxs = data.target_indices(mode, cluster)
    # construct the validation train and test df_base
    print('construct test and vali df')
    validation_test_df = click_df.loc[vali_test_idxs]
    all_idxs = click_df.index.values
    # find the differences
    print('construct train df')
    train_idxs = np.setdiff1d(all_idxs, vali_test_idxs, assume_unique=True)
    train_df = click_df.loc[train_idxs]
    # expand the impressions as rows
    print('expand the impression')
    train_df = expand_impressions(train_df)[['user_id', 'session_id',
                                             'item_id', 'index']]
    train_df['dummy_step'] = np.arange(len(train_df))
    validation_test_df = expand_impressions(validation_test_df)[[
        'user_id', 'session_id', 'item_id', 'index']]
    validation_test_df['dummy_step'] = np.arange(len(validation_test_df))
    # do the join
    print('join with the features')
    print(f'train_shape: {train_df.shape}\n vali_test_shape: {validation_test_df.shape}')
    context_features_id = []
    for f in features_array:
        if type(f) == tuple:
            feature = f[0](mode=mode, cluster='no_cluster').read_feature(one_hot=f[1])
        else:
            feature = f(mode=mode, cluster='no_cluster').read_feature(one_hot=True)
        print(f'columns of the feature:\n {feature.columns}')
        print(f'NaN values are: {feature.isnull().values.sum()}')
        # if there are any, fill them with -1
        feature.fillna(-1, inplace=True)
        # check whether it is a feature of the impression
        if 'item_id' not in feature.columns:
            for i in range(train_df.shape[1] - 6 + 1,
                           train_df.shape[1] - 6 + 1 + feature.shape[1] - 2, 1):
                context_features_id.append(str(i))
            print(f'session features names:{context_features_id}')
        print(f'shape of feature: {feature.shape}')
        print(f'len of feature:{len(feature)}')
        train_df = train_df.merge(feature)
        validation_test_df = validation_test_df.merge(feature)
        print(f'train_shape: {train_df.shape}\n vali_shape: {validation_test_df.shape}')
    if len(stacking_scores_path) > 1:
        for path in stacking_scores_path:
            score = pd.read_csv(path)
            cols = [c for c in score.columns
                    if c in ['user_id', 'session_id', 'item_id'] or 'score' in c]
            score = score[cols]
            score = score.groupby(['user_id', 'session_id', 'item_id'],
                                  as_index=False).last()
            train_df = train_df.merge(score, on=['user_id', 'session_id', 'item_id'],
                                      how='left')
            validation_test_df = validation_test_df.merge(
                score, on=['user_id', 'session_id', 'item_id'], how='left')
            print(f'train_shape: {train_df.shape}\n vali_shape: {validation_test_df.shape}')
        train_df.fillna(0, inplace=True)
        validation_test_df.fillna(0, inplace=True)
    print('sorting by index and step...')
    # sort the dataframes
    train_df.sort_values(['index', 'dummy_step'], inplace=True)
    train_df.drop('dummy_step', axis=1, inplace=True)
    validation_test_df.sort_values(['index', 'dummy_step'], inplace=True)
    validation_test_df.drop('dummy_step', axis=1, inplace=True)
    print('after join')
    return train_df, validation_test_df, np.array(context_features_id)
def extract_feature(self):
    def get_pos(item, rec):
        # 1-based position of each item inside its impression list,
        # -1 when the item does not appear among the impressions
        res = np.empty(item.shape)
        for i in tqdm(range(len(item))):
            if str(item[i]) in rec[i]:
                res[i] = rec[i].index(str(item[i])) + 1
            else:
                res[i] = -1
        return res.astype(int)

    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # get ALL the clickout rows, including the last clickouts
    all_clk_rows = df[(df.reference.str.isnumeric() == True) &
                      (df.action_type == 'clickout item')][[
                          'user_id', 'session_id', 'reference', 'impressions']]
    all_clk_rows.impressions = all_clk_rows.impressions.str.split('|')
    # add the position
    pos_col = get_pos(all_clk_rows.reference.values,
                      all_clk_rows.impressions.values)
    all_clk_rows['position'] = pos_col
    all_clk_rows = all_clk_rows.drop('impressions', axis=1)
    # compute the popularity for each position bucket
    # pos 1
    df_clicks_1_pos = (all_clk_rows[all_clk_rows.position == 1]
                       .groupby('reference')
                       .size()
                       .reset_index(name='pop_1_pos'))
    df_clicks_1_pos.reference = df_clicks_1_pos.reference.astype(int)
    df_clicks_1_pos = df_clicks_1_pos.rename(columns={'reference': 'item_id'})
    # pos 2 to 5
    df_clicks_2to5_pos = (all_clk_rows[(all_clk_rows.position > 1) &
                                       (all_clk_rows.position <= 5)]
                          .groupby('reference')
                          .size()
                          .reset_index(name='pop_2to5_pos'))
    df_clicks_2to5_pos.reference = df_clicks_2to5_pos.reference.astype(int)
    df_clicks_2to5_pos = df_clicks_2to5_pos.rename(columns={'reference': 'item_id'})
    # pos 6 to 10
    df_clicks_6to10_pos = (all_clk_rows[(all_clk_rows.position > 5) &
                                        (all_clk_rows.position <= 10)]
                           .groupby('reference')
                           .size()
                           .reset_index(name='pop_6to10_pos'))
    df_clicks_6to10_pos.reference = df_clicks_6to10_pos.reference.astype(int)
    df_clicks_6to10_pos = df_clicks_6to10_pos.rename(columns={'reference': 'item_id'})
    # pos 11 to 15
    df_clicks_11to15_pos = (all_clk_rows[(all_clk_rows.position > 10) &
                                         (all_clk_rows.position <= 15)]
                            .groupby('reference')
                            .size()
                            .reset_index(name='pop_11to15_pos'))
    df_clicks_11to15_pos.reference = df_clicks_11to15_pos.reference.astype(int)
    df_clicks_11to15_pos = df_clicks_11to15_pos.rename(columns={'reference': 'item_id'})
    # pos 16 to 25
    df_clicks_16to25_pos = (all_clk_rows[(all_clk_rows.position > 15) &
                                         (all_clk_rows.position <= 25)]
                            .groupby('reference')
                            .size()
                            .reset_index(name='pop_16to25_pos'))
    df_clicks_16to25_pos.reference = df_clicks_16to25_pos.reference.astype(int)
    df_clicks_16to25_pos = df_clicks_16to25_pos.rename(columns={'reference': 'item_id'})
    # now merge with the last clickouts, expanded
    last_clickout_indices = find(df)
    last_clk_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'reference', 'impressions']]
    last_clk_rows['imp_list'] = last_clk_rows.impressions.str.split('|')
    clk_expanded = expand_impressions(last_clk_rows)
    clk_expanded = clk_expanded.drop('index', axis=1)
    # add the position
    pos_col = get_pos(clk_expanded.item_id.values, clk_expanded.imp_list.values)
    clk_expanded['position'] = pos_col
    clk_expanded = clk_expanded.drop('imp_list', axis=1)
    # merge
    merged = pd.merge(clk_expanded, df_clicks_1_pos, how='left', on='item_id').fillna(0)
    merged = pd.merge(merged, df_clicks_2to5_pos, how='left', on='item_id').fillna(0)
    merged = pd.merge(merged, df_clicks_6to10_pos, how='left', on='item_id').fillna(0)
    merged = pd.merge(merged, df_clicks_11to15_pos, how='left', on='item_id').fillna(0)
    merged = pd.merge(merged, df_clicks_16to25_pos, how='left', on='item_id').fillna(0)
    # add the column of popularity per position bucket
    new_col = []
    for t in tqdm(zip(merged.position, merged.pop_1_pos, merged.pop_2to5_pos,
                      merged.pop_6to10_pos, merged.pop_11to15_pos,
                      merged.pop_16to25_pos)):
        if t[0] == 1:
            new_col.append(t[1])
        elif 1 < t[0] <= 5:
            new_col.append(t[2])
        elif 5 < t[0] <= 10:
            new_col.append(t[3])
        elif 10 < t[0] <= 15:
            new_col.append(t[4])
        elif 15 < t[0] <= 25:
            new_col.append(t[5])
    merged['pop_per_pos'] = new_col
    merged = merged.drop(['pop_1_pos', 'pop_2to5_pos', 'pop_6to10_pos',
                          'pop_11to15_pos', 'pop_16to25_pos'], axis=1)
    # now compute how many times each item is impressed in each position bucket
    all_clks = df[(df.reference.str.isnumeric() == True) &
                  (df.action_type == 'clickout item')][[
                      'user_id', 'session_id', 'impressions']]
    all_clks['imp_list'] = all_clks.impressions.str.split('|')
    all_clk_rows_expanded = expand_impressions(all_clks)
    pos_col = get_pos(all_clk_rows_expanded.item_id.values,
                      all_clk_rows_expanded.imp_list.values)
    all_clk_rows_expanded['position'] = pos_col
    all_clk_rows_expanded = all_clk_rows_expanded[['user_id', 'session_id',
                                                   'item_id', 'position']]
    # first pos
    df_impressions_1 = (all_clk_rows_expanded[all_clk_rows_expanded.position == 1]
                        .groupby(['item_id'])
                        .size()
                        .reset_index(name='n_times_in_position_1'))
    df_impressions_1.item_id = df_impressions_1.item_id.astype(int)
    # pos 2 to 5
    df_impressions_2to5 = (all_clk_rows_expanded[
        (all_clk_rows_expanded.position > 1) &
        (all_clk_rows_expanded.position <= 5)]
        .groupby(['item_id'])
        .size()
        .reset_index(name='n_times_in_position_2to5'))
    df_impressions_2to5.item_id = df_impressions_2to5.item_id.astype(int)
    # pos 6 to 10
    df_impressions_6to10 = (all_clk_rows_expanded[
        (all_clk_rows_expanded.position > 5) &
        (all_clk_rows_expanded.position <= 10)]
        .groupby(['item_id'])
        .size()
        .reset_index(name='n_times_in_position_6to10'))
    df_impressions_6to10.item_id = df_impressions_6to10.item_id.astype(int)
    # pos 11 to 15
    df_impressions_11to15 = (all_clk_rows_expanded[
        (all_clk_rows_expanded.position > 10) &
        (all_clk_rows_expanded.position <= 15)]
        .groupby(['item_id'])
        .size()
        .reset_index(name='n_times_in_position_11to15'))
    df_impressions_11to15.item_id = df_impressions_11to15.item_id.astype(int)
    # pos 16 to 25
    df_impressions_16to25 = (all_clk_rows_expanded[
        (all_clk_rows_expanded.position > 15) &
        (all_clk_rows_expanded.position <= 25)]
        .groupby(['item_id'])
        .size()
        .reset_index(name='n_times_in_position_16to25'))
    df_impressions_16to25.item_id = df_impressions_16to25.item_id.astype(int)
    # merge with the expanded last clickouts
    merged = pd.merge(merged, df_impressions_1, how='left', on='item_id').fillna(0)
    merged = pd.merge(merged, df_impressions_2to5, how='left', on='item_id').fillna(0)
    merged = pd.merge(merged, df_impressions_6to10, how='left', on='item_id').fillna(0)
    merged = pd.merge(merged, df_impressions_11to15, how='left', on='item_id').fillna(0)
    merged = pd.merge(merged, df_impressions_16to25, how='left', on='item_id').fillna(0)
    # add the new column
    new_col = []
    for t in tqdm(zip(merged.position, merged.n_times_in_position_1,
                      merged.n_times_in_position_2to5,
                      merged.n_times_in_position_6to10,
                      merged.n_times_in_position_11to15,
                      merged.n_times_in_position_16to25)):
        if t[0] == 1:
            new_col.append(t[1])
        elif 1 < t[0] <= 5:
            new_col.append(t[2])
        elif 5 < t[0] <= 10:
            new_col.append(t[3])
        elif 10 < t[0] <= 15:
            new_col.append(t[4])
        elif 15 < t[0] <= 25:
            new_col.append(t[5])
    merged['n_times_impr'] = new_col
    merged = merged.drop(['n_times_in_position_1', 'n_times_in_position_2to5',
                          'n_times_in_position_6to10',
                          'n_times_in_position_11to15',
                          'n_times_in_position_16to25'], axis=1)
    # now compute the feature, remembering to:
    # - subtract 1 from the popularity of the clicked items
    # - subtract 1 from each impression count (because the number of times
    #   is computed on the whole dataset)
    new_col = []
    merged.reference = merged.reference.astype(int)
    merged.item_id = merged.item_id.astype(int)
    for t in tqdm(zip(merged.reference, merged.item_id,
                      merged.pop_per_pos, merged.n_times_impr)):
        if t[3] > 1:
            if t[0] == t[1]:
                new_col.append(((t[2] - 1) * 100) / (t[3] - 1))
            else:
                new_col.append((t[2] * 100) / (t[3] - 1))
        else:
            new_col.append(0)
    merged['perc_click_per_pos'] = new_col
    return merged[['user_id', 'session_id', 'item_id', 'perc_click_per_pos']]
def extract_feature(self):
    """
    Computes all the user features. It must distinguish between past sessions
    and future sessions, and compute the same features for each. This helps
    understand the moves of the user through the impressions.
    """
    train_df = data.train_df(mode=self.mode, cluster=self.cluster)
    test_df = data.test_df(mode=self.mode, cluster=self.cluster)
    test_df = test_df.fillna(0)
    df = pd.concat([train_df, test_df])
    print('Adjusting broken sessions (session_bastarde)...')
    df_to_correct = df[df.session_id.isin(session_bastarde)]
    df = df[~df.session_id.isin(session_bastarde)]
    for i in tqdm(df_to_correct.index):
        if df_to_correct.at[i, 'step'] > dict_sess_bast[df_to_correct.at[i, 'session_id']]:
            df_to_correct = df_to_correct.drop(i, axis=0)
    df = pd.concat([df, df_to_correct])
    df.sort_values(by=['user_id', 'session_id', 'timestamp'], inplace=True)
    df = df.reset_index(drop=True)
    i = 0
    idxs_click = find_last_clickout(df)
    users = df.user_id.values
    pbar = tqdm(total=len(idxs_click))
    # a copy is needed when iterating later
    idx_to_compute = idxs_click.copy()
    while i < idxs_click[-1]:
        initial_i = i
        user = df.at[i, 'user_id']
        # get all the user's session indices
        for u in users[i:]:
            if u != user:
                break
            i += 1
        # now start creating the features for every session
        sessions_user_idxs = []
        while len(idx_to_compute) > 0 and idx_to_compute[0] < i:
            sessions_user_idxs += [idx_to_compute.pop(0)]
        sessions_count = len(sessions_user_idxs)
        if sessions_count > 1:
            # start computing the features: keep the last clickouts over
            # which to iterate for getting the features
            user_sessions_df = df.iloc[sessions_user_idxs, :]
            df_only_user = df.iloc[initial_i:i, :]
            df_only_user = df_only_user.reset_index(drop=True)
            # iterate over the clickouts to predict, computing the features
            for idx, row in user_sessions_df.iterrows():
                curr_session = row.session_id
                # get a session, get the impressions
                impressions = list(map(str, row.impressions.split('|')))
                df_samecity = df_only_user  # [df_only_user.city == row.city]
                idx = list(df_samecity.session_id.values).index(curr_session)
                # get the indices of the df where the considered session
                # starts and ends
                idx_session_initial = idx
                idx_session_final = len(df_samecity) - \
                    list(df_samecity.session_id.values)[::-1].index(curr_session)
                if df_samecity.index.values[0] < idx_session_initial:
                    temp_df = df_samecity.iloc[0:idx_session_initial, :]
                    self.compute_past_sessions_feat(
                        temp_df[temp_df.city == row.city], impressions,
                        int(df_only_user.at[idx_session_initial, 'timestamp']))
                else:
                    self.add_empty_features(impressions, 'past')
                tm_clk = int(row['timestamp'])
                df_samecity = df_samecity.iloc[idx_session_final:len(df_samecity), :]
                df_samecity = df_samecity[df_samecity.city == row.city]
                if len(df_samecity) > 0:
                    self.compute_future_sessions_feat(df_samecity, impressions, tm_clk)
                else:
                    self.add_empty_features(impressions, 'future')
        else:
            # return all the features as -1, if at least one session exists
            if sessions_count == 1:
                # case of one session for one user
                clk_idx = sessions_user_idxs[0]
                impressions = df.at[clk_idx, 'impressions'].split('|')
                self.add_empty_features(impressions, 'both')
        pbar.update(sessions_count)
    pbar.close()
    df = expand_impressions(df.iloc[idxs_click, :][['user_id', 'session_id',
                                                    'reference', 'impressions']])
    for key in self.features.keys():
        print(key, len(self.features[key]))
        df[key] = self.features[key]
    print('Correcting feature: add duplicate sessions with underscore...')
    label_feat = pd.read_csv(
        'dataset/preprocessed/{}/{}/feature/impression_label/features.csv'.format(
            self.cluster, self.mode))
    df = self.adjust_features(df, label_feat)
    df.drop(['index', 'reference'], axis=1, inplace=True)
    return df
def extract_feature(self):
    ######### READING DATA
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])

    ######## SOME PREPROCESSING + SECONDARY DATA STRUCTURES TO SPEED UP PERFORMANCE
    clickout_indices = find(df)
    clickout_df = df.loc[clickout_indices]
    clickout_sessions = list(clickout_df.session_id)
    session_to_impressions = dict()
    user_to_sessions = dict()
    session_to_timestamp = dict()
    for t in tqdm(zip(clickout_df.session_id, clickout_df.impressions,
                      clickout_df.user_id, clickout_df.timestamp)):
        if t[2] not in user_to_sessions:
            user_to_sessions[t[2]] = list()
        user_to_sessions[t[2]] += [t[0]]
        session_to_impressions[t[0]] = list(map(int, t[1].split("|")))
        session_to_timestamp[t[0]] = t[3]

    # Clean df from the clickout sessions, from non-numeric references
    # (e.g. 'Castellammare di Stabia, NA') and from users that are not in the test set
    clean_df = df[(~df.session_id.isin(clickout_sessions))
                  & (df.reference.apply(lambda x: type(x) == str and x.isdigit()))
                  & (df.user_id.isin(user_to_sessions.keys()))].copy()
    clean_df["reference"] = pd.to_numeric(clean_df["reference"])

    grouped = clean_df.groupby("user_id")
    session_to_df = dict()
    for name, group in tqdm(grouped, desc="Scanning users and creating enriched session dataframes"):
        group = group.sort_values("timestamp")
        sessions = user_to_sessions[name]
        # Attach to each session a small df containing only the rows
        # useful for the computation of the feature
        for s in sessions:
            imps = session_to_impressions[s]
            temp = group[group.reference.isin(imps)]
            session_to_df[s] = temp
    print(len(session_to_df))

    #### FEATURE KERNEL
    # Action <-> index of the feature array
    # index 0 -> time_from_last_interaction (past)
    time_last_interaction_past = 0
    action_dict_past = {"search for item": 1,
                        "interaction item image": 2,
                        "interaction item info": 3,
                        "interaction item deals": 4,
                        "interaction item rating": 5,
                        "clickout item": 6}
    # index 7 -> time_to_first_interaction (future)
    time_first_interaction_future = 7
    action_dict_future = {"search for item": 8,
                          "interaction item image": 9,
                          "interaction item info": 10,
                          "interaction item deals": 11,
                          "interaction item rating": 12,
                          "clickout item": 13}
    session_to_feature = dict()
    for k, v in tqdm(session_to_timestamp.items(), desc="Scanning sessions to generate feature"):
        # if the session has no enriched df, we don't have any information
        # from past or future, from that user, for those impressions
        if k in session_to_df:
            temp = session_to_df[k]
            past = temp[temp.timestamp <= v].sort_values("timestamp")
            future = temp[temp.timestamp > v].sort_values("timestamp", ascending=False)
            imps = session_to_impressions[k]
            imp_to_actions = dict()
            for i in imps:
                # + 2 due to "time_from_last_interaction", both past and future
                imp_to_actions[i] = np.zeros(len(action_dict_past) + len(action_dict_future) + 2)
                imp_to_actions[i][time_last_interaction_past] = -1
                imp_to_actions[i][time_first_interaction_future] = -1
            for t in zip(past.reference, past.action_type, past.timestamp):
                imp = t[0]
                action_index = action_dict_past[t[1]]
                imp_to_actions[imp][time_last_interaction_past] = v - t[2]
                imp_to_actions[imp][action_index] += 1
            for t in zip(future.reference, future.action_type, future.timestamp):
                imp = t[0]
                action_index = action_dict_future[t[1]]
                imp_to_actions[imp][time_first_interaction_future] = t[2] - v
                imp_to_actions[imp][action_index] += 1
            session_to_feature[k] = imp_to_actions

    #### UNROLLING DICT TO DATAFRAME
    lines = list()
    for k, v in tqdm(session_to_feature.items(), desc="Dicts to dataframe"):
        for imp, feature in v.items():
            lines.append([k] + [imp] + list(feature))
    new_df = pd.DataFrame(lines, columns=["session_id", "item_id",
                                          "time_last_past_interaction",
                                          "search_for_item_past", "interaction_item_image_past",
                                          "interaction_item_info_past", "interaction_item_deals_past",
                                          "interaction_item_rating_past", "clickout_item_past",
                                          "time_first_future_interaction",
                                          "search_for_item_future", "interaction_item_image_future",
                                          "interaction_item_info_future", "interaction_item_deals_future",
                                          "interaction_item_rating_future", "clickout_item_future"])

    #### MERGING WITH MAIN DATAFRAME
    clickout_df = clickout_df[['user_id', 'session_id', 'impressions']]
    clk_expanded = expand_impressions(clickout_df)
    print("Temp feature (only non-null rows) shape: {}".format(new_df.shape))
    print("Expanded dataframe shape: {}".format(clk_expanded.shape))
    feature = pd.merge(clk_expanded, new_df, how="left")
    count_columns = ["search_for_item_past", "interaction_item_image_past",
                     "interaction_item_info_past", "interaction_item_deals_past",
                     "interaction_item_rating_past", "clickout_item_past",
                     "search_for_item_future", "interaction_item_image_future",
                     "interaction_item_info_future", "interaction_item_deals_future",
                     "interaction_item_rating_future", "clickout_item_future"]
    feature[count_columns] = feature[count_columns].fillna(value=0)
    # replace() is not in-place: the result must be assigned back,
    # otherwise the -1 sentinels would survive in the time columns
    feature = feature.replace(-1, np.nan)
    print("Final feature shape: {}".format(feature.shape))
    return feature
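# All of these extractors call expand_impressions, which turns one clickout row
# with a pipe-separated 'impressions' string into one row per impressed item.
# This is a minimal sketch of the contract assumed by the callers (the original
# row index survives as an 'index' column, which is why several extractors drop
# it afterwards), not the repository's implementation.
def expand_impressions_sketch(df):
    out = df.copy()
    out['item_id'] = out['impressions'].str.split('|')
    out = out.explode('item_id').reset_index()  # old index becomes the 'index' column
    out['item_id'] = out['item_id'].astype(int)
    return out.drop('impressions', axis=1)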
def merge_features_lgb(mode, cluster, features_array):
    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df

    # retrieve the indices of the last clickouts
    print('find_last_click_idxs')
    last_click_idxs = find_last_clickout_indices(full_df)

    # filter the full df on the found indices, keeping only the last-clickout rows
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()

    print('retrieve vali_idxs')
    # if the mode is 'full' there is no validation; if the mode is 'small' or
    # 'local' the validation is performed on the target indices
    vali_test_idxs = data.target_indices(mode, cluster)

    # construct the validation/test and train base dataframes
    print('construct test and vali df')
    validation_test_df = click_df.loc[vali_test_idxs]
    all_idxs = click_df.index.values
    # the train indices are the last clickouts that are not target indices
    print('construct train df')
    train_idxs = np.setdiff1d(all_idxs, vali_test_idxs, assume_unique=True)
    train_df = click_df.loc[train_idxs]

    # expand the impressions as rows
    print('expand the impressions')
    train_df = expand_impressions(train_df)[['user_id', 'session_id', 'item_id', 'index']]
    train_df['dummy_step'] = np.arange(len(train_df))
    validation_test_df = expand_impressions(validation_test_df)[['user_id', 'session_id', 'item_id', 'index']]
    validation_test_df['dummy_step'] = np.arange(len(validation_test_df))

    # do the joins
    print('join with the features')
    print(f'train_shape: {train_df.shape}\n vali_test_shape: {validation_test_df.shape}')
    time_joins = 0
    for f in features_array:
        _feature = f(mode=mode, cluster='no_cluster')
        feature = _feature.read_feature(one_hot=False)
        print(f'shape of feature: {feature.shape}\n')
        print(f'len of feature: {len(feature)}\n')
        start = time()
        train_df = train_df.merge(feature)
        validation_test_df = validation_test_df.merge(feature)
        print(f'time to do the join: {time() - start}')
        time_joins += time() - start
        print(f'train_shape: {train_df.shape}\n vali_shape: {validation_test_df.shape}')
    print(f'total time to do joins: {time_joins}')

    print('sorting by index and step...')
    # sort the dataframes to restore the original clickout/impression order
    train_df.sort_values(['index', 'dummy_step'], inplace=True)
    train_df.drop('dummy_step', axis=1, inplace=True)
    validation_test_df.sort_values(['index', 'dummy_step'], inplace=True)
    validation_test_df.drop('dummy_step', axis=1, inplace=True)
    print('after join')
    return train_df, validation_test_df
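# Hypothetical call site for merge_features_lgb, for reference only:
# SomeFeature is a placeholder name for any of the feature classes in this
# repository that accept mode/cluster kwargs and expose read_feature(one_hot=...).
features_array = [SomeFeature]  # placeholder, not a real class name
train, vali_test = merge_features_lgb(mode='local', cluster='no_cluster',
                                      features_array=features_array)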
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])

    platforms = df['platform'].unique().tolist()
    df_plat_feature = pd.DataFrame(columns=['platform', 'properties_array'])
    df_plat_feature['platform'] = platforms

    # keep only the clickouts that are NOT the last of their session
    last_indices = find(df)
    df_non_last_clk = df.drop(last_indices)
    df_clickout = df_non_last_clk[df_non_last_clk['action_type'] == 'clickout item'][['reference', 'platform']]
    df_clickout = df_clickout.rename(columns={'reference': 'item_id'})
    df_clickout = df_clickout.dropna()  # remove NaNs
    df_clickout.item_id = df_clickout.item_id.astype(int)

    o = ImpressionFeature(mode=self.mode)
    df_accomodations = o.read_feature(True)
    df_accomodations = df_accomodations.drop(['properties1 Star', 'properties2 Star',
                                              'properties3 Star', 'properties4 Star',
                                              'properties5 Star'], axis=1)
    df_clicks_properties = pd.merge(df_clickout, df_accomodations, how='left', on=['item_id'])

    array = df_accomodations.drop(['item_id'], axis=1).values
    df_item_features = pd.DataFrame(columns=['item_id', 'features_array'])
    df_item_features['item_id'] = df_accomodations['item_id'].values
    df_item_features['features_array'] = list(array)

    new_col = []
    for p in tqdm(platforms):
        df_clicks_properties_per_plat = df_clicks_properties[df_clicks_properties.platform == p]
        df_clicks_properties_per_plat = df_clicks_properties_per_plat.drop(['item_id', 'platform'], axis=1)
        df_sum = df_clicks_properties_per_plat.sum()
        if df_clicks_properties_per_plat.shape[0] != 0:  # the platform appears at least once
            plat_feature = df_sum.values
        else:
            plat_feature = np.asarray([0] * df_clicks_properties_per_plat.shape[1])
        new_col.append(plat_feature)
    df_plat_feature['properties_array'] = new_col

    global_sum = df_clicks_properties.drop(['item_id', 'platform'], axis=1)
    global_sum = global_sum.sum().tolist()
    df_plat_feature['global_properties'] = df_plat_feature.apply(lambda x: global_sum, axis=1)
    properties_globally_normalized = []
    for t in tqdm(zip(df_plat_feature.properties_array, df_plat_feature.global_properties)):
        properties_globally_normalized.append(np.asarray([x / y for x, y in zip(t[0], t[1])]))
    df_plat_feature['properties_globally_normalized'] = properties_globally_normalized
    df_plat_feature = df_plat_feature.drop(['properties_array', 'global_properties'], axis=1)

    # now take the usual dataframe with the last clickouts
    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'platform', 'action_type', 'impressions']]
    # filter on the subset's own column: masking with a Series built from the
    # full df would not align with the reduced index
    clickout_rows = clickout_rows[clickout_rows.action_type == 'clickout item']
    clk_expanded = expand_impressions(clickout_rows)
    clk_expanded = clk_expanded.drop(['index', 'action_type'], axis=1)
    clk_expanded_wt_plat_feat = pd.merge(clk_expanded, df_plat_feature, how='left', on=['platform']).astype(object)
    clk_expanded_wt_plat_feat.item_id = clk_expanded_wt_plat_feat.item_id.astype(int)
    final_feature = pd.merge(clk_expanded_wt_plat_feat, df_item_features, how='left', on=['item_id'])

    new_col = []
    shrink = 0  # TRY ME
    for t in tqdm(zip(final_feature.properties_globally_normalized, final_feature.features_array)):
        new_col.append(cosine_similarity(t[0].astype(np.double), t[1].astype(np.double), shrink))
    new_feature = final_feature[['user_id', 'session_id', 'item_id']].copy()
    new_feature['platform_similarity_normalized'] = new_col
    return new_feature
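# The cosine_similarity(u, v, shrink) helper used above is defined elsewhere in
# the repo. A plausible sketch, assuming the common shrunk-cosine formulation
# from recommender systems (shrink=0 recovers plain cosine similarity; a larger
# shrink damps scores computed from sparse property vectors):
def cosine_similarity_sketch(u, v, shrink=0):
    den = np.linalg.norm(u) * np.linalg.norm(v) + shrink
    return float(np.dot(u, v) / den) if den != 0 else 0.0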