def save_r_hat(self):
    base_save_path = f'dataset/preprocessed/{self.cluster}/{self.mode}/matrices/{self.session}/r_hat_matrices'
    cf.check_folder(base_save_path)
    print('saving r_hat...')
    sps.save_npz(f'{base_save_path}/{self.urm_name}_{self.name}', self.get_r_hat())
    print('r_hat saved successfully!')

def save_similarity_matrix(self):
    base_save_path = f'dataset/preprocessed/{self.cluster}/{self.mode}/matrices/{self.session}/similarities_matrices'
    cf.check_folder(base_save_path)
    print('saving sim_matrix...')
    sps.save_npz(f'{base_save_path}/{self.urm_name}_{self.name}', self.get_sim_matrix())
    print('sim_matrix saved successfully!')

def create_feature(self):
    # load dataset and indices
    train, train_indices = self.dataset.load_Xtrain(return_indices=True)
    test, test_indices = self.dataset.load_Xtest()

    # make predictions
    train_test = np.concatenate([train, test])
    del train
    del test
    predictions = self.model.predict(train_test).flatten()

    # build feature df
    concat_indices = np.concatenate([train_indices, test_indices])
    del train_indices
    del test_indices
    users_sessions = data.full_df().loc[concat_indices]
    feature_df = pd.DataFrame(
        {
            'user_id': users_sessions['user_id'],
            'session_id': users_sessions['session_id'],
            'rnn_binary_preds': predictions
        },
        index=concat_indices)

    path = 'dataset/preprocessed/no_cluster/{}/feature/rnn_binary_preds/features.csv'.format(self.mode)
    check_folder(path)
    feature_df.to_csv(path)
    return feature_df

def _save_dataset(base_path, mode, df):
    assert mode in ['train', 'vali'], 'the mode has to be train or vali'
    print('reducing memory usage...')
    df = reduce_mem_usage(df)
    check_folder(base_path)

    x = df.drop(['index', 'user_id', 'session_id', 'item_id', 'label'], axis=1)
    x.to_hdf(f'{base_path}/x_{mode}.hdf', key='df', index=False, format='table')
    print(f'x_{mode} saved at: {base_path}/x_{mode}.hdf')

    y = df['label'].values
    np.save(f'{base_path}/y_{mode}', y)
    print(f'y_{mode} saved at: {base_path}/y_{mode}.npy')

    groups = _create_groups(df)
    np.save(f'{base_path}/groups_{mode}', groups)
    print(f'groups_{mode} saved at: {base_path}/groups_{mode}.npy')

    user_session_item = df[['user_id', 'session_id', 'item_id']]
    user_session_item.to_csv(f'{base_path}/user_session_item_{mode}.csv', index=False)
    print(f'user_session_item_{mode} saved at: {base_path}/user_session_item_{mode}.csv')

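# _save_dataset above depends on a _create_groups helper. A minimal sketch of what
# it is assumed to compute (the per-(user_id, session_id) group sizes consumed by
# the ranker), mirroring the nested version inside create_lightGBM_dataset below:
def _create_groups(df):
    # size of each (user_id, session_id) group, in order of appearance
    return df.groupby(['user_id', 'session_id'], sort=False).size().values
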
def __init__(self, type, mode, cluster, name):
    assert type in ['user', 'session']
    assert mode in ['small', 'local', 'full']
    self.save_path = f'dataset/preprocessed/{cluster}/{mode}/matrices/{type}'
    cf.check_folder(self.save_path)
    # per-action-type scores used to build the matrix; the non-numeric values are
    # directives ('reset' restarts the score, None means the action is ignored),
    # and the last two keys are configuration flags ('tw' = time-weighting scheme,
    # 'score_update_rule' = update policy)
    self.score_dict = {
        'clickout item': 3,
        'interaction item rating': 3,
        'interaction item info': 1,
        'interaction item image': 3,
        'interaction item deals': 1,
        'search for item': 5,
        'search for destination': 'reset',
        'change of sort order': None,
        'filter selection': None,
        'search for poi': None,
        'tw': 'lin',
        'score_update_rule': 'substitute'
    }
    self.name = name
    self.type = type
    self.mode = mode
    self.cluster = cluster
    self.accomodations_id = data.accomodations_ids()
    self.train_df = None
    self.test_df = None

def save_folds(df, user_session_df, train_index, test_index, count, mode):
    u_s_train = list(user_session_df.loc[train_index]['user_session'].values)
    u_s_test = list(user_session_df.loc[test_index]['user_session'].values)

    path = 'dataset/preprocessed/{}/{}'.format('fold_' + str(count), mode)
    check_folder(path)

    train = df[df['user_session'].isin(u_s_train)]
    train = train.drop(['user_session'], axis=1)
    train.to_csv(os.path.join(path, 'train.csv'))
    train_indices = train.index.values
    np.save(os.path.join(path, 'train_indices'), train_indices)

    test = df[df['user_session'].isin(u_s_test)]
    target_indices = sorted(find(test))
    # mask the references of the target clickouts
    test.loc[target_indices, 'reference'] = np.nan
    test = test.drop(['user_session'], axis=1)
    test.to_csv(os.path.join(path, 'test.csv'))
    test_indices = test.index.values
    np.save(os.path.join(path, 'test_indices'), test_indices)
    np.save(os.path.join(path, 'target_indices'), target_indices)

    print(f'Train shape: {train.shape}, Test shape: {test.shape}')
    print(f'Number of last clickout indices: {len(target_indices)}')

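# A hypothetical driver for save_folds (not in the original): it assumes the
# folds come from a KFold split over the unique user/session pairs, and that
# the `find` helper used above returns the indices of the last clickouts.
from sklearn.model_selection import KFold

def save_all_folds(df, mode, k=5):
    user_session_df = df[['user_session']].drop_duplicates().reset_index(drop=True)
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    for count, (train_index, test_index) in enumerate(kf.split(user_session_df)):
        save_folds(df, user_session_df, train_index, test_index, count, mode)
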
def create_dataset(mode, cluster):
    # training
    features_array = [
        ActionsInvolvingImpressionSession, ImpressionLabel,
        ImpressionPriceInfoSession, TimingFromLastInteractionImpression,
        TimesUserInteractedWithImpression, ImpressionPositionSession,
        LastInteractionInvolvingImpression,
        TimesImpressionAppearedInClickoutsSession, MeanPriceClickout,
        SessionLength, TimeFromLastActionBeforeClk, FrenzyFactorSession,
        PricePositionInfoInteractedReferences, SessionDevice,
        SessionFilterActiveWhenClickout, SessionSortOrderWhenClickout,
        ImpressionFeature
    ]

    curr_dir = Path(__file__).absolute().parent
    data_dir = curr_dir.joinpath('..', 'dataset/preprocessed/{}/{}/lightGBM/'.format(cluster, mode))
    print(data_dir)
    check_folder(str(data_dir))

    train_df, test_df = merge_features(mode, cluster, features_array)

    if os.path.isfile(str(data_dir) + '/svmlight_train.txt'):
        print('Train file already exists')
    else:
        to_queries_dataset(train_df, path=str(data_dir) + '/svmlight_train.txt')

    if os.path.isfile(str(data_dir) + '/test.csv'):
        print('Test file already exists')
        #test_df.sort_values()
        to_queries_dataset(test_df, path=str(data_dir) + '/svmlight_test.txt')
    else:
        test_df.to_csv(str(data_dir) + '/test.csv', index=False)
        to_queries_dataset(test_df, path=str(data_dir) + '/svmlight_test.txt')

def create_dataset(mode, cluster):
    features_array = [ImpressionLabel, ImpressionPositionSession, ScoresRNN, ScoresXGB]
    train_df, test_df, train_idxs, _ = merge_features(mode, cluster, features_array, merge_kind='left')
    train_df = train_df.replace(-1, np.nan)
    test_df = test_df.replace(-1, np.nan)

    bp = 'dataset/preprocessed/{}/{}/stacking/'.format(cluster, mode)
    check_folder(bp)

    X_train = train_df.drop(['index', 'user_id', 'session_id', 'item_id', 'label'], axis=1)
    X_train = X_train.to_sparse(fill_value=0)
    X_train = X_train.astype(np.float64)
    X_train = X_train.to_coo().tocsr()
    save_npz(join(bp, 'X_train'), X_train)
    print('X_train saved')

    y_train = train_df[['label']]
    y_train.to_csv(join(bp, 'y_train.csv'))
    print('y_train saved')

    group = create_groups(train_df)
    print(len(group))
    np.save(join(bp, 'group_train'), group)
    print('train groups saved')

    np.save(join(bp, 'train_indices'), train_idxs)
    print('train data completed')

    X_test = test_df.drop(['index', 'user_id', 'session_id', 'item_id', 'label'], axis=1)
    # if mode == 'full':
    X_test = X_test.to_sparse(fill_value=0)
    X_test = X_test.astype(np.float64)
    X_test = X_test.to_coo().tocsr()
    save_npz(join(bp, 'X_test'), X_test)
    # else:
    #     X_test.to_csv(join(bp, 'X_test.csv'))
    print('X_test saved')

    y_test = test_df[['label']]
    y_test.to_csv(join(bp, 'y_test.csv'))
    print('y_test saved')

    group = create_groups(test_df)
    print(len(group))
    np.save(join(bp, 'group_test'), group)
    print('test groups saved')
    print('test data completed')

def save_r_hat(self):
    base_save_path = 'dataset/matrices/{}/r_hat_matrices'.format(self.mode)
    cf.check_folder(base_save_path)
    print('saving r_hat...')
    sps.save_npz('{}/{}'.format(base_save_path, self.name), self.get_r_hat())
    print('r_hat saved successfully!')

def merge_consecutive_equal_actions():
    tqdm.pandas()
    test = data.test_df('full')
    test_grouped_by_session_id = test.groupby('session_id')
    merged = test_grouped_by_session_id.progress_apply(_merge_consecutive_equal_actions)
    cf.check_folder('dataset/cleaned_csv')
    merged.to_csv('dataset/cleaned_csv/test.csv')

def create_lightGBM_dataset(mode, cluster, features_array, dataset_name):

    def _create_groups(df):
        """Return the length of each (user_id, session_id) group, in order of appearance."""
        df = df[['user_id', 'session_id']]
        group = df.groupby(['user_id', 'session_id'], sort=False).apply(lambda x: len(x)).values
        return group

    def _save_dataset(base_path, mode, df):
        assert mode in ['train', 'vali'], 'the mode has to be train or vali'
        print('reducing memory usage...')
        df = reduce_mem_usage(df)
        check_folder(base_path)

        x = df.drop(['index', 'user_id', 'session_id', 'item_id', 'label'], axis=1)
        x.to_hdf(f'{base_path}/x_{mode}.hdf', key='df', index=False, format='table')
        print(f'x_{mode} saved at: {base_path}/x_{mode}.hdf')

        y = df['label'].values
        np.save(f'{base_path}/y_{mode}', y)
        print(f'y_{mode} saved at: {base_path}/y_{mode}.npy')

        groups = _create_groups(df)
        np.save(f'{base_path}/groups_{mode}', groups)
        print(f'groups_{mode} saved at: {base_path}/groups_{mode}.npy')

        user_session_item = df[['user_id', 'session_id', 'item_id']]
        user_session_item.to_csv(f'{base_path}/user_session_item_{mode}.csv', index=False)
        print(f'user_session_item_{mode} saved at: {base_path}/user_session_item_{mode}.csv')

    # base save path
    _BASE_PATH = f'dataset/preprocessed/lightGBM/{cluster}/{mode}/{dataset_name}'

    # retrieve the TRAIN and VALIDATION/TEST data
    train_df, validation_df = merge_features_lgb(mode, cluster, features_array)

    print('saving features names...')
    check_folder(f'{_BASE_PATH}')
    with open(f'{_BASE_PATH}/Features.txt', 'w+') as text_file:
        text_file.write(str([str(fn) for fn in features_array]))

    Hera.send_message('SAVING TRAIN LIGHTGBM...')
    _save_dataset(_BASE_PATH, 'train', train_df)
    Hera.send_message('SAVING VALI LIGHTGBM...')
    _save_dataset(_BASE_PATH, 'vali', validation_df)
    Hera.send_message('PROCEDURE ENDED CORRECTLY')

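# Example invocation (a sketch: the feature classes are assumed importable from
# the repo's features package, and 'base_dataset' is a hypothetical dataset name):
if __name__ == '__main__':
    create_lightGBM_dataset(mode='local', cluster='no_cluster',
                            features_array=[ImpressionLabel, ImpressionPositionSession],
                            dataset_name='base_dataset')
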
def fit_predict(self, multithreading=True, save_folder='scores/'):
    if multithreading:
        self.scores = Parallel(backend='multiprocessing', n_jobs=-1, max_nbytes=None)(
            delayed(self._fit_model)(i) for i in range(5))
        print(len(self.scores))
    else:
        self.scores = [self._fit_model(i) for i in range(5)]
        print(len(self.scores))

    model = self.model_class(mode=self.mode, cluster='no_cluster', **self.init_params)
    model.fit()
    scores_test = model.get_scores_batch()
    self.scores.append(scores_test)
    self.scores = [item for sublist in self.scores for item in sublist]

    scores = pd.DataFrame(self.scores, columns=['index', 'item_recommendations', 'scores'])
    scores = scores.sort_values(by='index')
    print(scores)

    idx_scores = set(scores['index'].values)
    train_full = data.train_df(mode='full', cluster='no_cluster')
    test_full = data.test_df(mode='full', cluster='no_cluster')
    full = pd.concat([train_full, test_full])
    full = full[['user_id', 'session_id', 'action_type']]
    last_clk_full = full.loc[list(idx_scores)]

    # check that all rows are clickouts
    num_not_clk_row = last_clk_full[last_clk_full['action_type'] != 'clickout item'].shape[0]
    print(f'Number of non-clickout rows: {num_not_clk_row}')
    if num_not_clk_row != 0:
        print('Error: some indices are not clickouts')

    last_clk_full = last_clk_full.drop(['action_type'], axis=1)
    last_clk_full['index'] = last_clk_full.index
    merged = last_clk_full.merge(scores, on=['index'])

    model_name = model.name
    df = assign_score(merged, self.model_name)
    df = df.drop(['index'], axis=1)

    if save_folder is not None:
        check_folder(save_folder)
        filepath = os.path.join(save_folder, model_name + '.csv.gz')
        print('Saving scores to', filepath, end=' ', flush=True)
        df.to_csv(filepath, index=False, compression='gzip')
        print('Done!', flush=True)
    return df

def get_scores_batch(self, save=False):
    _ = self.recommend_batch()
    base_path = f'dataset/preprocessed/neural_network_dataset/{self.cluster}/{self.mode}/predictions/{self.dataset_name}.pickle'
    check_folder.check_folder(base_path)
    if save:
        with open(base_path, 'wb') as f:
            pickle.dump(self.scores_batch, f)
        print(f'saved at: {base_path}')
    else:
        return self.scores_batch

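# Counterpart sketch (not in the original class) for reading the pickled scores
# back; the path layout mirrors the one used by get_scores_batch above.
def load_scores_batch(cluster, mode, dataset_name):
    path = f'dataset/preprocessed/neural_network_dataset/{cluster}/{mode}/predictions/{dataset_name}.pickle'
    with open(path, 'rb') as f:
        return pickle.load(f)
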
def fit(self, epochs, early_stopping_patience=10, early_stopping_on='val_loss', mode='min'):
    #weights = self.class_weights if self.weight_samples else []
    callbacks = [TelegramBotKerasCallback(log_every_epochs=1, account='parro')]
    # early stopping callback
    if isinstance(early_stopping_patience, int):
        assert early_stopping_patience > 0
        callbacks.append(EarlyStopping(monitor=early_stopping_on,
                                       patience=early_stopping_patience,
                                       mode=mode,
                                       verbose=1,
                                       restore_best_weights=True))
    # tensorboard callback
    if isinstance(self.tensorboard_path, str):
        check_folder(self.tensorboard_path)
        callbacks.append(TensorBoard(log_dir=self.tensorboard_path,
                                     histogram_freq=0,
                                     write_graph=False))

    if self.use_generator:
        self.train_gen, self.val_gen = self.dataset.get_train_validation_generator(
            self.validation_split)  #, weights)
        assert self.train_gen.__getitem__(0)[0].shape[1:] == self.input_shape
        self.history = self.model.fit_generator(self.train_gen,
                                                epochs=epochs,
                                                validation_data=self.val_gen,
                                                callbacks=callbacks,
                                                max_queue_size=3,
                                                class_weight=self.class_weights)
    else:
        self.X, self.Y = self.dataset.load_Xtrain(), self.dataset.load_Ytrain()
        self.X, self.Y = shuffle(self.X, self.Y)
        self.history = self.model.fit(self.X, self.Y,
                                      epochs=epochs,
                                      batch_size=self.batch_size,
                                      validation_split=self.validation_split,
                                      callbacks=callbacks,
                                      class_weight=self.class_weights,
                                      sample_weight=self.sample_weights)

def save(self, folderpath, suffix=''):
    """ Save the full state of the model, including:
    - the architecture
    - the weights
    - the training configuration (loss, optimizer)
    - the state of the optimizer (allowing to resume the training)
    See: https://keras.io/getting-started/faq/#savingloading-whole-models-architecture-weights-optimizer-state
    """
    check_folder(folderpath)
    path = os.path.join(folderpath, '{}{}.h5'.format(self.name, suffix))
    self.model.save(path)

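# The scores() helper further below calls model.load(checkpoint_path); a minimal
# sketch of what such a counterpart method might look like, assuming the standard
# Keras load_model API (the repo's actual implementation may differ):
def load(self, path):
    from keras.models import load_model
    # restores architecture, weights and optimizer state saved by save() above
    self.model = load_model(path)
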
def create_dataset(mode, cluster, features_array, dataset_name, stacking_scores_path):
    _SAVE_BASE_PATH = f'dataset/preprocessed/tf_ranking/{cluster}/{mode}/{dataset_name}'
    cf.check_folder(_SAVE_BASE_PATH)
    train_df, vali_test_df, context_features_id = merge_features_tf(mode, cluster, features_array, stacking_scores_path)

    # save context features id
    print(f'saving context feature id to: {_SAVE_BASE_PATH}/context_features_id.npy')
    np.save(f'{_SAVE_BASE_PATH}/context_features_id', context_features_id)

    parse_dataset(train_df, _SAVE_BASE_PATH, 'train')
    parse_dataset(vali_test_df, _SAVE_BASE_PATH, 'test')
    Hera.send_message('tf ranking dataset saved!')
    print('PROCEDURE ENDED CORRECTLY')

def fit_predict(self, dataset, fit_params={}, multithreading=True, n_jobs=-1, save_folder='scores/') -> pd.DataFrame:
    """ Fit and compute the scores for each fold.
    dataset (object): inheriting from utils.dataset.DatasetBase
    fit_params (dict): params to fit the model
    n_jobs (int): maximum number of concurrently running jobs, -1 to use all CPUs
    save_folder (str): folder where to save the scores
    """
    assert hasattr(dataset, 'load_Xtrain') and hasattr(dataset, 'load_Ytrain') and hasattr(dataset, 'load_Xtest'), \
        'Dataset object must implement methods: load_Xtrain, load_Ytrain, load_Xtest'
    X_train, Y_train, X_test, group_train = dataset.load_Xtrain(), dataset.load_Ytrain(), \
        dataset.load_Xtest(), dataset.load_group_train()

    # kfold
    kf = KFold(n_splits=self.k)

    # fit in each fold
    if multithreading:
        self.scores = Parallel(backend='multiprocessing', n_jobs=n_jobs)(
            delayed(self._fit_model)(X_train, Y_train, group_train, train_indices, test_indices, fit_params, idx)
            for idx, (train_indices, test_indices) in enumerate(kf.split(X_train, group_train)))
    else:
        self.scores = [
            self._fit_model(X_train, Y_train, group_train, train_indices, test_indices, fit_params, idx)
            for idx, (train_indices, test_indices) in enumerate(kf.split(X_train, group_train))
        ]

    # fit on the whole train set and get scores for test
    print('fit whole train')
    model = self.model_class(**self.init_params)
    model.fit_cv(X_train, Y_train, group_train, list(range(X_train.shape[0])), [], **fit_params)
    print('end fit whole train')
    scores_test = model.get_scores_cv(X_test, None, list(range(X_test.shape[0])))
    self.scores.append(scores_test)
    self.scores = pd.concat(self.scores)

    # save scores
    if save_folder is not None:
        check_folder(save_folder)
        filepath = os.path.join(save_folder, model.name + '.csv.gz')
        print('Saving scores to', filepath, end=' ', flush=True)
        self.scores.to_csv(filepath, index=False, compression='gzip')
        print('Done!', flush=True)
    return self.scores

def save_features(dataset, count_chunk, target_session_id, target_user_id):
    # print('started onehot chunk {}'.format(count_chunk))
    if len(dataset) > 0:
        dataset = dataset.reset_index().drop(['level_2'], axis=1)
        # if the algorithm is xgboost, one-hot encode all the features; otherwise leave them categorical
        if algo == 'xgboost':
            dataset = one_hot_df_column(dataset, 'device', list(poss_devices))
            dataset = one_hot_df_column(dataset, 'kind_action_reference_appeared_last_time', list(poss_actions))
            dataset = one_hot_df_column(dataset, 'sort_order_active_when_clickout', list(poss_sort_orders))
        if 'item_id' in dataset.columns.values:
            dataset = dataset.drop(['item_id'], axis=1)
        if 'Unnamed: 0' in dataset.columns.values:
            dataset = dataset.drop(['Unnamed: 0'], axis=1)
        if algo == 'xgboost':
            dataset = dataset.sort_values(by=['user_id', 'session_id'])

        # print('started saving chunk {}'.format(count_chunk))
        test = dataset[dataset['user_id'].isin(target_user_id) & dataset['session_id'].isin(target_session_id)]
        train = dataset[(dataset['user_id'].isin(target_user_id) & dataset['session_id'].isin(target_session_id)) == False]

        # temporary fix: some sessions with the same user_id - session_id appear in both full train and full test!
        if len(test[test.label == 1]) > 0:
            err = test[test.label == 1]
            user_idss = err.user_id.values
            session_idss = err.session_id.values
            test = test[~(test.user_id.isin(user_idss)) & ~(test.session_id.isin(session_idss))]

        if count_chunk == 1:
            path = 'dataset/preprocessed/{}/{}/{}/classification_train.csv'.format(cluster, mode, algo)
            check_folder(path)
            train.to_csv(path)
            path = 'dataset/preprocessed/{}/{}/{}/classification_test.csv'.format(cluster, mode, algo)
            check_folder(path)
            test.to_csv(path)
        else:
            with open('dataset/preprocessed/{}/{}/{}/classification_train.csv'.format(cluster, mode, algo), 'a') as f:
                train.to_csv(f, header=False)
            with open('dataset/preprocessed/{}/{}/{}/classification_test.csv'.format(cluster, mode, algo), 'a') as f:
                test.to_csv(f, header=False)

    print('chunk {} over {} completed'.format(count_chunk, math.ceil(len(session_indices) / session_to_consider_in_chunk)))

def fit(self):
    check_folder('models')
    if self.ask_to_load:
        if os.path.isfile('models/{}.model'.format(self.name)):
            if yesno_choice('the exact same model was already created. want to load?') == 'y':
                self.xg.load_model('models/{}.model'.format(self.name))
                return

    if self.class_weights:
        X_train, y_train, group, _, weights, _ = data.dataset_xgboost_train(
            mode=self.mode, cluster=self.cluster, class_weights=self.class_weights, kind=self.kind)
    else:
        X_train, y_train, group, _, _ = data.dataset_xgboost_train(
            mode=self.mode, cluster=self.cluster, class_weights=self.class_weights, kind=self.kind)
    print('data for train ready')

    if self.class_weights:
        self.xg.fit(X_train, y_train, group, sample_weight=weights)
    elif self.weights_position:
        bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(self.cluster, self.mode, self.kind)
        w = np.load(os.path.join(bp, 'weights_position.npy'))
        print(w.size)
        print(group.shape)
        self.xg.fit(X_train, y_train, group, sample_weight=w)
    elif self.log_weights:
        bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(self.cluster, self.mode, self.kind)
        w = np.load(os.path.join(bp, 'log_weights.npy'))
        print(w.size)
        print(group.shape)
        self.xg.fit(X_train, y_train, group, sample_weight=w)
    else:
        self.xg.fit(X_train, y_train, group)

    print('fit done')
    self.xg.save_model('models/{}.model'.format(self.name))
    print('model saved')

def create_dataset(mode, cluster):
    features_array = [
        ImpressionPositionSession, ImpressionPriceInfoSessionOld,
        ImpressionRatingNumeric, ImpressionLabel,
        LastActionInvolvingImpression, MeanPriceClickout,
        AvgPriceInteractions, SessionDevice, NumImpressionsInClickout,
        SessionLengthOld, TimesImpressionAppearedInClickoutsSession,
        TimesUserInteractedWithImpression,
        TimingFromLastInteractionImpression, TopPopPerImpression,
        TopPopInteractionClickoutPerImpression,
        ChangeImpressionOrderPositionInSession, FrenzyFactorSession,
        DayOfWeekAndMomentInDay, LastClickoutFiltersSatisfaction,
        TimePerImpression, PersonalizedTopPop, PriceQuality,
        PlatformFeaturesSimilarity, LastActionBeforeClickout,
        ImpressionStarsNumeric, StepsBeforeLastClickout,
        LocationReferencePercentageOfClickouts,
        LocationReferencePercentageOfInteractions, NumTimesItemImpressed,
        PercClickPerImpressions, PlatformReferencePercentageOfClickouts,
        PlatformReferencePercentageOfInteractions, PlatformSession,
        User2ItemOld, LazyUser, PastFutureSessionFeatures,
        SessionSortOrderWhenClickout, SessionActionNumRefDiffFromImpressions,
        ActionsInvolvingImpressionSession, SessionNumClickouts
    ]

    curr_dir = Path(__file__).absolute().parent
    data_dir = curr_dir.joinpath('..', 'dataset/preprocessed/{}/{}/catboost/'.format(cluster, mode))
    print(data_dir)
    check_folder(str(data_dir))

    train_df, test_df, _, __ = merge_features(mode, cluster, features_array,
                                              merge_kind='left', onehot=False,
                                              create_not_existing_features=True)
    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)

    train_df.to_csv(str(data_dir) + '/train.csv', index=False)
    #to_pool_dataset(train_df, path=str(data_dir) + '/catboost_train.txt')
    print('Train saved')
    test_df.to_csv(str(data_dir) + '/test.csv', index=False)

def save_feature(self, overwrite_if_exists=None):
    """ overwrite_if_exists: if True, overwrite without asking;
    if False, do not overwrite; if None, ask before overwriting.
    """
    path = 'dataset/preprocessed/{}/{}/feature/{}/features.csv'.format(self.cluster, self.mode, self.name)
    if os.path.exists(path):
        if overwrite_if_exists is None:
            choice = yesno_choice('The feature \'{}\' already exists. Want to recreate?'.format(self.name))
            if choice == 'n':
                return
        elif not overwrite_if_exists:
            return
    df = self.extract_feature()
    check_folder(path)
    df.to_csv(path, index=self.save_index)

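# Hedged sketch of the contract save_feature relies on: a feature subclass exposes
# name, mode, cluster and save_index, and implements extract_feature() returning a
# DataFrame. The class below is purely illustrative (its name and feature logic
# are assumptions, not repo code); in the repo it would inherit from the feature
# base class so that save_feature is available.
class ExampleFeature:
    def __init__(self, mode, cluster):
        self.name = 'example_feature'  # hypothetical feature name
        self.mode = mode
        self.cluster = cluster
        self.save_index = False

    def extract_feature(self):
        # toy feature: a constant column per (user_id, session_id) pair
        df = data.train_df(self.mode, self.cluster)
        return df[['user_id', 'session_id']].drop_duplicates().assign(example_feature=1)
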
def create_dataset_cv(mode, cluster, features_array, dataset_name, k):
    # create the folders for the cv split
    _SAVE_BASE_PATH = f'dataset/preprocessed/tf_ranking/{cluster}/{mode}/{dataset_name}'
    cf.check_folder(_SAVE_BASE_PATH)

    # retrieve the dataframe
    train_df, context_features_id = merge_features_tf_cv(mode, cluster, features_array)
    train_df.rename(columns={'index': 'qid'}, inplace=True)
    columns = train_df.columns

    # save context features id
    print(f'saving context feature id to: {_SAVE_BASE_PATH}/context_features_id.npy')
    np.save(f'{_SAVE_BASE_PATH}/context_features_id', context_features_id)

    # compute the group lengths
    print('retrieving groups len')
    groups_len = train_df[['qid', 'label']].groupby('qid').count().values.flatten()

    # reshape so that each row holds a whole session
    reshaped_array = np.split(train_df.values, np.cumsum(groups_len)[:-1])
    del train_df

    # compute the folds
    kf = KFold(k)
    fold = 1
    for train_idxs, test_idxs in kf.split(reshaped_array):
        train_array = np.array(reshaped_array)[train_idxs]
        test_array = np.array(reshaped_array)[test_idxs]
        train_df = pd.DataFrame(np.concatenate(train_array), columns=columns)
        test_df = pd.DataFrame(np.concatenate(test_array), columns=columns)

        save_path = f'{_SAVE_BASE_PATH}/fold_{fold}'
        cf.check_folder(save_path)
        parse_dataset(train_df, save_path, 'train')
        parse_dataset(test_df, save_path, 'test')
        fold += 1

def _create_csvs():
    print('creating CSV...')

    # create no_cluster/full
    path = 'dataset/preprocessed/no_cluster'
    full = data.full_df()
    train_len = data.read_config()[data.TRAIN_LEN_KEY]
    train = full.iloc[0:train_len]
    test = full.iloc[train_len:len(full)]
    target_indices = get_target_indices(test)
    check_folder('dataset/preprocessed/no_cluster/full')
    train.to_csv(os.path.join(path, 'full/train.csv'))
    test.to_csv(os.path.join(path, 'full/test.csv'))
    np.save(os.path.join(path, 'full/train_indices'), train.index)
    np.save(os.path.join(path, 'full/test_indices'), test.index)
    np.save(os.path.join(path, 'full/target_indices'), target_indices)

    no_of_rows_in_small = int(input('How many rows do you want in small.csv? '))
    train_small = get_small_dataset(train, maximum_rows=no_of_rows_in_small)
    check_folder('dataset/preprocessed/no_cluster/small')
    split(train_small, os.path.join(path, 'small'))

    check_folder('dataset/preprocessed/no_cluster/local')
    split(train, os.path.join(path, 'local'))

    # create item_metadata in preprocess folder
    original_item_metadata = data.accomodations_original_df()
    original_item_metadata.to_csv(data.ITEMS_PATH)

    # append missing accomodations to item metadata
    append_missing_accomodations('full')

def recommend_batch(self):
    final_predictions = []
    scores_batch = []
    count = 0
    for index in tqdm(self.target_indices):
        impr = list(map(int, data.full_df().loc[index]['impressions'].split('|')))
        pred = self.predictions[count][0:len(impr)]
        couples = list(zip(pred, impr))
        #print(couples)
        couples.sort(key=lambda x: x[0], reverse=True)
        scores, sorted_impr = zip(*couples)
        final_predictions.append((index, list(sorted_impr)))
        scores_batch.append((index, list(sorted_impr), list(scores)))
        count += 1
    if self.mode != 'small':
        cf.check_folder('scores')
        np.save(f'scores/{self.name}', np.array(scores_batch))
    return final_predictions

def save(self, mode='full', add_unused_clickouts_to_test=True):
    """ Makes use of fit to create the dataset for a specific cluster.
    In particular, it takes care of creating a folder at the same level of
    base_split with the specified name and the folder structure inside.
    """
    print('Creating {} cluster...'.format(mode), end=' ', flush=True)
    self._fit(mode)

    # create cluster root folder
    path = f'dataset/preprocessed/{self.name}'
    check_folder(path)

    # create full and local folders
    full_path = os.path.join(path, mode)
    check_folder(full_path)
    train = data.train_df(mode).loc[self.train_indices]
    train.to_csv(os.path.join(full_path, 'train.csv'))
    del train

    # in case some target_indices are specified, do not leave out the clickouts not to predict
    if add_unused_clickouts_to_test and len(self.target_indices) > 0:
        indices_from_full = list(set(self.test_indices) - set(self.target_indices))
        indices_from_test = self.target_indices
        test = pd.concat([data.test_df(mode).loc[indices_from_test],
                          data.full_df().loc[indices_from_full]])
    else:
        test = data.test_df(mode).loc[self.test_indices]

    test.to_csv(os.path.join(full_path, 'test.csv'))
    if len(self.target_indices) > 0:
        np.save(os.path.join(full_path, 'target_indices'), self.target_indices)
    else:
        trgt_indices = preprocess.get_target_indices(test)
        np.save(os.path.join(full_path, 'target_indices'), trgt_indices)
    del test
    print('Done!')

def create_features_dataframe(mode, cluster):
    SAVE_PATH = f'dataset/preprocessed/FFNN_dataset/dataframes/{cluster}/{mode}'
    check_folder.check_folder(SAVE_PATH)

    # load TRAIN and TEST df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)

    print('extracting features from TRAIN...')
    train_features_dataframe = train_df.groupby(['user_id', 'session_id']).progress_apply(_extract_features)
    train_features_dataframe.to_csv(path_or_buf=f'{SAVE_PATH}/train_df.csv', index=False)
    del train_features_dataframe

    print('extracting features from TEST...')
    test_features_dataframe = test_df.groupby(['user_id', 'session_id']).progress_apply(_extract_features, submission_mode=True)
    test_features_dataframe.to_csv(path_or_buf=f'{SAVE_PATH}/test_df.csv', index=False)

def create_ICM(name='icm.npz', save_path='dataset/matrices/full/'):
    """ Create the ICM matrix taking as input 'item_metadata.csv'.
    The matrix is saved in COO format to allow easy conversion to csr and csc.
    A dictionary is also saved, with key = item_id and value = the row of the ICM
    containing that item.
    :param name: name of the icm matrix
    :param save_path: saving path
    :return:
    """
    print('creating ICM...\n')
    tqdm.pandas()
    attributes_df = data.accomodations_df()

    attributes_df['properties'] = attributes_df['properties'].progress_apply(
        lambda x: x.split('|') if isinstance(x, str) else x)
    attributes_df.fillna(value='', inplace=True)
    mlb = MultiLabelBinarizer()
    one_hot_attribute = mlb.fit_transform(attributes_df['properties'].values)
    one_hot_dataframe = pd.DataFrame(one_hot_attribute, columns=mlb.classes_)
    print('ICM created successfully!\n')

    print('creating dictionary...\n')
    icm_dict = {}
    item_ids = attributes_df['item_id'].values
    for i in tqdm(range(len(item_ids))):
        icm_dict[item_ids[i]] = i

    print('saving ICM...\n')
    cf.check_folder(save_path)
    sps.save_npz(save_path + name, sps.coo_matrix(one_hot_dataframe.values))

    print('saving dictionary')
    np.save(save_path + 'icm_dict.npy', icm_dict)
    print('Procedure ended successfully!')

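# Reading the artifacts back (a sketch; the file names match the defaults above,
# and allow_pickle=True is required by recent numpy to load a saved dict):
def load_ICM(name='icm.npz', save_path='dataset/matrices/full/'):
    icm = sps.load_npz(save_path + name).tocsr()
    icm_dict = np.load(save_path + 'icm_dict.npy', allow_pickle=True).item()
    return icm, icm_dict
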
def scores():
    mode = 'full'
    model = interactive_model(mode)
    checkpoint_path = menu.checkpoint_selection(checkpoints_dir='saved_models')

    print('Loading {}...'.format(checkpoint_path), end='\r', flush=True)
    model.load(checkpoint_path)
    print('Done!', flush=True)

    # get scores for train and test and save them
    scores_folder = 'scores'
    check_folder(scores_folder)
    for st in ['train', 'test']:
        print('Building scores for {} {}...'.format(st, mode))
        scores = model.get_scores_batch(scores_type=st)
        print('Saving scores for {} {}...'.format(st, mode))
        scores_filename = '{}_scores_{}'.format(model.name, st)
        np.save(os.path.join(scores_folder, scores_filename), np.array(scores))

def create_dataset(mode, cluster):
    # training
    features_array = [
        SessionDevice, SessionSortOrderWhenClickout,
        PricePositionInfoInteractedReferences, SessionLength,
        TimeFromLastActionBeforeClk, LastActionBeforeClickout,
        NumInteractionsWithFirstImpression, FirstImpressionPrice,
        LastActionInvolvingFirstImpressions, NumImpressionsInClickout,
        #Platform,
        RNNOutput, PriceStats, PopularityFirstImpression,
        AvgInteractedPrice, PopularityClickoutFirstImpression,
        LocationReferenceFirstImpression, PlatformReferenceFirstImpression,
        FrenzyFactorSession, StarsRatingsFirstImpression,
        ActionsCountClassifier, FirstImpressionPriceInfo,
        SessionActionNumRefDiffFromImpressions,
        TimingFromLastInteractionFirstImpression, DayOfWeekAndMomentInDay,
        UserFeatureFirstImpression
    ]

    train_df, test_df = merge_features_classifier(mode, cluster, features_array, LabelClassification)

    check_folder('dataset/preprocessed/{}/{}/xgboost_classifier/'.format(cluster, mode))
    train_df.to_csv('dataset/preprocessed/{}/{}/xgboost_classifier/train.csv'.format(cluster, mode), index=False)
    test_df.to_csv('dataset/preprocessed/{}/{}/xgboost_classifier/test.csv'.format(cluster, mode), index=False)
    print('Dataset created!')

def download_scores_and_sub(pattern, user_name='ubuntu'):
    path_to_pem = easygui.fileopenbox(msg='pick the pem', default='~/Downloads')
    ip = easygui.enterbox("What's the ip?")
    downloads_folder = join(expanduser('~'), 'Downloads')

    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh.connect(ip, username=user_name, key_filename=path_to_pem)

    print('transferring scores')
    bp = '~/recsys2019/scores/'
    command = 'find {} -name "*{}*.npy"'.format(bp, pattern)
    stdin, stdout, stderr = ssh.exec_command(command)
    filelist = stdout.read().splitlines()
    sftp = ssh.open_sftp()
    for afile in filelist:
        filename = afile.decode('utf-8')
        print('transferring {}'.format(filename))
        check_folder(join(downloads_folder, pattern) + '/')
        sftp.get(filename, join(downloads_folder, pattern, filename.split('/')[-1]), progress)

    print('transferring subs')
    bp = '~/recsys2019/submissions/'
    command = 'find {} -name "*{}*.csv"'.format(bp, pattern)
    stdin, stdout, stderr = ssh.exec_command(command)
    filelist = stdout.read().splitlines()
    sftp = ssh.open_sftp()
    for afile in filelist:
        filename = afile.decode('utf-8')
        print('transferring {}'.format(filename))
        check_folder(join(downloads_folder, pattern) + '/')
        sftp.get(filename, join(downloads_folder, pattern, filename.split('/')[-1]), progress)

    print('transferring models')
    bp = '~/recsys2019/models/'
    command = 'find {} -name "*{}*.model"'.format(bp, pattern)
    stdin, stdout, stderr = ssh.exec_command(command)
    filelist = stdout.read().splitlines()
    sftp = ssh.open_sftp()
    for afile in filelist:
        filename = afile.decode('utf-8')
        print('transferring {}'.format(filename))
        check_folder(join(downloads_folder, pattern) + '/')
        sftp.get(filename, join(downloads_folder, pattern, filename.split('/')[-1]), progress)

    sftp.close()