    def save_r_hat(self):
        base_save_path = f'dataset/preprocessed/{self.cluster}/{self.mode}/matrices/{self.session}/r_hat_matrices'
        cf.check_folder(base_save_path)
        print('saving r_hat...')
        sps.save_npz(f'{base_save_path}/{self.urm_name}_{self.name}',
                     self.get_r_hat())
        print('r_hat saved successfully!')

    def save_similarity_matrix(self):
        base_save_path = f'dataset/preprocessed/{self.cluster}/{self.mode}/matrices/{self.session}/similarities_matrices'
        cf.check_folder(base_save_path)
        print('saving sim_matrix...')
        sps.save_npz(f'{base_save_path}/{self.urm_name}_{self.name}',
                     self.get_sim_matrix())
        print('sim_matrix saved successfully!')
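A hedged read-back sketch for the matrices saved above (scipy only; base_save_path, urm_name and name are placeholders standing in for the attributes used in the methods):

import scipy.sparse as sps

# sps.save_npz appends '.npz', so the load path mirrors the save path above
r_hat = sps.load_npz(f'{base_save_path}/{urm_name}_{name}.npz')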
Example #3
    def create_feature(self):
        # load dataset and indices
        train, train_indices = self.dataset.load_Xtrain(return_indices=True)
        test, test_indices = self.dataset.load_Xtest()
        # make predictions
        train_test = np.concatenate([train, test])
        del train
        del test
        predictions = self.model.predict(train_test).flatten()
        # build feature df
        concat_indices = np.concatenate([train_indices, test_indices])
        del train_indices
        del test_indices
        users_sessions = data.full_df().loc[concat_indices]
        feature_df = pd.DataFrame(
            {
                'user_id': users_sessions['user_id'],
                'session_id': users_sessions['session_id'],
                'rnn_binary_preds': predictions
            },
            index=concat_indices)

        path = 'dataset/preprocessed/no_cluster/{}/feature/rnn_binary_preds/features.csv'.format(
            self.mode)
        check_folder(path)
        feature_df.to_csv(path)

        return feature_df
    def _save_dataset(base_path, mode, df):
        assert mode in ['train', 'vali'], 'the mode has to be train or vali'
        print('reducing memory usage...')
        df = reduce_mem_usage(df)

        check_folder(base_path)

        x = df.drop(['index', 'user_id', 'session_id', 'item_id', 'label'],
                    axis=1)
        x.to_hdf(f'{_BASE_PATH}/x_{mode}.hdf',
                 key='df',
                 index=False,
                 format='table')
        print(f'x_{mode} saved at: {_BASE_PATH}/x_{mode}.hdf')

        y = df['label'].values
        np.save(f'{_BASE_PATH}/y_{mode}', y)
        print(f'y_{mode} saved at: {_BASE_PATH}/y_{mode}.npy')

        groups = _create_groups(df)
        np.save(f'{_BASE_PATH}/groups_{mode}', groups)
        print(f'groups_{mode} saved at: {_BASE_PATH}/groups_{mode}.npy')

        user_session_item = df[['user_id', 'session_id', 'item_id']]
        user_session_item.to_csv(f'{_BASE_PATH}/user_session_item_{mode}.csv',
                                 index=False)
        print(
            f'user_session_item_{mode} saved at: {_BASE_PATH}/user_session_item_{mode}.csv'
        )
Example #5
    def __init__(self, type, mode, cluster, name):

        assert type in ['user', 'session']
        assert mode in ['small', 'local', 'full']

        self.save_path = f'dataset/preprocessed/{cluster}/{mode}/matrices/{type}'
        cf.check_folder(self.save_path)

        self.score_dict = {
            'clickout item': 3,
            'interaction item rating': 3,
            'interaction item info': 1,
            'interaction item image': 3,
            'interaction item deals': 1,
            'search for item': 5,
            'search for destination': 'reset',
            'change of sort order': None,
            'filter selection': None,
            'search for poi': None,
            'tw': 'lin',
            'score_update_rule': 'substitute'
        }

        self.name = name
        self.type = type
        self.mode = mode
        self.cluster = cluster

        self.accomodations_id = data.accomodations_ids()
        self.train_df = None
        self.test_df = None
    def save_folds(df, user_session_df, train_index, test_index, count, mode):
        u_s_train = list(
            user_session_df.loc[train_index]['user_session'].values)
        u_s_test = list(user_session_df.loc[test_index]['user_session'].values)

        path = 'dataset/preprocessed/{}/{}'.format('fold_' + str(count), mode)
        check_folder(path)

        train = df[df['user_session'].isin(u_s_train)]
        train = train.drop(['user_session'], axis=1)
        train.to_csv(os.path.join(path, 'train.csv'))
        train_indices = train.index.values
        np.save(os.path.join(path, 'train_indices'), train_indices)

        test = df[df['user_session'].isin(u_s_test)]
        target_indices = sorted(find(test))
        test.loc[target_indices, 'reference'] = np.nan
        test = test.drop(['user_session'], axis=1)
        test.to_csv(os.path.join(path, 'test.csv'))
        test_indices = test.index.values
        np.save(os.path.join(path, 'test_indices'), test_indices)
        np.save(os.path.join(path, 'target_indices'), target_indices)

        print(f'Train shape : {train.shape} , Test shape : {test.shape}')
        print(f'Number of last clickout indices: {len(target_indices)}')
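A hypothetical driver for save_folds (not part of the original code), assuming scikit-learn and a user_session_df with a default integer index, so that the positional indices returned by KFold.split match the labels used by .loc inside save_folds:

from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for count, (train_index, test_index) in enumerate(kf.split(user_session_df)):
    save_folds(df, user_session_df, train_index, test_index, count, mode='local')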
Example #7
def create_dataset(mode, cluster):
    # training
    features_array = [
        ActionsInvolvingImpressionSession, ImpressionLabel,
        ImpressionPriceInfoSession, TimingFromLastInteractionImpression,
        TimesUserInteractedWithImpression, ImpressionPositionSession,
        LastInteractionInvolvingImpression,
        TimesImpressionAppearedInClickoutsSession, MeanPriceClickout,
        SessionLength, TimeFromLastActionBeforeClk, FrenzyFactorSession,
        PricePositionInfoInteractedReferences, SessionDevice,
        SessionFilterActiveWhenClickout, SessionSortOrderWhenClickout,
        ImpressionFeature
    ]

    curr_dir = Path(__file__).absolute().parent
    data_dir = curr_dir.joinpath(
        '..', 'dataset/preprocessed/{}/{}/lightGBM/'.format(cluster, mode))
    print(data_dir)
    check_folder(str(data_dir))

    train_df, test_df = merge_features(mode, cluster, features_array)

    if os.path.isfile(str(data_dir) + '/svmlight_train.txt'):
        print('Train file already present')
    else:
        to_queries_dataset(train_df,
                           path=str(data_dir) + '/svmlight_train.txt')

    if os.path.isfile(str(data_dir) + '/test.csv'):
        print('Test file already present')
        #test_df.sort_values()
        to_queries_dataset(test_df, path=str(data_dir) + '/svmlight_test.txt')
    else:
        test_df.to_csv(str(data_dir) + '/test.csv', index=False)
        to_queries_dataset(test_df, path=str(data_dir) + '/svmlight_test.txt')
Example #8
def create_dataset(mode, cluster):

    features_array = [
        ImpressionLabel, ImpressionPositionSession, ScoresRNN, ScoresXGB
    ]

    train_df, test_df, train_idxs, _ = merge_features(mode,
                                                      cluster,
                                                      features_array,
                                                      merge_kind='left')
    train_df = train_df.replace(-1, np.nan)
    test_df = test_df.replace(-1, np.nan)

    bp = 'dataset/preprocessed/{}/{}/stacking/'.format(cluster, mode)
    check_folder(bp)

    X_train = train_df.drop(
        ['index', 'user_id', 'session_id', 'item_id', 'label'], axis=1)
    X_train = X_train.to_sparse(fill_value=0)
    X_train = X_train.astype(np.float64)
    X_train = X_train.to_coo().tocsr()
    save_npz(join(bp, 'X_train'), X_train)
    print('X_train saved')

    y_train = train_df[['label']]
    y_train.to_csv(join(bp, 'y_train.csv'))
    print('y_train saved')

    group = create_groups(train_df)
    print(len(group))
    np.save(join(bp, 'group_train'), group)
    print('train groups saved')

    np.save(join(bp, 'train_indices'), train_idxs)

    print('train data completed')

    X_test = test_df.drop(
        ['index', 'user_id', 'session_id', 'item_id', 'label'], axis=1)

    # if mode == 'full':
    X_test = X_test.to_sparse(fill_value=0)
    X_test = X_test.astype(np.float64)
    X_test = X_test.to_coo().tocsr()
    save_npz(join(bp, 'X_test'), X_test)
    # else:
    #    X_test.to_csv(join(bp, 'X_test.csv'))
    print('X_test saved')

    y_test = test_df[['label']]
    y_test.to_csv(join(bp, 'y_test.csv'))
    print('y_test saved')

    group = create_groups(test_df)
    print(len(group))
    np.save(join(bp, 'group_test'), group)

    print('test groups saved')

    print('test data completed')
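Note: DataFrame.to_sparse was removed in pandas 1.0. A rough, hedged equivalent of the sparse conversion above (assuming pandas >= 1.0 imported as pd, numpy as np, and scipy) would be:

X_train = train_df.drop(
    ['index', 'user_id', 'session_id', 'item_id', 'label'], axis=1)
# treat 0 as the implicit fill value, then export to a scipy CSR matrix
X_train = X_train.astype(pd.SparseDtype(np.float64, fill_value=0))
X_train = X_train.sparse.to_coo().tocsr()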
    def save_r_hat(self):
        base_save_path = 'dataset/matrices/{}/r_hat_matrices'.format(self.mode)
        cf.check_folder(base_save_path)
        print('saving r_hat...')
        sps.save_npz('{}/{}'.format(base_save_path, self.name),
                     self.get_r_hat())
        print('r_hat saved successfully!')
Example #10
def merge_consecutive_equal_actions():
    tqdm.pandas()
    test = data.test_df('full')
    test_grouped_by_session_id = test.groupby('session_id')
    merged = test_grouped_by_session_id.progress_apply(
        _merge_consecutive_equal_actions)
    cf.check_folder('dataset/cleaned_csv')
    merged.to_csv('dataset/cleaned_csv/test.csv')
def create_lightGBM_dataset(mode, cluster, features_array, dataset_name):
    def _create_groups(df):
        """
        function used to retrieve the len of the groups
        :param df:
        :return:
        """
        df = df[['user_id', 'session_id']]
        group = df.groupby(['user_id', 'session_id'],
                           sort=False).apply(lambda x: len(x)).values
        return group

    def _save_dataset(base_path, mode, df):
        assert mode in ['train', 'vali'], 'the mode has to be train or vali'
        print('reducing memory usage...')
        df = reduce_mem_usage(df)

        check_folder(base_path)

        x = df.drop(['index', 'user_id', 'session_id', 'item_id', 'label'],
                    axis=1)
        x.to_hdf(f'{_BASE_PATH}/x_{mode}.hdf',
                 key='df',
                 index=False,
                 format='table')
        print(f'x_{mode} saved at: {_BASE_PATH}/x_{mode}.hdf')

        y = df['label'].values
        np.save(f'{_BASE_PATH}/y_{mode}', y)
        print(f'y_{mode} saved at: {_BASE_PATH}/y_{mode}.npy')

        groups = _create_groups(df)
        np.save(f'{_BASE_PATH}/groups_{mode}', groups)
        print(f'groups_{mode} saved at: {_BASE_PATH}/groups_{mode}.npy')

        user_session_item = df[['user_id', 'session_id', 'item_id']]
        user_session_item.to_csv(f'{_BASE_PATH}/user_session_item_{mode}.csv',
                                 index=False)
        print(
            f'user_session_item_{mode} saved at: {_BASE_PATH}/user_session_item_{mode}.csv'
        )

    # base save path
    _BASE_PATH = f'dataset/preprocessed/lightGBM/{cluster}/{mode}/{dataset_name}'

    # retrieve the TRAIN and VALIDATION/TEST data
    train_df, validation_df = merge_features_lgb(mode, cluster, features_array)

    print('saving features names...')
    check_folder(f"{_BASE_PATH}")
    with open(f"{_BASE_PATH}/Features.txt", "w+") as text_file:
        text_file.write(str([str(fn) for fn in features_array]))

    Hera.send_message('SAVING TRAIN LIGHTGBM...')
    _save_dataset(_BASE_PATH, 'train', train_df)
    Hera.send_message('SAVING VALI LIGHTGBM...')
    _save_dataset(_BASE_PATH, 'vali', validation_df)
    Hera.send_message('PROCEDURE ENDED CORRECTLY')
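A hedged usage sketch for the files written by _save_dataset (assuming lightgbm, numpy and pandas are available; _BASE_PATH is the same base path used above):

import lightgbm as lgb
import numpy as np
import pandas as pd

x_train = pd.read_hdf(f'{_BASE_PATH}/x_train.hdf', key='df')
y_train = np.load(f'{_BASE_PATH}/y_train.npy')
groups = np.load(f'{_BASE_PATH}/groups_train.npy')
# the group sizes tell LightGBM where each (user_id, session_id) query starts and ends
lgb_train = lgb.Dataset(x_train, label=y_train, group=groups)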
    def fit_predict(self, multithreading=True, save_folder='scores/'):

        if multithreading:
            self.scores = Parallel(backend='multiprocessing',
                                   n_jobs=-1,
                                   max_nbytes=None)(delayed(self._fit_model)(i)
                                                    for i in range(5))

            print(len(self.scores))
        else:
            self.scores = [self._fit_model(i) for i in range(5)]
            print(len(self.scores))

        model = self.model_class(mode=self.mode,
                                 cluster='no_cluster',
                                 **self.init_params)
        model.fit()
        scores_test = model.get_scores_batch()
        self.scores.append(scores_test)

        self.scores = [item for sublist in self.scores for item in sublist]
        scores = pd.DataFrame(
            self.scores, columns=['index', 'item_recommendations', 'scores'])
        scores = scores.sort_values(by='index')
        print(scores)
        idx_scores = set(scores['index'].values)

        train_full = data.train_df(mode='full', cluster='no_cluster')
        test_full = data.test_df(mode='full', cluster='no_cluster')
        full = pd.concat([train_full, test_full])
        full = full[['user_id', 'session_id', 'action_type']]

        last_clk_full = full.loc[idx_scores]

        # checking that all rows are clickouts
        num_not_clk_row = last_clk_full[
            last_clk_full['action_type'] != 'clickout item'].shape[0]
        print(f'Number of not clickout rows is : {num_not_clk_row}')
        if num_not_clk_row != 0:
            print("Error, some indices are not clickouts")

        last_clk_full = last_clk_full.drop(['action_type'], axis=1)

        last_clk_full['index'] = last_clk_full.index
        merged = last_clk_full.merge(scores, on=['index'])
        model_name = model.name
        df = assign_score(merged, self.model_name)
        df = df.drop(['index'], axis=1)

        if save_folder is not None:
            check_folder(save_folder)
            filepath = os.path.join(save_folder, model_name + '.csv.gz')
            print('Saving scores to', filepath, end=' ', flush=True)
            df.to_csv(filepath, index=False, compression='gzip')
            print('Done!', flush=True)

        return df
    def get_scores_batch(self, save=False):
        _ = self.recommend_batch()
        base_path = f'dataset/preprocessed/neural_network_dataset/{self.cluster}/{self.mode}/predictions/{self.dataset_name}.pickle'
        check_folder.check_folder(base_path)
        if save:
            with open(base_path, 'wb') as f:
                pickle.dump(self.scores_batch, f)
            print(f'saved at: {base_path}')
        else:
            return self.scores_batch
Example #14
    def fit(self,
            epochs,
            early_stopping_patience=10,
            early_stopping_on='val_loss',
            mode='min'):
        #weights = self.class_weights if self.weight_samples else []

        callbacks = [
            TelegramBotKerasCallback(log_every_epochs=1, account='parro')
        ]
        # early stopping callback
        if isinstance(early_stopping_patience, int):
            assert early_stopping_patience > 0
            callbacks.append(
                EarlyStopping(monitor=early_stopping_on,
                              patience=early_stopping_patience,
                              mode=mode,
                              verbose=1,
                              restore_best_weights=True))

        # tensorboard callback
        if isinstance(self.tensorboard_path, str):
            check_folder(self.tensorboard_path)
            callbacks.append(
                TensorBoard(log_dir=self.tensorboard_path,
                            histogram_freq=0,
                            write_graph=False))

        if self.use_generator:
            self.train_gen, self.val_gen = self.dataset.get_train_validation_generator(
                self.validation_split)  #, weights)
            assert self.train_gen.__getitem__(
                0)[0].shape[1:] == self.input_shape

            self.history = self.model.fit_generator(
                self.train_gen,
                epochs=epochs,
                validation_data=self.val_gen,
                callbacks=callbacks,
                max_queue_size=3,
                class_weight=self.class_weights)
        else:
            self.X, self.Y = self.dataset.load_Xtrain(
            ), self.dataset.load_Ytrain()
            self.X, self.Y = shuffle(self.X, self.Y)

            self.history = self.model.fit(
                self.X,
                self.Y,
                epochs=epochs,
                batch_size=self.batch_size,
                validation_split=self.validation_split,
                callbacks=callbacks,
                class_weight=self.class_weights,
                sample_weight=self.sample_weights)
Example #15
    def save(self, folderpath, suffix=''):
        """ Save the full state of the model, including:
        - the architecture
        - the weights
        - the training configuration (loss, optimizer)
        - the state of the optimizer (allowing the training to be resumed)
        See: https://keras.io/getting-started/faq/#savingloading-whole-models-architecture-weights-optimizer-state
        """
        check_folder(folderpath)
        path = os.path.join(folderpath, '{}{}.h5'.format(self.name, suffix))
        self.model.save(path)
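A hedged counterpart for loading the model saved above (assuming Keras is available; name and suffix are placeholders for the values used in the save call):

import os
from keras.models import load_model

# restores architecture, weights, training configuration and optimizer state
model = load_model(os.path.join(folderpath, '{}{}.h5'.format(name, suffix)))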
Example #16
def create_dataset(mode, cluster, features_array, dataset_name, stacking_scores_path):
    _SAVE_BASE_PATH = f'dataset/preprocessed/tf_ranking/{cluster}/{mode}/{dataset_name}'
    cf.check_folder(_SAVE_BASE_PATH)
    train_df, vali_test_df, context_features_id = merge_features_tf(mode, cluster, features_array, stacking_scores_path)

    # save context features id
    print(f'saving context feature id to: {_SAVE_BASE_PATH}/context_features_id.npy')
    np.save(f'{_SAVE_BASE_PATH}/context_features_id', context_features_id)

    parse_dataset(train_df, _SAVE_BASE_PATH, 'train')
    parse_dataset(vali_test_df, _SAVE_BASE_PATH, 'test')
    Hera.send_message('tf ranking dataset saved !')
    print('PROCEDURE ENDED CORRECTLY')
Example #17
    def fit_predict(self, dataset, fit_params={}, multithreading=True, n_jobs=-1, save_folder='scores/') -> pd.DataFrame:
        """ Fit and compute the scores for each fold.
        dataset (object):   inheriting from utils.dataset.DatasetBase
        fit_params (dict):  params to fit the model
        n_jobs (int):       maximum number of concurrently running jobs, -1 to use all CPUs
        save_folder (str):  folder where to save the scores
        """
        assert hasattr(dataset, 'load_Xtrain') and hasattr(dataset, 'load_Ytrain') and hasattr(dataset, 'load_Xtest'), \
                    'Dataset object must implement methods: load_Xtrain, load_Ytrain, load_Xtest'
        
        X_train, Y_train, X_test, group_train = dataset.load_Xtrain(), dataset.load_Ytrain(), \
                                                dataset.load_Xtest(), dataset.load_group_train()
        
        # kfold
        kf = KFold(n_splits=self.k)
        
        # fit in each fold
        if multithreading:
            self.scores = Parallel(backend='multiprocessing', n_jobs=n_jobs)(delayed(self._fit_model)
                                (
                                    X_train, Y_train, group_train,
                                    train_indices, test_indices,
                                    fit_params, idx
                                ) for idx,(train_indices,test_indices) in enumerate(kf.split(X_train, group_train)) )
        else:
            self.scores = [self._fit_model
                                (
                                    X_train, Y_train, group_train,
                                    train_indices, test_indices,
                                    fit_params, idx
                                ) for idx,(train_indices,test_indices) in enumerate(kf.split(X_train, group_train)) ]
        
        # fit in all the train and get scores for test
        print('fit whole train')
        model = self.model_class(**self.init_params)
        model.fit_cv(X_train, Y_train, group_train, list(range(X_train.shape[0])), [], **fit_params)
        print('end fit whole train')
        scores_test = model.get_scores_cv(X_test, None, list(range(X_test.shape[0])))
        self.scores.append( scores_test )
        
        self.scores = pd.concat(self.scores)

        # save scores
        if save_folder is not None:
            check_folder(save_folder)
            filepath = os.path.join(save_folder, model.name + '.csv.gz')
            print('Saving scores to', filepath, end=' ', flush=True)
            self.scores.to_csv(filepath, index=False, compression='gzip')
            print('Done!', flush=True)
        
        return self.scores
Example #18
    def save_features(dataset, count_chunk, target_session_id, target_user_id):
        # print('started onehot chunk {}'.format(count_chunk))
        if len(dataset) > 0:
            dataset = dataset.reset_index().drop(['level_2'], axis=1)


            # if the algorithm is xgboost, get the onehot of all the features. otherwise leave it categorical
            if algo == 'xgboost':
                dataset = one_hot_df_column(dataset, 'device', list(poss_devices))
                dataset = one_hot_df_column(dataset, 'kind_action_reference_appeared_last_time', list(poss_actions))
                dataset = one_hot_df_column(dataset, 'sort_order_active_when_clickout', list(poss_sort_orders))

            if 'item_id' in dataset.columns.values:
                dataset = dataset.drop(['item_id'], axis=1)
            if 'Unnamed: 0' in dataset.columns.values:
                dataset = dataset.drop(['Unnamed: 0'], axis=1)

            if algo == 'xgboost':
                dataset = dataset.sort_values(by=['user_id', 'session_id'])

            # print('started saving chunk {}'.format(count_chunk))
            test = dataset[dataset['user_id'].isin(
                target_user_id) & dataset['session_id'].isin(target_session_id)]
            train = dataset[(dataset['user_id'].isin(
                target_user_id) & dataset['session_id'].isin(target_session_id)) == False]

            # temporary fix: some sessions appear with the same user_id - session_id in both full train and full test!
            if len(test[test.label == 1]) > 0:
                err = test[test.label == 1]
                user_idss = err.user_id.values
                session_idss = err.session_id.values
                test = test[~(test.user_id.isin(user_idss)) & ~(test.session_id.isin(session_idss))]

            if count_chunk == 1:
                path = 'dataset/preprocessed/{}/{}/{}/classification_train.csv'.format(
                    cluster, mode, algo)
                check_folder(path)
                train.to_csv(path)

                path = 'dataset/preprocessed/{}/{}/{}/classification_test.csv'.format(
                    cluster, mode, algo)
                check_folder(path)
                test.to_csv(path)
            else:
                with open('dataset/preprocessed/{}/{}/{}/classification_train.csv'.format(cluster, mode, algo), 'a') as f:
                    train.to_csv(f, header=False)
                with open('dataset/preprocessed/{}/{}/{}/classification_test.csv'.format(cluster, mode, algo), 'a') as f:
                    test.to_csv(f, header=False)

        print('chunk {} over {} completed'.format(count_chunk, math.ceil(
            len(session_indices)/session_to_consider_in_chunk)))
    def fit(self):
        check_folder('models')
        if self.ask_to_load:
            if os.path.isfile('models/{}.model'.format(self.name)):
                if yesno_choice(
                        'The exact same model has already been created. Do you want to load it?'
                ) == 'y':
                    self.xg.load_model('models/{}.model'.format(self.name))
                    return

        if self.class_weights:
            X_train, y_train, group, _, weights, _ = data.dataset_xgboost_train(
                mode=self.mode,
                cluster=self.cluster,
                class_weights=self.class_weights,
                kind=self.kind)
        else:
            X_train, y_train, group, _, _ = data.dataset_xgboost_train(
                mode=self.mode,
                cluster=self.cluster,
                class_weights=self.class_weights,
                kind=self.kind)
        print('data for train ready')

        if self.class_weights:
            self.xg.fit(X_train, y_train, group, sample_weight=weights)
        elif self.weights_position:
            bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(
                self.cluster, self.mode, self.kind)
            w = np.load(os.path.join(bp, 'weights_position.npy'))
            print(w.size)
            print(group.shape)
            self.xg.fit(X_train, y_train, group, sample_weight=w)
        elif self.log_weights:
            bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(
                self.cluster, self.mode, self.kind)
            w = np.load(os.path.join(bp, 'log_weights.npy'))
            print(w.size)
            print(group.shape)
            self.xg.fit(X_train, y_train, group, sample_weight=w)
        else:
            self.xg.fit(X_train, y_train, group)

        print('fit done')
        self.xg.save_model('models/{}.model'.format(self.name))
        print('model saved')
Example #20
def create_dataset(mode, cluster):
    features_array = [
        ImpressionPositionSession, ImpressionPriceInfoSessionOld,
        ImpressionRatingNumeric, ImpressionLabel,
        LastActionInvolvingImpression, MeanPriceClickout, AvgPriceInteractions,
        SessionDevice, NumImpressionsInClickout, SessionLengthOld,
        TimesImpressionAppearedInClickoutsSession,
        TimesUserInteractedWithImpression, TimingFromLastInteractionImpression,
        TopPopPerImpression, TopPopInteractionClickoutPerImpression,
        ChangeImpressionOrderPositionInSession, FrenzyFactorSession,
        DayOfWeekAndMomentInDay, LastClickoutFiltersSatisfaction,
        TimePerImpression, PersonalizedTopPop, PriceQuality,
        PlatformFeaturesSimilarity, LastActionBeforeClickout,
        ImpressionStarsNumeric, StepsBeforeLastClickout,
        LocationReferencePercentageOfClickouts,
        LocationReferencePercentageOfInteractions, NumTimesItemImpressed,
        PercClickPerImpressions, PlatformReferencePercentageOfClickouts,
        PlatformReferencePercentageOfInteractions, PlatformSession,
        User2ItemOld, LazyUser, PastFutureSessionFeatures,
        SessionSortOrderWhenClickout, SessionActionNumRefDiffFromImpressions,
        ActionsInvolvingImpressionSession, SessionNumClickouts
    ]

    curr_dir = Path(__file__).absolute().parent
    data_dir = curr_dir.joinpath(
        '..', 'dataset/preprocessed/{}/{}/catboost/'.format(cluster, mode))
    print(data_dir)
    check_folder(str(data_dir))

    train_df, test_df, _, __ = merge_features(
        mode,
        cluster,
        features_array,
        merge_kind='left',
        onehot=False,
        create_not_existing_features=True)

    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)

    train_df.to_csv(str(data_dir) + '/train.csv', index=False)
    #to_pool_dataset(train_df, path=str(data_dir) + '/catboost_train.txt')

    print('Train saved')
    test_df.to_csv(str(data_dir) + '/test.csv', index=False)
    def save_feature(self, overwrite_if_exists=None):
        """
        overwrite_if_exists: if True, overwrite without asking; if False, do not overwrite; if None, ask before overwriting
        """
        path = 'dataset/preprocessed/{}/{}/feature/{}/features.csv'.format(
            self.cluster, self.mode, self.name)
        if os.path.exists(path):
            if overwrite_if_exists is None:
                choice = yesno_choice(
                    'The feature \'{}\' already exists. Want to recreate?'.
                    format(self.name))
                if choice == 'n':
                    return
            elif not overwrite_if_exists:
                return
        df = self.extract_feature()
        check_folder(path)
        df.to_csv(path, index=self.save_index)
Example #22
def create_dataset_cv(mode, cluster, features_array, dataset_name, k):

    # create the folders for the cv split
    _SAVE_BASE_PATH = f'dataset/preprocessed/tf_ranking/{cluster}/{mode}/{dataset_name}'
    cf.check_folder(_SAVE_BASE_PATH)

    # retrieve the dataframe
    train_df, context_features_id = merge_features_tf_cv(mode, cluster, features_array)
    train_df.rename(columns={'index': 'qid'}, inplace=True)
    columns = train_df.columns

    # save context features id
    print(f'saving context feature id to: {_SAVE_BASE_PATH}/context_features_id.npy')
    np.save(f'{_SAVE_BASE_PATH}/context_features_id', context_features_id)

    # computing groups len
    print('retrieving groups len')
    groups_len = train_df[['qid', 'label']].groupby('qid').count().values.flatten()

    # reshape in a way that on a row we have a session
    reshaped_array = np.split(train_df.values, np.cumsum(groups_len)[:-1])
    del train_df

    # compute the folds
    kf = KFold(k)
    fold = 1
    for train_idxs, test_idxs in (kf.split(reshaped_array)):

        train_array = np.array(reshaped_array)[train_idxs]
        test_array = np.array(reshaped_array)[test_idxs]

        train_df = pd.DataFrame(np.concatenate(train_array), columns=columns)
        test_df = pd.DataFrame(np.concatenate(test_array), columns=columns)

        save_path = f'{_SAVE_BASE_PATH}/fold_{fold}'
        cf.check_folder(save_path)

        parse_dataset(train_df, save_path, 'train')
        parse_dataset(test_df, save_path, 'test')

        fold += 1
Example #23
    def _create_csvs():
        print('creating CSV...')

        # create no_cluster/full
        path = 'dataset/preprocessed/no_cluster'
        full = data.full_df()
        train_len = data.read_config()[data.TRAIN_LEN_KEY]

        train = full.iloc[0:train_len]
        test = full.iloc[train_len:len(full)]
        target_indices = get_target_indices(test)

        check_folder('dataset/preprocessed/no_cluster/full')
        train.to_csv(os.path.join(path, 'full/train.csv'))
        test.to_csv(os.path.join(path, 'full/test.csv'))
        np.save(os.path.join(path, 'full/train_indices'), train.index)
        np.save(os.path.join(path, 'full/test_indices'), test.index)
        np.save(os.path.join(path, 'full/target_indices'), target_indices)

        no_of_rows_in_small = int(
            input('How many rows do you want in small.csv? '))
        train_small = get_small_dataset(train,
                                        maximum_rows=no_of_rows_in_small)
        check_folder('dataset/preprocessed/no_cluster/small')
        split(train_small, os.path.join(path, 'small'))

        check_folder('dataset/preprocessed/no_cluster/local')
        split(train, os.path.join(path, 'local'))

        # create item_metadata in preprocess folder
        original_item_metadata = data.accomodations_original_df()
        original_item_metadata.to_csv(data.ITEMS_PATH)

        # append missing accomodations to item metadata
        append_missing_accomodations('full')
Example #24
    def recommend_batch(self):

        final_predictions = []
        scores_batch = []

        count = 0
        for index in tqdm(self.target_indices):
            impr = list(
                map(int,
                    data.full_df().loc[index]['impressions'].split('|')))
            pred = self.predictions[count][0:len(impr)]
            couples = list(zip(pred, impr))
            #print(couples)
            couples.sort(key=lambda x: x[0], reverse=True)
            scores, sorted_impr = zip(*couples)
            final_predictions.append((index, list(sorted_impr)))
            scores_batch.append((index, list(sorted_impr), list(scores)))
            count += 1
        if self.mode != 'small':
            cf.check_folder('scores')
            np.save(f'scores/{self.name}', np.array(scores_batch))
        return final_predictions
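Since scores_batch is a list of (index, sorted_impressions, scores) tuples, np.save stores it as an object array; a hedged read-back (numpy only, with name standing in for self.name) therefore needs allow_pickle:

import numpy as np

scores_batch = np.load(f'scores/{name}.npy', allow_pickle=True)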
    def save(self, mode='full', add_unused_clickouts_to_test=True):
        """
        makes use of fit to create the dataset for a specific cluster. in particular it take cares
        to create a folder at the same level of base_split with the specified name and the
        folders structure inside 
        """
        print('Creating {} cluster...'.format(mode), end=' ', flush=True)
        self._fit(mode)

        # create cluster root folder
        path = f'dataset/preprocessed/{self.name}'
        check_folder(path)

        # create full and local folders
        full_path = os.path.join(path, mode)
        check_folder(full_path)

        train = data.train_df(mode).loc[self.train_indices]
        train.to_csv(os.path.join(full_path, 'train.csv'))
        del train

        # if some target_indices are specified, also keep in the test set the clickouts that are not to be predicted
        if add_unused_clickouts_to_test and len(self.target_indices) > 0:
            indices_from_full = list(set(self.test_indices) - set(self.target_indices))
            indices_from_test = self.target_indices
            test = pd.concat([data.test_df(mode).loc[indices_from_test], data.full_df().loc[indices_from_full]])
        else:
            test = data.test_df(mode).loc[self.test_indices]

        test.to_csv(os.path.join(full_path, 'test.csv'))

        if len(self.target_indices) > 0:
            np.save(os.path.join(full_path, 'target_indices'), self.target_indices)
        else:
            trgt_indices = preprocess.get_target_indices(test)
            np.save(os.path.join(full_path, 'target_indices'), trgt_indices)
        del test

        print('Done!')
def create_features_dataframe(mode, cluster):
    SAVE_PATH = f'dataset/preprocessed/FFNN_dataset/dataframes/{cluster}/{mode}'
    check_folder.check_folder(SAVE_PATH)

    # load TRAIN and TEST df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)

    print('extracting features from TRAIN...')
    train_features_dataframe = train_df.groupby(
        ['user_id', 'session_id']).progress_apply(_extract_features)

    train_features_dataframe.to_csv(path_or_buf=f'{SAVE_PATH}/train_df.csv',
                                    index=False)
    del train_features_dataframe

    print('extracting features from TEST...')
    test_features_dataframe = test_df.groupby(
        ['user_id', 'session_id']).progress_apply(_extract_features,
                                                  submission_mode=True)
    test_features_dataframe.to_csv(path_or_buf=f'{SAVE_PATH}/test_df.csv',
                                   index=False)
Example #27
def create_ICM(name='icm.npz', save_path='dataset/matrices/full/'):
    """
    it creates the ICM matrix taking as input the 'item_metadata.csv'
    the matrix is saved in COO format to accomplish easy conversion to csr and csc
    a dictionary is also saved with key = item_id and values = row of icm containing the selected item

    :param name: name of the icm matrix
    :param save_path: saving path
    :param post_processing: post-processing functions to call on the newly created ICM
    :return:
    """
    print("creating ICM...\n")
    tqdm.pandas()
    attributes_df = data.accomodations_df()

    attributes_df['properties'] = attributes_df['properties'].progress_apply(
        lambda x: x.split('|') if isinstance(x, str) else x)
    attributes_df.fillna(value='', inplace=True)
    mlb = MultiLabelBinarizer()
    one_hot_attribute = mlb.fit_transform(attributes_df['properties'].values)
    one_hot_dataframe = pd.DataFrame(one_hot_attribute, columns=mlb.classes_)

    print("ICM created succesfully!\n")
    print("creating dictionary...\n")
    dict = {}
    item_ids = attributes_df['item_id'].values
    for i in tqdm(range(len(item_ids))):
        dict[item_ids[i]] = i

    print("saving ICM...\n")
    cf.check_folder(save_path)
    sps.save_npz(save_path + name,
                 sps.coo_matrix(one_hot_dataframe.as_matrix()))

    print("saving dictionary")
    np.save(save_path + 'icm_dict.npy', dict)

    print("Procedure ended succesfully!")
    def scores():
        mode = 'full'
        model = interactive_model(mode)

        checkpoint_path = menu.checkpoint_selection(
            checkpoints_dir='saved_models')

        print('Loading {}...'.format(checkpoint_path), end='\r', flush=True)
        model.load(checkpoint_path)
        print('Done!', flush=True)

        # get scores for train and test and save them
        scores_folder = 'scores'
        check_folder(scores_folder)

        for st in ['train', 'test']:
            print('Building scores for {} {}...'.format(st, mode))
            scores = model.get_scores_batch(scores_type=st)

            print('Saving scores for {} {}...'.format(st, mode))
            scores_filename = '{}_scores_{}'.format(model.name, st)
            np.save(os.path.join(scores_folder, scores_filename),
                    np.array(scores))
Example #29
def create_dataset(mode, cluster):
    # training
    features_array = [SessionDevice,
                      SessionSortOrderWhenClickout,
                      PricePositionInfoInteractedReferences,
                      SessionLength,
                      TimeFromLastActionBeforeClk,
                      LastActionBeforeClickout,
                      NumInteractionsWithFirstImpression,
                      FirstImpressionPrice,
                      LastActionInvolvingFirstImpressions,
                      NumImpressionsInClickout,
                      #Platform,
                      RNNOutput,
                      PriceStats,
                      PopularityFirstImpression,
                      AvgInteractedPrice,
                      PopularityClickoutFirstImpression,
                      LocationReferenceFirstImpression,
                      PlatformReferenceFirstImpression,
                      FrenzyFactorSession,
                      StarsRatingsFirstImpression,
                      ActionsCountClassifier,
                      FirstImpressionPriceInfo,
                      SessionActionNumRefDiffFromImpressions,
                      TimingFromLastInteractionFirstImpression,
                      DayOfWeekAndMomentInDay,
                      UserFeatureFirstImpression
                      ]

    train_df, test_df = merge_features_classifier(mode, cluster, features_array, LabelClassification)
    check_folder('dataset/preprocessed/{}/{}/xgboost_classifier/'.format(cluster, mode))

    train_df.to_csv('dataset/preprocessed/{}/{}/xgboost_classifier/train.csv'.format(cluster, mode), index=False)
    test_df.to_csv('dataset/preprocessed/{}/{}/xgboost_classifier/test.csv'.format(cluster, mode), index=False)

    print("Dataset created!")
def download_scores_and_sub(pattern, user_name='ubuntu'):
    path_to_pem = easygui.fileopenbox(
        msg='pick the pem', default='~/Downloads')
    ip = easygui.enterbox("What's the IP?")
    downloads_folder = join(expanduser("~"), 'Downloads')

    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    ssh.connect(ip, username=user_name, key_filename=path_to_pem)

    print("transferring scores")
    bp = '~/recsys2019/scores/'
    command = 'find {} -name "*{}*.npy"'.format(bp, pattern)
    stdin, stdout, stderr = ssh.exec_command(command)
    filelist = stdout.read().splitlines()
    sftp = ssh.open_sftp()
    for afile in filelist:
        filename = afile.decode("utf-8")
        print('transferring {}'.format(filename))
        check_folder(join(downloads_folder, pattern)+'/')
        sftp.get(filename, join(downloads_folder, pattern,
                                filename.split('/')[-1]), progress)

    print("transferring subs")
    bp = '~/recsys2019/submissions/'
    command = 'find {} -name "*{}*.csv"'.format(bp, pattern)
    stdin, stdout, stderr = ssh.exec_command(command)
    filelist = stdout.read().splitlines()
    sftp = ssh.open_sftp()
    for afile in filelist:
        filename = afile.decode("utf-8")
        print('transferring {}'.format(filename))
        check_folder(join(downloads_folder, pattern)+'/')
        sftp.get(filename, join(downloads_folder, pattern,
                                filename.split('/')[-1]), progress)

    print("transferring models")
    bp = '~/recsys2019/models/'
    command = 'find {} -name "*{}*.model"'.format(bp, pattern)
    stdin, stdout, stderr = ssh.exec_command(command)
    filelist = stdout.read().splitlines()
    sftp = ssh.open_sftp()
    for afile in filelist:
        filename = afile.decode("utf-8")
        print('transferring {}'.format(filename))
        check_folder(join(downloads_folder, pattern)+'/')
        sftp.get(filename, join(downloads_folder, pattern,
                                filename.split('/')[-1]), progress)

    sftp.close()