# Example #1
    def test_convert_numpy_to_libffm(self):
        """Round-trip a dense numpy array through the libffm format.

        Writes ``self.X``/``self.y``/``self.fields`` to a temporary file
        and verifies that the values parsed back match the originals.
        """
        tmp = tempfile.NamedTemporaryFile(delete=True)

        # Serialize features, labels and field indices in libffm layout.
        write_data_to_xlearn_format(self.X, self.y, tmp.name, fields=self.fields)

        # Parse the file back into arrays for comparison.
        X_back, y_back, fields_back = self._read_libffm_file(tmp.name)
        tmp.close()

        assert np.isclose(self.X, X_back).all()
        assert (self.y.ravel() == y_back.ravel()).all()
        assert (self.fields.ravel() == fields_back.ravel()).all()
# Example #2
    def test_convert_csr_to_libsvm(self):
        """Round-trip a CSR matrix through the libsvm format.

        Writes the sparse features and labels to a temporary file, loads
        them back with ``load_svmlight_file`` and checks they are unchanged.
        """
        X_sparse = csr_matrix(self.X)
        tmp = tempfile.NamedTemporaryFile(delete=True)

        # Serialize the sparse matrix and labels in libsvm layout.
        write_data_to_xlearn_format(X_sparse, self.y, tmp.name)

        # Load the file back and compare against the original data.
        X_back, y_back = load_svmlight_file(tmp.name)
        tmp.close()

        assert np.isclose(X_sparse.todense(), X_back.todense()).all()
        assert (self.y.ravel() == y_back.ravel()).all()
# Example #3
    def test_convert_csr_to_libffm(self):
        """Round-trip a CSR matrix through the libffm format.

        Writes the sparse features, labels and field indices to a temporary
        file and verifies that the values parsed back match the originals.
        """
        X_sparse = csr_matrix(self.X)
        tmp = tempfile.NamedTemporaryFile(delete=True)

        # Serialize features, labels and field indices in libffm layout.
        write_data_to_xlearn_format(X_sparse, self.y, tmp.name, fields=self.fields)

        # Parse the file back into arrays for comparison.
        X_back, y_back, fields_back = self._read_libffm_file(tmp.name)
        tmp.close()

        assert np.isclose(X_sparse.todense(), X_back).all()
        assert (self.y.ravel() == y_back.ravel()).all()
        assert (self.fields.ravel() == fields_back.ravel()).all()
# Example #4
    def test_convert_numpy_to_libsvm(self):
        """Round-trip a dense numpy array through the libsvm format.

        Writes ``self.X``/``self.y`` to a temporary file, loads them back
        with ``load_svmlight_file`` and checks they match the originals.

        The file is created with ``delete=False`` so it can be reopened by
        name; cleanup therefore lives in a ``finally`` block so the
        temporary file is removed even if writing or loading raises
        (the original leaked the file on any exception).
        """
        file = tempfile.NamedTemporaryFile(delete=False)
        try:
            # write to temporary files
            write_data_to_xlearn_format(self.X, self.y, file.name)

            # load data back and compare if they are the same as original data
            X_true, y_true = load_svmlight_file(file.name)
        finally:
            file.close()
            if os.path.exists(file.name):
                os.remove(file.name)

        assert np.all(np.isclose(self.X, X_true.todense()))
        assert np.all(self.y.ravel() == y_true.ravel())
# Example #5
def convertion(inputpath='data/'):
    """Convert preprocessed train/test matrices to xLearn libffm files.

    Loads sparse feature matrices and labels from ``inputpath``, builds the
    per-column field index list from the pickled field sizes, splits off a
    1% evaluation set, and writes ``train.txt``/``evals.txt``/``test.txt``
    in libffm format into the same directory.

    Parameters
    ----------
    inputpath : str
        Directory containing ``train_x.npz``, ``train_y.csv``,
        ``test_x.npz`` and ``field_size.pk``.  Paths are now joined with
        ``os.path.join``, so a trailing slash is no longer required
        (the original string concatenation silently produced wrong paths
        without one).
    """
    import os  # local import: keeps this snippet self-contained

    print('Load data....')
    data_x = sparse.load_npz(os.path.join(inputpath, 'train_x.npz'))
    data_y = pd.read_csv(os.path.join(inputpath, 'train_y.csv'), header=None)
    data_y = np.array(data_y[0])
    test_x = sparse.load_npz(os.path.join(inputpath, 'test_x.npz')).tocsr()
    # The test set carries no labels; xLearn still expects a label column.
    test_y = np.zeros(test_x.shape[0])

    print('Preparing field list....')
    with open(os.path.join(inputpath, 'field_size.pk'), 'rb') as f:
        field_size = pickle.load(f)
    # Column j belongs to field i iff j falls inside field i's span, i.e.
    # each field index repeated once per column of that field.  Equivalent
    # to the original cumulative-start/while-loop construction.
    field_list = np.repeat(np.arange(len(field_size)),
                           field_size).astype('int32')

    print('slice into train and evals....')
    train_x, evals_x, train_y, evals_y = train_test_split(data_x,
                                                          data_y,
                                                          test_size=0.01)
    # Release the full matrices before the memory-hungry file dumps.
    del data_x, data_y
    gc.collect()
    print('train dataset, samples:%d' % len(train_y))
    print('evals dataset, samples:%d' % len(evals_y))

    print('start write train.txt....')
    write_data_to_xlearn_format(train_x,
                                train_y,
                                os.path.join(inputpath, 'train.txt'),
                                fields=field_list)
    print('start write evals.txt....')
    write_data_to_xlearn_format(evals_x,
                                evals_y,
                                os.path.join(inputpath, 'evals.txt'),
                                fields=field_list)
    print('start write test.txt....')
    write_data_to_xlearn_format(test_x,
                                test_y,
                                os.path.join(inputpath, 'test.txt'),
                                fields=field_list)
# Example #6
    def recommend(self,
                  user_id_array,
                  cutoff=None,
                  remove_seen_flag=True,
                  items_to_compute=None,
                  remove_top_pop_flag=False,
                  remove_custom_items_flag=False,
                  return_scores=False):
        """Rank candidate items for the given user(s) with the xLearn model.

        Builds (or reuses, if cached on disk) a libffm-format file of
        (user, candidate-item) rows, scores it with the trained model, and
        returns the per-user item ranking.

        Parameters
        ----------
        user_id_array : int or array-like of int
            User id(s) to recommend for; a scalar is promoted to a 1-element
            array and unwrapped again before returning.
        cutoff : int, optional
            Number of ranked items to keep; defaults to n_items - 1.
        remove_seen_flag, items_to_compute, remove_top_pop_flag,
        remove_custom_items_flag :
            Present for interface compatibility; not referenced in this body.
        return_scores : bool
            If True, also return a scores array (see NOTE at the return).

        Returns
        -------
        Ranking array (or single ranking row for a scalar user id), plus an
        uninitialized scores placeholder when ``return_scores`` is True.
        """
        # Promote a scalar user id to an array; remember to unwrap the result.
        if np.isscalar(user_id_array):
            user_id_array = np.atleast_1d(user_id_array)
            single_user = True
        else:
            single_user = False

        if cutoff is None:
            cutoff = self.URM_train.shape[1] - 1

        n_items = self.max_items_to_predict
        items_to_recommend = self._get_items_to_recommend(
            user_id_array, n_items)
        # Cache file name depends on the user-id span and on whether side
        # information (ICM/UCM) was used when building the training set.
        if self.ICM_train is None and self.UCM_train is None:
            recommendation_file = os.path.join(
                self.temp_folder,
                "recommendations_{}_{}.txt".format(user_id_array[0],
                                                   user_id_array[-1]))
        else:
            recommendation_file = os.path.join(
                self.temp_folder, "recommendations_with_ICM_{}_{}.txt".format(
                    user_id_array[0], user_id_array[-1]))
        # Only build the libffm file if it is not already cached on disk.
        if not os.path.isfile(recommendation_file):
            fm_matrix = format_URM_slice_uncompressed(
                user_id_array, items_to_recommend, self.URM_train.shape[0],
                self.URM_train.shape[0] + self.URM_train.shape[1])

            # First ICM, then UCM just like in the creation on the training set
            if self.ICM_train is not None:
                fm_matrix = add_ICM_info(fm_matrix, self.ICM_train,
                                         self.URM_train.shape[0])
            if self.UCM_train is not None:
                fm_matrix = add_UCM_info(fm_matrix, self.UCM_train, 0)
            fm_matrix = fm_matrix[:, :]
            # Dummy labels: prediction files still require a label column.
            labels = np.ones(shape=fm_matrix.shape[0])
            write_data_to_xlearn_format(X=fm_matrix,
                                        y=labels,
                                        filepath=recommendation_file,
                                        fields=self.fields)
        self.model.setSigmoid()
        self.model.setTest(recommendation_file)

        prediction_file = os.path.join(self.model_folder, "prediction.txt")
        # NOTE(review): predict is called twice -- once writing to
        # prediction_file, once returning scores in memory.  Presumably the
        # file write is a deliberate side effect; confirm it is not redundant.
        self.model.predict(model_path=self.model_path,
                           out_path=prediction_file)
        # Reshape the flat score vector to (n_users, n_candidate_items).
        scores_batch = np.reshape(
            self.model.predict(model_path=self.model_path),
            newshape=(items_to_recommend.shape[0],
                      items_to_recommend.shape[1]))
        # Partial top-cutoff selection, then sort only the selected slice --
        # avoids a full argsort over all candidate items.
        relevant_items_partition = (-scores_batch).argpartition(
            cutoff, axis=1)[:, 0:cutoff]
        relevant_items_partition_original_value = scores_batch[
            np.arange(scores_batch.shape[0])[:,
                                             None], relevant_items_partition]
        relevant_items_partition_sorting = np.argsort(
            -relevant_items_partition_original_value, axis=1)
        score_index_list = relevant_items_partition[
            np.arange(relevant_items_partition.shape[0])[:, None],
            relevant_items_partition_sorting]
        # Map the column indices back to actual item ids.
        ranking_list = items_to_recommend[np.arange(scores_batch.shape[0]),
                                          np.transpose(score_index_list)].T

        if single_user:
            ranking_list = ranking_list[0]

        if return_scores:
            # NOTE(review): np.empty returns UNINITIALIZED memory -- callers
            # must not rely on these score values; verify this is intended.
            return ranking_list, np.empty(shape=(len(user_id_array),
                                                 self.n_items))

        else:
            return ranking_list
    # Prepare train sparse matrix and labels for dumping to file.
    # (Fragment: the enclosing function's signature is outside this view;
    # URM_FM_matrix / URM_positive_FM_matrix / URM_negative_FM_matrix /
    # fields / fm_data_path are presumably defined earlier in it.)
    FM_sps_matrix = URM_FM_matrix.copy()
    # Positive interactions are labelled 1, negatives 0.  `np.int` was
    # deprecated in NumPy 1.20 and removed in 1.24; the builtin `int` is
    # the documented, semantically identical replacement.
    labels = np.concatenate([
        np.ones(shape=URM_positive_FM_matrix.shape[0], dtype=int).tolist(),
        np.zeros(shape=URM_negative_FM_matrix.shape[0], dtype=int).tolist()
    ])

    # Fixed seed so the train/valid split is reproducible across runs.
    random_state = 69420
    x_train, x_valid, y_train, y_valid = train_test_split(
        FM_sps_matrix,
        labels,
        shuffle=True,
        test_size=0.1,
        random_state=random_state)

    # Dump libffm file for train set
    print("Writing train and valid dataset in libffm format...")
    train_file_path = os.path.join(fm_data_path,
                                   "users_25_item_20_train_uncompressed.txt")
    valid_file_path = os.path.join(fm_data_path,
                                   "users_25_item_20_valid_uncompressed.txt")
    write_data_to_xlearn_format(X=x_train,
                                y=y_train,
                                fields=fields,
                                filepath=train_file_path)
    write_data_to_xlearn_format(X=x_valid,
                                y=y_valid,
                                fields=fields,
                                filepath=valid_file_path)
    print("...Writing is over.")