# Example 1
    def get_S_incremental_and_set_W(self):
        """
        Retrieve the similarity matrix computed by the Cython epoch object and
        store it in W_sparse, filtered to the top_k values per item unless the
        model was trained with sparse weights
        """

        self.S_incremental = self.cython_epoch.get_S()

        if self.train_with_sparse_weights:
            self.W_sparse = self.S_incremental
        else:
            self.W_sparse = similarityMatrixTopK(self.S_incremental,
                                                 k=self.top_k)

        self.W_sparse = check_matrix(self.W_sparse, format='csr')

    def applyPearsonCorrelation(self):
        """
        Subtract from every data point the average of the corresponding column
        (column-wise mean centering, as required by the Pearson correlation)
        """

        self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

        interactionsPerCol = np.diff(self.dataMatrix.indptr)

        nonzeroCols = interactionsPerCol > 0
        sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

        colAverage = np.zeros_like(sumPerCol)
        colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[
            nonzeroCols]

        # Split in blocks to avoid duplicating the whole data structure
        start_col = 0
        end_col = 0

        blockSize = 1000

        while end_col < self.n_columns:
            end_col = min(self.n_columns, end_col + blockSize)

            self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \
                np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col])

            start_col += blockSize

    def applyAdjustedCosine(self):
        """
        Subtract from every data point the average of the corresponding row
        (row-wise mean centering, as required by the adjusted cosine similarity)
        """

        self.dataMatrix = check_matrix(self.dataMatrix, 'csr')

        interactionsPerRow = np.diff(self.dataMatrix.indptr)

        nonzeroRows = interactionsPerRow > 0
        sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

        rowAverage = np.zeros_like(sumPerRow)
        rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[
            nonzeroRows]

        # Split in blocks to avoid duplicating the whole data structure
        start_row = 0
        end_row = 0

        blockSize = 1000

        while end_row < self.n_rows:
            end_row = min(self.n_rows, end_row + blockSize)

            self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \
                np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row])

            start_row += blockSize

    def fit(self,
            urm_train,
            alpha=0.41417,
            beta=0.04995,
            top_k=54,
            min_rating=0,
            implicit=True,
            normalize_similarity=True,
            save_matrix=False,
            load_matrix=False):

        self.urm_train = urm_train

        if not load_matrix:
            self.alpha = alpha
            self.beta = beta
            self.min_rating = min_rating
            self.top_k = top_k
            self.implicit = implicit
            self.normalize_similarity = normalize_similarity

            if self.min_rating > 0:
                self.urm_train.data[self.urm_train.data < self.min_rating] = 0
                self.urm_train.eliminate_zeros()
                if self.implicit:
                    self.urm_train.data = np.ones(self.urm_train.data.size,
                                                  dtype=np.float32)

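            # RP3beta: scores correspond to a 3-step random walk
            # user -> item -> user -> item (p_ui * p_iu * p_ui); the transition
            # probabilities are raised to the power alpha and each destination
            # item is penalized by its popularity to the power beta
            # (Paudel et al., 2017)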
            # p_ui is the row-normalized urm
            p_ui = normalize(self.urm_train, norm='l1', axis=1)

            # p_iu is the column-normalized, "boolean" urm transposed
            x_bool = self.urm_train.transpose(copy=True)
            x_bool.data = np.ones(x_bool.data.size, np.float32)

            # Take the degree of each item to penalize the most popular ones
            # Some rows might be zero; make sure their degree stays zero
            x_bool_sum = np.array(x_bool.sum(axis=1)).ravel()

            degree = np.zeros(self.urm_train.shape[1])

            non_zero_mask = x_bool_sum != 0.0

            degree[non_zero_mask] = np.power(x_bool_sum[non_zero_mask],
                                             -self.beta)

            # NOTE: axis is still 1 because the matrix was transposed before normalization
            p_iu = normalize(x_bool, norm='l1', axis=1)
            del x_bool

            # alpha power
            if self.alpha != 1.:
                p_ui = p_ui.power(self.alpha)
                p_iu = p_iu.power(self.alpha)

            # Final matrix is computed as p_ui * p_iu * p_ui
            # Multiplication unpacked for memory usage reasons
            block_dim = 200
            d_t = p_iu

            # Use preallocated numpy arrays, which need less memory than Python lists
            data_block = 10000000

            rows = np.zeros(data_block, dtype=np.int32)
            cols = np.zeros(data_block, dtype=np.int32)
            values = np.zeros(data_block, dtype=np.float32)
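
            # Whenever the buffers fill up they are grown by another data_block
            # chunk (see below); only the first num_cells entries are used when
            # the final sparse matrix is assembled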

            num_cells = 0

            start_time = time.time()
            start_time_print_batch = start_time

            for current_block_start_row in range(0, p_ui.shape[1], block_dim):

                if current_block_start_row + block_dim > p_ui.shape[1]:
                    block_dim = p_ui.shape[1] - current_block_start_row

                similarity_block = d_t[
                    current_block_start_row:current_block_start_row +
                    block_dim, :] * p_ui
                similarity_block = similarity_block.toarray()

                for row_in_block in range(block_dim):
                    row_data = np.multiply(similarity_block[row_in_block, :],
                                           degree)
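                    # Zero the self-similarity so an item cannot be recommended
                    # as its own neighbour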
                    row_data[current_block_start_row + row_in_block] = 0

                    best = row_data.argsort()[::-1][:self.top_k]

                    not_zeros_mask = row_data[best] != 0.0

                    values_to_add = row_data[best][not_zeros_mask]
                    cols_to_add = best[not_zeros_mask]

                    for index in range(len(values_to_add)):

                        if num_cells == len(rows):
                            rows = np.concatenate(
                                (rows, np.zeros(data_block, dtype=np.int32)))
                            cols = np.concatenate(
                                (cols, np.zeros(data_block, dtype=np.int32)))
                            values = np.concatenate(
                                (values, np.zeros(data_block,
                                                  dtype=np.float32)))

                        rows[num_cells] = current_block_start_row + row_in_block
                        cols[num_cells] = cols_to_add[index]
                        values[num_cells] = values_to_add[index]

                        num_cells += 1

                if time.time() - start_time_print_batch > 1:
                    print("Processed {} ( {:.2f}% ) in {:.2f} minutes. "
                          "Rows per second: {:.0f}".format(
                              current_block_start_row,
                              100.0 * float(current_block_start_row) / p_ui.shape[1],
                              (time.time() - start_time) / 60,
                              float(current_block_start_row) / (time.time() - start_time)))

                    sys.stdout.flush()
                    sys.stderr.flush()

                    start_time_print_batch = time.time()

            self.W_sparse = sps.csr_matrix(
                (values[:num_cells], (rows[:num_cells], cols[:num_cells])),
                shape=(p_ui.shape[1], p_ui.shape[1]))

            if self.normalize_similarity:
                self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

            if self.top_k:
                self.W_sparse = similarityMatrixTopK(self.W_sparse,
                                                     k=self.top_k)

            self.W_sparse = check_matrix(self.W_sparse, format='csr')
            if save_matrix:
                sps.save_npz("../tmp/RP3beta_similarity_matrix.npz",
                             self.W_sparse)
                print("Matrix saved!")
        else:
            print("Loading RP3beta_similarity_matrix.npz file...")
            self.W_sparse = sps.load_npz(
                "../tmp/RP3beta_similarity_matrix.npz")
            print("Matrix loaded!")
# Example 5
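
# SLIM with ElasticNet (Ning & Karypis, 2011): the item-item weight matrix W is
# learned one column at a time by regressing each item's interaction column on
# all the other columns with an elastic-net (L1 + L2) penalty, keeping only
# non-negative weights and the largest top_k coefficients per item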
    def fit(self,
            urm_train,
            l1_ratio=1,
            positive_only=True,
            top_k=100,
            save_matrix=False,
            load_matrix=False):

        self.urm_train = urm_train

        if not load_matrix:
            assert 0 <= l1_ratio <= 1, \
                "SLIM_ElasticNet: l1_ratio must be between 0 and 1, provided value was {}".format(l1_ratio)

            self.l1_ratio = l1_ratio
            self.positive_only = positive_only
            self.topK = top_k

            # initialize the ElasticNet model
            self.model = ElasticNet(alpha=1e-4,
                                    l1_ratio=self.l1_ratio,
                                    positive=self.positive_only,
                                    fit_intercept=False,
                                    copy_X=False,
                                    precompute=True,
                                    selection='random',
                                    max_iter=100,
                                    tol=1e-4)

            urm_train = check_matrix(self.urm_train, 'csc', dtype=np.float32)

            n_items = urm_train.shape[1]

            # Use preallocated numpy arrays, which need less memory than Python lists
            data_block = 10000000

            rows = np.zeros(data_block, dtype=np.int32)
            cols = np.zeros(data_block, dtype=np.int32)
            values = np.zeros(data_block, dtype=np.float32)

            num_cells = 0

            start_time = time.time()
            start_time_print_batch = start_time

            # fit each item's factors sequentially (not in parallel)
            for currentItem in range(n_items):

                # get the target column
                y = urm_train[:, currentItem].toarray()

                if y.sum() == 0.0:
                    continue

                # set the j-th column of X to zero
                start_pos = urm_train.indptr[currentItem]
                end_pos = urm_train.indptr[currentItem + 1]

                current_item_data_backup = urm_train.data[
                    start_pos:end_pos].copy()
                urm_train.data[start_pos:end_pos] = 0.0
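
                # Masking the target column enforces the SLIM constraint
                # diag(W) = 0; without it the regression would return the
                # trivial identity solution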

                # fit one ElasticNet model per column
                self.model.fit(urm_train, y)

                # self.model.coef_ contains the coefficient of the ElasticNet model
                # let's keep only the non-zero values

                # Select topK values
                # Sorting is done in three steps, faster than a plain np.argsort for a large number of items:
                # - Partition the data to extract the set of relevant items
                # - Sort only the relevant items
                # - Get the original item index

                # nonzero_model_coef_index = self.model.coef_.nonzero()[0]
                # nonzero_model_coef_value = self.model.coef_[nonzero_model_coef_index]

                nonzero_model_coef_index = self.model.sparse_coef_.indices
                nonzero_model_coef_value = self.model.sparse_coef_.data
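
                # sparse_coef_ is coef_ stored as a 1 x n_items CSR matrix, so
                # .indices / .data directly give the positions and values of
                # the nonzero coefficients without scanning the dense array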

                # Skip items for which the model found no nonzero coefficient
                if len(nonzero_model_coef_value) == 0:
                    # Restore the masked column before moving on
                    urm_train.data[start_pos:end_pos] = current_item_data_backup
                    continue

                local_top_k = min(len(nonzero_model_coef_value), self.topK)

                relevant_items_partition = (
                    -nonzero_model_coef_value
                ).argpartition(local_top_k - 1)[0:local_top_k]
                relevant_items_partition_sorting = np.argsort(
                    -nonzero_model_coef_value[relevant_items_partition])
                ranking = relevant_items_partition[
                    relevant_items_partition_sorting]

                for index in range(len(ranking)):

                    if num_cells == len(rows):
                        rows = np.concatenate(
                            (rows, np.zeros(data_block, dtype=np.int32)))
                        cols = np.concatenate(
                            (cols, np.zeros(data_block, dtype=np.int32)))
                        values = np.concatenate(
                            (values, np.zeros(data_block, dtype=np.float32)))

                    rows[num_cells] = nonzero_model_coef_index[ranking[index]]
                    cols[num_cells] = currentItem
                    values[num_cells] = nonzero_model_coef_value[ranking[index]]

                    num_cells += 1

                # finally, replace the original values of the j-th column
                urm_train.data[start_pos:end_pos] = current_item_data_backup

                if time.time() - start_time_print_batch > 300 or currentItem == n_items - 1:
                    print("Processed {} ( {:.2f}% ) in {:.2f} minutes. "
                          "Items per second: {:.0f}".format(
                              currentItem + 1,
                              100.0 * float(currentItem + 1) / n_items,
                              (time.time() - start_time) / 60,
                              float(currentItem + 1) / (time.time() - start_time)))
                    sys.stdout.flush()
                    sys.stderr.flush()

                    start_time_print_batch = time.time()

            # generate the sparse weight matrix
            self.W_sparse = sps.csr_matrix(
                (values[:num_cells], (rows[:num_cells], cols[:num_cells])),
                shape=(n_items, n_items),
                dtype=np.float32)
            if save_matrix:
                sps.save_npz("../tmp/SLIM_ElasticNet_similarity_matrix.npz",
                             self.W_sparse)
                print("Matrix saved!")
        else:
            print("Loading SLIM_ElasticNet_similarity_matrix.npz file...")
            self.W_sparse = sps.load_npz(
                "../tmp/SLIM_ElasticNet_similarity_matrix.npz")
            print("Matrix loaded!")

    def compute_similarity(self, start_col=None, end_col=None, block_size=100):
        """
        Compute the similarity matrix for the given dataset
        :param start_col: column to begin with
        :param end_col: column to stop before, end_col is excluded
        :param block_size: number of columns to process per block
        :return: the item-item similarity, dense if TopK == 0, sparse CSR otherwise
        """

        values = []
        rows = []
        cols = []

        start_time = time.time()
        start_time_print_batch = start_time
        processedItems = 0

        if self.adjusted_cosine:
            self.applyAdjustedCosine()

        elif self.pearson_correlation:
            self.applyPearsonCorrelation()

        elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient:
            self.useOnlyBooleanInteractions()

        # We explore the matrix column-wise
        self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

        # Compute sum of squared values to be used in normalization
        sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

        # Tanimoto, Dice and Tversky work on raw counts and do not require the square root
        if not (self.tanimoto_coefficient or self.dice_coefficient
                or self.tversky_coefficient):
            sumOfSquared = np.sqrt(sumOfSquared)
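
        # For boolean interactions the squared sum of a column equals its number
        # of interactions |A|; the set-based measures below use these raw counts,
        # while the cosine-type measures need the square root (the vector norm)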

        if self.asymmetric_cosine:
            # sumOfSquared is a numpy array, so np.power must be used
            # (ndarray has no .power method)
            sumOfSquared_to_1_minus_alpha = np.power(
                sumOfSquared, 2 * (1 - self.asymmetric_alpha))
            sumOfSquared_to_alpha = np.power(sumOfSquared,
                                             2 * self.asymmetric_alpha)

        start_col_local = 0
        end_col_local = self.n_columns

        if start_col is not None and start_col > 0 and start_col < self.n_columns:
            start_col_local = start_col

        if end_col is not None and end_col > start_col_local and end_col < self.n_columns:
            end_col_local = end_col

        start_col_block = start_col_local

        this_block_size = 0

        # Compute all similarities for each item using vectorization
        while start_col_block < end_col_local:

            # Add previous block size
            processedItems += this_block_size

            end_col_block = min(start_col_block + block_size, end_col_local)
            this_block_size = end_col_block - start_col_block

            if time.time() - start_time_print_batch >= 30 or end_col_block == end_col_local:
                columnPerSec = processedItems / (time.time() - start_time)

                print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, "
                      "elapsed time {:.2f} min".format(
                          processedItems,
                          processedItems / (end_col_local - start_col_local) * 100,
                          columnPerSec,
                          (time.time() - start_time) / 60))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print_batch = time.time()

            # All data points for a given item
            item_data = self.dataMatrix[:, start_col_block:end_col_block]
            item_data = item_data.toarray().squeeze()

            if self.use_row_weights:
                # item_data = np.multiply(item_data, self.row_weights)
                # item_data = item_data.T.dot(self.row_weights_diag).T
                this_block_weights = self.dataMatrix_weighted.T.dot(item_data)

            else:
                # Compute item similarities
                this_block_weights = self.dataMatrix.T.dot(item_data)

            for col_index_in_block in range(this_block_size):

                if this_block_size == 1:
                    this_column_weights = this_block_weights
                else:
                    this_column_weights = this_block_weights[:, col_index_in_block]

                columnIndex = col_index_in_block + start_col_block
                this_column_weights[columnIndex] = 0.0

                # Apply normalization and shrinkage, ensure denominator != 0
                if self.normalize:

                    if self.asymmetric_cosine:
                        denominator = sumOfSquared_to_alpha[
                            columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6
                    else:
                        denominator = sumOfSquared[
                            columnIndex] * sumOfSquared + self.shrink + 1e-6

                    this_column_weights = np.multiply(this_column_weights,
                                                      1 / denominator)

                # Apply the specific denominator for Tanimoto
                elif self.tanimoto_coefficient:
                    denominator = sumOfSquared[
                        columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights,
                                                      1 / denominator)

                elif self.dice_coefficient:
                    denominator = sumOfSquared[
                        columnIndex] + sumOfSquared + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights,
                                                      1 / denominator)

                elif self.tversky_coefficient:
                    denominator = this_column_weights + \
                                  (sumOfSquared[columnIndex] - this_column_weights) * self.tversky_alpha + \
                                  (sumOfSquared - this_column_weights) * self.tversky_beta + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights,
                                                      1 / denominator)

                # If no normalization or tanimoto is selected, apply only shrink
                elif self.shrink != 0:
                    this_column_weights = this_column_weights / self.shrink

                # this_column_weights = this_column_weights.toarray().ravel()

                if self.TopK == 0:
                    self.W_dense[:, columnIndex] = this_column_weights

                else:
                    # Sort indices and select TopK
                    # Sorting is done in three steps, faster than a plain np.argsort for a large number of items:
                    # - Partition the data to extract the set of relevant items
                    # - Sort only the relevant items
                    # - Get the original item index
                    relevant_items_partition = (
                        -this_column_weights).argpartition(self.TopK -
                                                           1)[0:self.TopK]
                    relevant_items_partition_sorting = np.argsort(
                        -this_column_weights[relevant_items_partition])
                    top_k_idx = relevant_items_partition[
                        relevant_items_partition_sorting]

                    # Incrementally build sparse matrix, do not add zeros
                    notZerosMask = this_column_weights[top_k_idx] != 0.0
                    numNotZeros = np.sum(notZerosMask)

                    values.extend(this_column_weights[top_k_idx][notZerosMask])
                    rows.extend(top_k_idx[notZerosMask])
                    cols.extend(np.ones(numNotZeros) * columnIndex)

            start_col_block += block_size

        # End while on columns

        if self.TopK == 0:
            return self.W_dense

        else:

            W_sparse = sps.csr_matrix((values, (rows, cols)),
                                      shape=(self.n_columns, self.n_columns),
                                      dtype=np.float32)

            return W_sparse
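
# Minimal usage sketch (hypothetical: the class name and constructor parameters
# below are assumptions, adjust them to the actual classes of this codebase):
#
#     similarity = Compute_Similarity_Python(urm_train, topK=100, shrink=10,
#                                            normalize=True)
#     W_sparse = similarity.compute_similarity()
#     W_block = similarity.compute_similarity(start_col=0, end_col=1000)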