Example #1
    def fit(self, topK=350, shrink=10, sim_type='splus'):

        m = similarity(self.icm.T, k=topK, sim_type=sim_type, shrink=shrink)
        m = self._check_matrix(m, format='csr')
        self.sim_matrix = normalize(m, norm='l2', axis=0)
        
        self.r_hat = self.urm.dot(self.sim_matrix)
Example #2
    def _compute_W_sparse(self, use_incremental=False):

        if use_incremental:
            feature_weights = self.D_incremental
        else:
            feature_weights = self.D_best

        # Apply the learned feature weights before computing the similarity:
        # scaling each feature by sqrt(w) makes the pairwise dot products
        # equal ICM * diag(feature_weights) * ICM.T
        ICM_weighted = self.ICM.multiply(np.sqrt(feature_weights)).tocsr()

        self.W_sparse = similarity(ICM_weighted.T, shrink=0, k=self.topK)
        self.sparse_weights = True
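A minimal standalone sketch of the weighting trick used above (toy data and hypothetical weights; `similarity` replaced by a plain dot product): scaling each feature by the square root of its weight reproduces the weighted similarity ICM * diag(w) * ICM^T.

    import numpy as np
    import scipy.sparse as sps

    # Toy ICM: 3 items x 4 features (illustrative values only)
    ICM = sps.csr_matrix(np.array([[1., 0., 1., 0.],
                                   [0., 1., 1., 0.],
                                   [1., 1., 0., 1.]]))
    w = np.array([0.5, 2.0, 1.0, 0.1])  # hypothetical learned feature weights

    # Weighted item-item dot product: ICM @ diag(w) @ ICM.T
    direct = (ICM @ sps.diags(w) @ ICM.T).toarray()

    # Same result by pre-scaling the features with sqrt(w)
    ICM_scaled = ICM.multiply(np.sqrt(w)).tocsr()
    rescaled = (ICM_scaled @ ICM_scaled.T).toarray()

    assert np.allclose(direct, rescaled)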
Example #3
    def fit(self, topK=210, shrink=460, sim_type='jaccard'):

        self.topK = topK
        self.shrink = shrink

        m = similarity(self.urm, k=topK, sim_type=sim_type, shrink=shrink)
        m = self._check_matrix(m, format='csr')
        self.sim_matrix = normalize(m, norm='l2', axis=1)

        self.r_hat = self.urm.dot(self.sim_matrix)
Example #4
    def fit(self, topK=110, shrink=350, sim_type='cosine'):

        self.topK = topK
        self.shrink = shrink

        m = similarity(self.urm.T, k=topK, sim_type=sim_type, shrink=shrink)
        m = self._check_matrix(m, format='csr')
        self.sim_matrix = normalize(m, norm='l2', axis=1)

        self.r_hat = self.sim_matrix.dot(self.urm)
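In both the item-based variant (Example #3) and the user-based one (Example #4), `r_hat` holds one row of predicted scores per user. A minimal sketch of how such a score matrix is typically turned into a top-N recommendation list; the helper name and the exclusion of already-seen items are assumptions, not part of the snippets above.

    import numpy as np

    def recommend_top_n(r_hat, urm, user_id, n=10):
        """Rank the n best unseen items for one user.

        r_hat and urm are scipy.sparse CSR matrices with one row per user.
        """
        scores = r_hat[user_id].toarray().ravel()

        # Exclude items the user has already interacted with
        scores[urm[user_id].indices] = -np.inf

        # Indices of the n highest-scoring items, best first
        top_n = np.argpartition(-scores, n)[:n]
        return top_n[np.argsort(-scores[top_n])]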
Example #5
    def fit(self, topK=None, l2_norm=1e3, normalize_matrix=False):

        start_time = time.time()
        print("|{}| Fitting model... |".format(self.NAME))

        if normalize_matrix:
            # Normalize rows and then columns
            self.URM_train = normalize(self.URM_train, norm='l2', axis=1)
            self.URM_train = normalize(self.URM_train, norm='l2', axis=0)
            self.URM_train = sps.csr_matrix(self.URM_train)

        # Gram matrix is X^T X; computed here via the cosine similarity helper
        gram_matrix = similarity(self.URM_train,
                                 shrink=0,
                                 k=self.URM_train.shape[1],
                                 sim_type="cosine").toarray()

        diag_indices = np.diag_indices(gram_matrix.shape[0])

        # Add the l2 regularization term to the diagonal
        gram_matrix[diag_indices] += l2_norm

        P = np.linalg.inv(gram_matrix)

        # Closed-form solution: B_ij = -P_ij / P_jj
        B = P / (-np.diag(P))

        # Enforce the zero-diagonal constraint
        B[diag_indices] = 0.0

        print("Fitting model... done in {:.2f}".format(time.time() -
                                                       start_time))

        # Decide whether the matrix should be stored in a sparse or dense format:
        # regardless of topK, it is kept sparse if the fraction of nonzero
        # cells is below sparse_threshold_quota
        if topK is not None:
            B = self._similarity_matrix_topk(B, k=topK, verbose=False)

        if self._is_content_sparse_check(B):
            self._print("Detected model matrix to be sparse, changing format.")
            self.W_sparse = self._check_matrix(B,
                                               format='csr',
                                               dtype=np.float32)

        else:
            self.W_sparse = self._check_matrix(B,
                                               format='npy',
                                               dtype=np.float32)
            self._W_sparse_format_checked = True

        self.r_hat = self.URM_train.dot(self.W_sparse)
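Example #5 is the closed-form EASE approach: with Gram matrix G = X^T X, the regularized inverse P = (G + l2 * I)^-1 yields B with B_ij = -P_ij / P_jj and a zeroed diagonal. A self-contained sketch of that core computation, using a plain dot product where the snippet above uses its cosine similarity helper:

    import numpy as np
    import scipy.sparse as sps

    def ease_item_weights(urm, l2_norm=1e3):
        """Closed-form EASE: item-item weights B with zero diagonal."""
        gram = (urm.T @ urm).toarray()  # G = X^T X, item x item
        diag = np.diag_indices(gram.shape[0])
        gram[diag] += l2_norm           # G + l2 * I
        P = np.linalg.inv(gram)
        B = P / (-np.diag(P))           # B_ij = -P_ij / P_jj (column-wise)
        B[diag] = 0.0                   # enforce the zero-diagonal constraint
        return B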
Example #6
                        scoring=SCORE,
                        n_FAQs=NFAQS,
                        pre=PRECLASSIFIER)

    print(cv_scores)
    print(scores)

    SIM_THRESH = .965
    MODEL = "tfidf_w2v_top5w"
    # embedding(model=MODEL, data_prefix=DATA_PREFIX)
    # similarity(model=MODEL, thresh=SIM_THRESH)
    # classifier(model=MODEL, scoring=SCORE, n_FAQs=NFAQS, pre=PRECLASSIFIER)
    scores[0, :] = test(MODEL,
                        data_prefix=DATA_PREFIX,
                        scoring=SCORE,
                        n_FAQs=NFAQS,
                        pre=PRECLASSIFIER)
    MODEL = "tfidf_w2v_top5a"
    embedding(model=MODEL, data_prefix=DATA_PREFIX)
    similarity(model=MODEL, thresh=SIM_THRESH)
    classifier(model=MODEL, scoring=SCORE, n_FAQs=NFAQS, pre=PRECLASSIFIER)
    scores[1, :] = test(MODEL,
                        data_prefix=DATA_PREFIX,
                        scoring=SCORE,
                        n_FAQs=NFAQS,
                        pre=PRECLASSIFIER)

    print(scores)

    # TODO: mirror test with validate
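The script repeats the same embedding -> similarity -> classifier -> test block once per model name; a loop removes the duplication. A sketch under the assumption that the functions keep the signatures shown above (the first model's preprocessing calls are commented out in the original, presumably because their outputs were already cached, which this loop would recompute):

    MODELS = ["tfidf_w2v_top5w", "tfidf_w2v_top5a"]  # names from the script above

    for i, model in enumerate(MODELS):
        embedding(model=model, data_prefix=DATA_PREFIX)
        similarity(model=model, thresh=SIM_THRESH)
        classifier(model=model, scoring=SCORE, n_FAQs=NFAQS, pre=PRECLASSIFIER)
        scores[i, :] = test(model,
                            data_prefix=DATA_PREFIX,
                            scoring=SCORE,
                            n_FAQs=NFAQS,
                            pre=PRECLASSIFIER)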
Example #7
    def _generateTrainData_low_ram(self):

        print(self.NAME + ": Generating train data")

        start_time_batch = time.time()

        # Only the sparsity structure of this matrix matters here
        S_matrix_contentKNN = similarity(self.ICM.T, shrink=0, k=self.topK)
        S_matrix_contentKNN = self._check_matrix(S_matrix_contentKNN, "csr")

        self._writeLog(
            self.NAME +
            ": Collaborative S density: {:.2E}, nonzero cells {}".format(
                self.S_matrix_target.nnz /
                self.S_matrix_target.shape[0]**2, self.S_matrix_target.nnz))

        self._writeLog(
            self.NAME + ": Content S density: {:.2E}, nonzero cells {}".format(
                S_matrix_contentKNN.nnz /
                S_matrix_contentKNN.shape[0]**2, S_matrix_contentKNN.nnz))

        if self.normalize_similarity:

            # Square root of the per-item sum of squared feature values (L2 norms)
            sum_of_squared_features = np.array(
                self.ICM.T.power(2).sum(axis=0)).ravel()
            sum_of_squared_features = np.sqrt(sum_of_squared_features)

        num_common_coordinates = 0

        estimated_n_samples = int(S_matrix_contentKNN.nnz *
                                  (1 + self.add_zeros_quota) * 1.2)

        self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

        num_samples = 0

        for row_index in range(self.n_items):

            start_pos_content = S_matrix_contentKNN.indptr[row_index]
            end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

            content_coordinates = S_matrix_contentKNN.indices[
                start_pos_content:end_pos_content]

            start_pos_target = self.S_matrix_target.indptr[row_index]
            end_pos_target = self.S_matrix_target.indptr[row_index + 1]

            target_coordinates = self.S_matrix_target.indices[
                start_pos_target:end_pos_target]

            # Check whether each content coordinate is associated with a
            # non-zero target value: if so, the pair has a collaborative
            # non-zero value; if not, its collaborative value is zero
            is_common = np.isin(content_coordinates, target_coordinates)

            num_common_in_current_row = is_common.sum()
            num_common_coordinates += num_common_in_current_row

            for index in range(len(is_common)):

                if num_samples == estimated_n_samples:
                    dataBlock = 1000000
                    self.row_list = np.concatenate(
                        (self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.col_list = np.concatenate(
                        (self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.data_list = np.concatenate(
                        (self.data_list, np.zeros(dataBlock,
                                                  dtype=np.float64)))

                if is_common[index]:
                    # The cell exists in the target matrix: store its value.
                    # Cells missing from the target are sampled as explicit
                    # zeros below, with probability add_zeros_quota.

                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index

                    new_data_value = self.S_matrix_target[row_index, col_index]

                    if self.normalize_similarity:
                        new_data_value *= sum_of_squared_features[
                            row_index] * sum_of_squared_features[col_index]

                    self.data_list[num_samples] = new_data_value

                    num_samples += 1

                elif np.random.rand() <= self.add_zeros_quota:

                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index
                    self.data_list[num_samples] = 0.0

                    num_samples += 1

            if time.time() - start_time_batch > 30 or \
                    num_samples == S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota):

                print(self.NAME +
                      ": Generating train data. Sample {} ( {:.2f} %) ".format(
                          num_samples,
                          num_samples / (S_matrix_contentKNN.nnz *
                                         (1 + self.add_zeros_quota)) * 100))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_batch = time.time()

        self._writeLog(
            self.NAME +
            ": Content S structure has {} out of {} ( {:.2f}%) nonzero collaborative cells"
            .format(num_common_coordinates, S_matrix_contentKNN.nnz,
                    num_common_coordinates / S_matrix_contentKNN.nnz * 100))

        # Discard the unused tail of the preallocated arrays
        self.row_list = self.row_list[:num_samples]
        self.col_list = self.col_list[:num_samples]
        self.data_list = self.data_list[:num_samples]

        data_nnz = np.count_nonzero(self.data_list)
        data_sum = self.data_list.sum()

        collaborative_nnz = self.S_matrix_target.nnz
        collaborative_sum = self.S_matrix_target.data.sum()

        self._writeLog(
            self.NAME +
            ": Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
            "average over all collaborative data is {:.2E}".format(
                data_sum, data_sum / data_nnz, collaborative_sum /
                collaborative_nnz))
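The triplets accumulated in `row_list`, `col_list` and `data_list` are typically assembled into a sparse matrix (or consumed as individual (row, col, value) training samples). A minimal sketch of the assembly step; the function name is hypothetical, and `n_items` corresponds to the attribute used in the loop above:

    import numpy as np
    import scipy.sparse as sps

    def triplets_to_csr(row_list, col_list, data_list, n_items):
        """Assemble (row, col, value) samples into an item-item CSR matrix."""
        return sps.csr_matrix((data_list, (row_list, col_list)),
                              shape=(n_items, n_items), dtype=np.float64)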