# Example #1
    def recommend_batch(self,
                        userids,
                        N=10,
                        urm=None,
                        filter_already_liked=True,
                        with_scores=False,
                        items_to_exclude=None,
                        verbose=False):
        """
        Compute the top-N item recommendations for a batch of users.

        Args:
            userids: iterable of user ids, or None/empty to recommend for
                all users in the URM.
            N (int): number of items to recommend per user.
            urm: user-rating matrix to score against; defaults to
                data.get_urm().
            filter_already_liked (bool): mask items already present in the URM.
            with_scores (bool): unused here; kept for interface compatibility.
            items_to_exclude (list): item ids to mask for every user.
            verbose (bool): forwarded to sim.dot_product.

        Returns:
            list: rows of [userid, item_1, ..., item_N], or None when the
            model has not been fit.
        """
        # None default instead of a mutable [] shared across calls.
        if items_to_exclude is None:
            items_to_exclude = []
        if not self._has_fit():
            return None

        R = data.get_urm() if urm is None else urm

        if userids is None or not len(userids) > 0:
            print('Recommending for all users...')

        # compute the R^ by multiplying: R•S or S•R
        if self._matrix_mul_order == 'inverse':
            R_hat = sim.dot_product(self._sim_matrix,
                                    R,
                                    target_rows=userids,
                                    k=R.shape[0],
                                    format_output='csr',
                                    verbose=verbose)
        else:
            R_hat = sim.dot_product(R,
                                    self._sim_matrix,
                                    target_rows=userids,
                                    k=R.shape[0],
                                    format_output='csr',
                                    verbose=verbose)

        if filter_already_liked:
            # remove from the R^ the items already in the R
            R_hat[R.nonzero()] = -np.inf
        if len(items_to_exclude) > 0:
            # TO-DO: test this part because it does not work!
            R_hat = R_hat.T
            R_hat[items_to_exclude] = -np.inf
            R_hat = R_hat.T

        # make recommendations only for the target rows;
        # bug fix: userids=None previously crashed here on len(None)
        if userids is not None and len(userids) > 0:
            R_hat = R_hat[userids]
        else:
            userids = list(range(R_hat.shape[0]))
        recommendations = self._extract_top_items(R_hat, N=N)
        return self._insert_userids_as_first_col(userids,
                                                 recommendations).tolist()
# Example #2
    def recommend_batch(self, userids, urm=None, N=10, filter_already_liked=True, with_scores=False, items_to_exclude=None,
                        verbose=False):
        """
        Compute the top-N item recommendations for a batch of users.

        Args:
            userids: iterable of user ids; None recommends for all users and
                an empty iterable returns [].
            urm: user-rating matrix; defaults to data.get_urm().
            N (int): number of items to recommend per user.
            filter_already_liked (bool): mask items already in the profile.
            with_scores (bool): unused here; kept for interface compatibility.
            items_to_exclude (list): item ids to mask for every user.
            verbose (bool): forwarded to sim.dot_product.

        Returns:
            list: rows of [userid, item_1, ..., item_N], [] for an empty
            batch, or None when the model has not been fit.
        """
        # None default instead of a mutable [] shared across calls.
        if items_to_exclude is None:
            items_to_exclude = []
        if not self._has_fit():
            return None

        if userids is not None:
            if len(userids) > 0:
                matrix = urm[userids] if urm is not None else data.get_urm()[userids]
            else:
                return []
        else:
            print('Recommending for all users...')
            matrix = urm if urm is not None else data.get_urm()

        # compute the R^ by multiplying R•S
        self.r_hat = sim.dot_product(matrix, self._sim_matrix, target_rows=None, k=data.N_TRACKS, format_output='csr', verbose=verbose)

        if filter_already_liked:
            # mask the items each user already interacted with
            user_profile_batch = matrix
            self.r_hat[user_profile_batch.nonzero()] = -np.inf
        if len(items_to_exclude) > 0:
            # TO-DO: test this part because it does not work!
            self.r_hat = self.r_hat.T
            self.r_hat[items_to_exclude] = -np.inf
            self.r_hat = self.r_hat.T

        if userids is None:
            # bug fix: None was previously forwarded to
            # _insert_userids_as_first_col; materialize the full id range
            userids = list(range(self.r_hat.shape[0]))
        recommendations = self._extract_top_items(self.r_hat, N=N)
        return self._insert_userids_as_first_col(userids, recommendations).tolist()
# Example #3
 def get_r_hat(self, verbose=False):
     """
     Compute R^ = R•S (or S•R when the multiplication order is 'inverse'),
     restricted to the target playlists.
     """
     R = self.urm
     targetids = data.get_target_playlists()
     # pick the operand order once, then do a single dot product
     if self._matrix_mul_order == 'inverse':
         left, right = self._sim_matrix, R
     else:
         left, right = R, self._sim_matrix
     r_hat = sim.dot_product(left,
                             right,
                             target_rows=targetids,
                             k=R.shape[0],
                             format_output='csr',
                             verbose=verbose)
     return r_hat[targetids]
# Example #4
def ipcf(df_train, UWP_sparse, n_items, alpha=0.25, q=5, k=10):
    """
    Item-based collaborative filtering over purchase baskets.

    Args:
        df_train: DataFrame with integer PID (product id) and BID (basket id)
            columns.
        UWP_sparse: sparse user profile matrix multiplied against the item
            similarity matrix to produce the scores.
        n_items (int): number of distinct items (rows of the item-basket matrix).
        alpha (float): asymmetry coefficient of the asymmetric cosine.
        q (int): element-wise exponent applied to the similarity matrix.
        k (int): number of neighbours / recommendations to keep.

    Returns:
        Sparse matrix of user recommendation scores.
    """
    # Construct the item-basket sparse matrix
    idMax_basket = df_train.BID.max() + 1
    item_basket_mat = sparse.coo_matrix(
        (np.ones((df_train.shape[0]), dtype=int),
         (df_train.PID.values, df_train.BID.values)),
        shape=(n_items, idMax_basket))
    # Convert it to Compressed Sparse Row format to exploit its efficiency in arithmetic operations
    sparse_mat = sparse.csr_matrix(item_basket_mat)
    # Calculate the Asymmetric Cosine Similarity matrix
    itemSimMat = sim.asymmetric_cosine(sparse_mat, None, alpha, k)
    # recommend k items to users
    # (removed the dead no-op statement `UWP_sparse.shape, itemSimMat.shape`)
    user_recommendations = sim.dot_product(UWP_sparse, itemSimMat.power(q), k)
    return user_recommendations
# Example #5
def upcf(df_train, UWP_sparse, n_items, alpha=0.25, q=5, k=10):
    """
    User-based collaborative filtering.

    Args:
        df_train: DataFrame with integer UID (user id) and PID (product id)
            columns.
        UWP_sparse: sparse user profile matrix multiplied by the user
            similarity matrix to produce the scores.
        n_items (int): number of distinct items (columns of the user-item matrix).
        alpha (float): asymmetry coefficient of the asymmetric cosine.
        q (int): element-wise exponent applied to the similarity matrix.
        k (int): number of neighbours / recommendations to keep.

    Returns:
        Sparse matrix of user recommendation scores.
    """
    n_users = df_train['UID'].unique().shape[0]
    df_user_item = df_train.groupby(
        ['UID', 'PID']).size().reset_index(name="bool")[['UID', 'PID']]
    # Generate the user-item matrix using the sparse matrix COOrdinate format.
    userItem_mat = sparse.coo_matrix(
        (np.ones((df_user_item.shape[0])),
         (df_user_item.UID.values, df_user_item.PID.values)),
        shape=(n_users, n_items))
    # Calculate the asymmetric cosine similarity matrix.
    # Bug fix: alpha/k/q were hard-coded (0.25, 10, 5) and the function
    # arguments were silently ignored; the defaults preserve old behaviour.
    userSim = sim.asymmetric_cosine(sparse.csr_matrix(userItem_mat),
                                    alpha=alpha,
                                    k=k)
    # recommend k items to users
    user_recommendations = sim.dot_product(userSim.power(q), UWP_sparse, k=k)
    return user_recommendations
# Example #6
def test_readme_code():
    """Smoke-test the code sample shown in README.md."""
    import similaripy as sim
    import scipy.sparse as sps

    # Build a random 1000x2000 user-rating matrix (URM) with 2.5% density.
    ratings = sps.random(1000, 2000, density=0.025)

    # Apply BM25 normalization to the interactions.
    ratings = sim.normalization.bm25(ratings)

    # Fit an item-item cosine model keeping the 50 nearest neighbours per item.
    knn_model = sim.cosine(ratings.T, k=50)

    # Score 100 items for users 1, 14 and 8, filtering each user's seen items.
    user_recommendations = sim.dot_product(
        ratings, knn_model.T, k=100, target_rows=[1, 14, 8],
        filter_cols=ratings)

    print('Test README.md code passed!!!')
# Example #7
def check_similarity(m, k, rtol=0.0001, full=False):
    """
    Cross-check every cython similarity against its pure-python reference.

    Args:
        m: input sparse matrix.
        k (int): number of neighbours to keep per row.
        rtol (float): relative tolerance for the checksum comparison.
        full (bool): additionally compare the full rows element-wise.

    Raises:
        AssertionError: when a cython/python pair disagrees.
    """
    # (name, cython result, pure-python reference) — computed once, compared
    # below; this replaces eight copy-pasted compute/assert stanzas.
    pairs = [
        ('dot', sim.dot_product(m, k=k), py_dot(m, k)),
        ('cosine', sim.cosine(m, k=k), py_cosine(m, k).tocsr()),
        ('asy_cosine', sim.asymmetric_cosine(m, alpha=0.2, k=k),
         py_asy_cosine(m, 0.2, k=k)),
        ('jaccard', sim.jaccard(m, k=k), py_jaccard(m, k)),
        ('dice', sim.dice(m, k=k), py_dice(m, k)),
        ('tversky', sim.tversky(m, alpha=0.8, beta=0.4, k=k),
         py_tversky(m, alpha=0.8, beta=0.4, k=k)),
        ('p3alpha', sim.p3alpha(m, alpha=0.8, k=k),
         py_p3alpha(m, alpha=0.8, k=k)),
        ('rp3beta', sim.rp3beta(m, alpha=0.8, beta=0.4, k=k),
         py_rp3beta(m, alpha=0.8, beta=0.4, k=k)),
    ]

    # test: checksums must match within rtol
    for name, cy, py in pairs:
        np.testing.assert_allclose(check_sum(cy),
                                   check_sum(py),
                                   rtol=rtol,
                                   err_msg=f'{name} error')

    # test full rows
    if full:
        for name, cy, py in pairs:
            np.testing.assert_(check_full(cy, py, rtol) == 0,
                               msg=f'{name} error')

    return
# Example #8
 def get_r_hat(self, verbose=False):
     """
     Compute R^ = S•H, restricted to the target playlists' last sequences.
     """
     # one neighbour slot per row of H, csr output for cheap row slicing
     return sim.dot_product(self._sim_matrix,
                            self.H,
                            target_rows=self.target_indices,
                            k=self.H.shape[0],
                            format_output='csr',
                            verbose=verbose)
# Example #9
 def get_r_hat(self):
     """
     Compute R^ = R•S restricted to the target playlists.
     """
     # fetch the target ids once instead of calling
     # data.get_target_playlists() twice (consistent with the sibling
     # implementation that hoists them into `targetids`)
     targetids = data.get_target_playlists()
     r_hat = sim.dot_product(self.urm, self._sim_matrix, target_rows=targetids,
                             k=data.N_TRACKS, format_output='csr')
     return r_hat[targetids]
def instance_selection(model,
                       X_train,
                       y_train,
                       X_valid,
                       y_valid,
                       criterion,
                       sparse=True,
                       add_channel=True,
                       flatten=False,
                       treshold=False,
                       return_influences=False):
    """
    Select training instances via influence functions.

    For each training sample i, accumulates over the validation set the
    influence term -grad_j^T · H^{-1} · grad_i and keeps the samples whose
    summed influence is <= 0.

    Args:
        model: torch model whose parameter gradients are used.
        X_train, y_train: training samples and integer labels.
        X_valid, y_valid: validation samples and integer labels.
        criterion: loss function, e.g. a torch classification loss.
        sparse (bool): use scipy sparse algebra for the Hessian inverse
            instead of a dense torch inverse.
        add_channel, flatten, treshold: forwarded to compute_hessian;
            flatten also reshapes each sample to (batch, -1) per forward pass.
        return_influences (bool): return per-sample influences instead of
            the selected indices.

    Returns:
        list: influences when return_influences, else indices of selected
        training samples.

    NOTE(review): relies on module-level `device`, `compute_hessian`, `inv`,
    `sim`, `tqdm` not visible here — confirm availability. Validation
    gradients are recomputed for every training sample (O(train·valid)
    backward passes); caching them would change memory usage, so left as-is.
    """
    hessian_matrix = compute_hessian(model,
                                     X_train,
                                     y_train,
                                     criterion,
                                     sparse=sparse,
                                     add_channel=add_channel,
                                     flatten=flatten,
                                     treshold=treshold)

    if sparse:
        print('Hessian matrix sparsity: {:.2f}%'.format(
            hessian_matrix.nnz / (hessian_matrix.shape[0]**2) * 100))
        hessian_matrix_inv = inv(hessian_matrix)
    else:
        hessian_matrix_inv = torch.inverse(hessian_matrix)

    selected_indices = []
    influences = []
    for i, train_sample in enumerate(tqdm(X_train, desc='Instance selection')):
        model.zero_grad()
        train_sample = np.expand_dims(train_sample, axis=0)
        train_sample = torch.Tensor(train_sample)
        label = y_train[i]
        label = torch.LongTensor([label])
        train_sample, model, label = train_sample.to(device), model.to(
            device), label.to(device)

        if flatten:
            train_sample = train_sample.view(train_sample.shape[0], -1)

        output = model(train_sample)
        loss = criterion(output, label)
        loss.backward()

        # flatten all parameter gradients into one vector
        # (removed the unused enumerate index)
        jacobian_i = []
        for param in model.parameters():
            jacobian_i.append(param.grad.view(-1))
        jacobian_i = torch.cat(jacobian_i)

        if sparse:
            jacobian_i = sps.csc_matrix(
                jacobian_i.tolist()).transpose(copy=False)
            # intermediate = H^{-1} · grad_i
            intermediate = sim.dot_product(hessian_matrix_inv,
                                           jacobian_i,
                                           verbose=False)
        else:
            jacobian_i = jacobian_i.to(device)
            intermediate = torch.matmul(hessian_matrix_inv, jacobian_i)

        j_loss = 0
        for j, valid_sample in enumerate(X_valid):
            model.zero_grad()
            valid_sample = np.expand_dims(valid_sample, axis=0)
            valid_sample = torch.Tensor(valid_sample)
            label = y_valid[j]
            label = torch.LongTensor([label])
            valid_sample, model, label = valid_sample.to(device), model.to(
                device), label.to(device)

            if flatten:
                valid_sample = valid_sample.view(valid_sample.shape[0], -1)

            output = model(valid_sample)
            loss = criterion(output, label)
            loss.backward()

            jacobian_j = []
            for param in model.parameters():
                jacobian_j.append(param.grad.view(-1))
            jacobian_j = torch.cat(jacobian_j)

            if sparse:
                jacobian_j = sps.csc_matrix(jacobian_j.tolist())
                # accumulate -grad_j^T · (H^{-1} · grad_i)
                j_loss += sim.dot_product(jacobian_j * (-1),
                                          intermediate,
                                          verbose=False).data[0]
            else:
                jacobian_j = jacobian_j.to(device)
                j_loss += torch.matmul((jacobian_j * (-1)), intermediate)

        influences.append(j_loss)
        # non-positive summed influence: keep this training sample
        if j_loss <= 0:
            selected_indices.append(i)

    return influences if return_influences else selected_indices