Example #1
def similarity_from_sparse(matrix_a: sparse.csr_matrix,
                           matrix_b: sparse.csr_matrix):
    # Pairwise Jaccard/Tanimoto-style similarity between the rows of two
    # sparse matrices: intersection / (norm_a + norm_b - intersection).
    intersection = matrix_a.dot(matrix_b.transpose()).toarray()
    norm_1 = np.array(matrix_a.multiply(matrix_a).sum(axis=1))
    norm_2 = np.array(matrix_b.multiply(matrix_b).sum(axis=1))
    union = norm_1 + norm_2.T - intersection
    return intersection / union
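A minimal usage sketch (the toy matrices and expected output are an illustration, not part of the original snippet):

# Usage sketch: Jaccard-style similarity between two small binary matrices,
# assuming similarity_from_sparse is defined as above.
import numpy as np
from scipy import sparse

matrix_a = sparse.csr_matrix(np.array([[1, 0, 1], [0, 1, 1]]))
matrix_b = sparse.csr_matrix(np.array([[1, 1, 1], [1, 0, 0]]))
print(similarity_from_sparse(matrix_a, matrix_b))
# [[0.66666667 0.5       ]
#  [0.66666667 0.        ]]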
Example #2
def filter_dataset(X: sp.csr_matrix, min_ui_count=10, min_iu_count=10):
    # Drop user rows with fewer than min_ui_count interactions, then drop item
    # columns with fewer than min_iu_count interactions by filtering the rows
    # of the transpose.
    X = filter_rows(X, min_ui_count)
    T_X = X.transpose()
    T_X = filter_rows(T_X, min_iu_count)
    X = T_X.transpose()
    return X
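The helper filter_rows is not shown in this snippet; a minimal hypothetical stand-in (name and behaviour inferred from the call sites above) could look like:

# Hypothetical helper, assumed from the call sites above (not part of the
# original code): keep only rows with at least min_count stored entries.
import scipy.sparse as sp

def filter_rows(X: sp.spmatrix, min_count: int) -> sp.csr_matrix:
    # Number of stored (non-zero) entries per row.
    row_counts = X.getnnz(axis=1)
    return sp.csr_matrix(X)[row_counts >= min_count]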
Example #3
    def augmentURM(cls, URM_train: csr_matrix, W_sparse: csr_matrix,
                   threshold_interactions: int, threshold_similarity: float):
        """
        Augmentation of the URM train.

        :param threshold_interactions: here a threshold on the similarity is considered.
        Similarity matrix W_sparse will be considered for this purpose
        :param threshold_similarity: threshold used to insert a new row.
        In this case it is specified as the minimum number of interactions required to insert a new
        row in the URM train
        :param W_sparse: similarity matrix
        :param URM_train: URM train that will be augmented
        :return: a csr_matrix with augmented interactions according to the threshold
        """
        print("Augmenting URM")
        URM_train = URM_train.copy()

        # Number of interactions each pair of users has in common
        count_W_sparse = URM_train.dot(URM_train.transpose())

        # Selecting new candidate user pairs
        print("Selecting new candidates")
        users = np.arange(URM_train.shape[0])
        new_rows_list = []
        for i in range(0, users.size):
            if i % 5000 == 0:
                print("{} done in {}".format(i, users.size))
            candidates = count_W_sparse[i].indices  # users candidates
            data = count_W_sparse[i].data  # data for the candidates

            for j, candidate in enumerate(candidates):
                if (candidate > i
                        and data[j] > threshold_interactions
                        and W_sparse[i, candidate] > threshold_similarity):
                    new_rows_list.append([i, candidate])

        print("Candidate list size: {}".format(len(new_rows_list)))

        # Creating the new matrix
        print("Creating new URM...", end="")
        new_URM = None
        for candidate in new_rows_list:
            new_row = URM_train[[candidate[0], candidate[1]]].sum(axis=0)
            new_row = csr_matrix(new_row)
            new_row.data[new_row.data > 1] = 1

            if new_URM is None:
                new_URM = new_row
            else:
                new_URM = vstack([new_URM, new_row], format="csr")

        if new_URM is None:
            new_URM = URM_train
        else:
            new_URM = vstack([URM_train, new_URM], format="csr")

        print("Done")

        return new_URM
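Growing new_URM with one vstack call per candidate copies the matrix on every iteration; a possible rewrite of the tail of the method (a sketch reusing the names defined above, not the author's code) stacks everything once:

# Sketch (assumption): collect the merged rows first, then stack them with a
# single vstack call; csr_matrix, vstack, URM_train and new_rows_list are the
# names already in scope in the method above.
new_rows = []
for a, b in new_rows_list:
    row = csr_matrix(URM_train[[a, b]].sum(axis=0))
    row.data[row.data > 1] = 1  # keep the merged interactions binary
    new_rows.append(row)
new_URM = vstack([URM_train] + new_rows, format="csr") if new_rows else URM_train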
Example #4
def psinv2(matr: csr_matrix, dtype, reg=0.):
    # inverse operation for sparse matrices doesn't seem to exist in cupy
    regsize = matr.shape[1]
    if reg == 0:
        regm = csr_matrix((regsize, regsize))
    else:
        regm = identity(regsize, dtype=dtype) * (1. / reg)  # if direct cuda, call get_sparse_module()
    toinv = matr.transpose().dot(matr) + regm
    return get_sparse_module(inv(toinv))  # if direct cuda, add .get() to inv param
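For reference, a CPU-only sketch of the same regularized inverse using SciPy (an assumption; the original relies on GPU helpers such as get_sparse_module):

# SciPy-only sketch (assumption, not the original GPU code): computes
# inv(A^T A + regm) for a sparse matrix A.
import numpy as np
from scipy.sparse import csr_matrix, identity
from scipy.sparse.linalg import inv

def psinv2_scipy(matr: csr_matrix, dtype=np.float64, reg=0.):
    regsize = matr.shape[1]
    if reg == 0:
        regm = csr_matrix((regsize, regsize), dtype=dtype)
    else:
        regm = identity(regsize, dtype=dtype, format="csr") * (1. / reg)
    toinv = matr.transpose().dot(matr) + regm
    return inv(toinv.tocsc())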
Example #5
    def newAugmentUMR(cls, URM_train: csr_matrix, W_sparse: csr_matrix,
                      threshold_interactions: int,
                      threshold_similarity: float):
        print("New Augmenting URM")
        count_W_sparse = URM_train.dot(URM_train.transpose())
        count_mask: csr_matrix = count_W_sparse > threshold_interactions
        sim_mask: csr_matrix = W_sparse > threshold_similarity
        mask = count_mask.multiply(sim_mask)
        mask = triu(mask)
        mask = mask.tocoo()

        row_user = mask.row
        col_user = mask.col

        new_mask = row_user != col_user
        row_user = row_user[new_mask]
        col_user = col_user[new_mask]
        new_users = np.array([row_user, col_user])
        new_users = np.transpose(new_users)
        new_rows_list: list = new_users.tolist()

        print("Candidate list size: {}".format(len(new_rows_list)))

        # Creating the new matrix
        print("Creating new URM...", end="")
        new_URM = None
        for candidate in new_rows_list:
            new_row = URM_train[[candidate[0], candidate[1]]].sum(axis=0)
            new_row = csr_matrix(new_row)
            new_row.data[new_row.data > 1] = 1

            if new_URM is None:
                new_URM = new_row
            else:
                new_URM = vstack([new_URM, new_row], format="csr")

        if new_URM is None:
            new_URM = URM_train
        else:
            new_URM = vstack([URM_train, new_URM], format="csr")

        print("Done")

        return new_URM
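The mask-based candidate selection above can be exercised on toy data (the matrix and threshold below are assumptions):

# Sketch (assumption): the candidate-selection core of newAugmentUMR on a toy
# URM with three users; threshold_interactions is taken as 1.
import numpy as np
from scipy.sparse import csr_matrix, triu

URM_train = csr_matrix(np.array([[1, 0, 1, 0],
                                 [1, 0, 1, 1],
                                 [0, 1, 0, 0]]))
count_W_sparse = URM_train.dot(URM_train.transpose())
mask = triu(count_W_sparse > 1).tocoo()
pairs = [(int(r), int(c)) for r, c in zip(mask.row, mask.col) if r != c]
print(pairs)  # [(0, 1)]: users 0 and 1 share more than one interaction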
Example #6
def _merge_into_adjacency(
        tf_ids: csr_matrix, word_cooccurrences_list: List[Tuple[str, str,
                                                                float]],
        token_to_int_vocab_map: Dict[str, int]) -> csr_matrix:
    """
    Merge the word co-occurrence information together with the tf-idf information to create an adjacency matrix
    where,
        (0, 0) to (|vocab|, |vocab|) - indices describe the word-word interactions
    and,
        (|vocab|, |vocab|) to (|vocab| + #Docs, |vocab|) - indices describe the word-document interactions.
    """
    word_co_row = np.array([
        token_to_int_vocab_map[word_cooccurrence[0]]
        for word_cooccurrence in word_cooccurrences_list
    ])
    word_co_col = np.array([
        token_to_int_vocab_map[word_cooccurrence[1]]
        for word_cooccurrence in word_cooccurrences_list
    ])
    word_co_data = np.array([
        word_cooccurrence[2] for word_cooccurrence in word_cooccurrences_list
    ])
    word_coocurrences = csr_matrix(
        (word_co_data, (word_co_row, word_co_col)),
        shape=(len(token_to_int_vocab_map), len(token_to_int_vocab_map)))
    # Stack word co-occurrences on top of TF-IDF (left-hand side of the adjacency)
    adj_lhs = vstack([word_coocurrences, tf_ids])
    # Empty (zeros) for doc-doc interactions
    zero_csr = csr_matrix(([], ([], [])),
                          shape=(tf_ids.shape[0], tf_ids.shape[0]))
    # Mirror TF-IDFs and stack on top of doc-doc interactions to create the right-hand side of the adjacency
    adj_rhs = vstack([tf_ids.transpose(), zero_csr])
    # Stack side-by-side
    adj = hstack([adj_lhs, adj_rhs]) + identity(adj_lhs.shape[0])

    assert adj.shape == (
        len(token_to_int_vocab_map) + tf_ids.shape[0],
        len(token_to_int_vocab_map) + tf_ids.shape[0],
    ), "Expected {} == {}".format(
        adj.shape, (len(token_to_int_vocab_map) + tf_ids.shape[0],
                    len(token_to_int_vocab_map) + tf_ids.shape[0]))
    return adj
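A usage sketch with toy inputs (the vocabulary, tf-idf values and co-occurrence weights below are assumptions):

# Usage sketch (assumption): two documents over a three-word vocabulary,
# calling _merge_into_adjacency as defined above.
import numpy as np
from scipy.sparse import csr_matrix

vocab = {"cat": 0, "dog": 1, "fish": 2}
tf_idf = csr_matrix(np.array([[0.5, 0.2, 0.0],
                              [0.0, 0.1, 0.9]]))  # 2 documents x |vocab| words
cooccurrences = [("cat", "dog", 1.0), ("dog", "fish", 0.3)]
adj = _merge_into_adjacency(tf_idf, cooccurrences, vocab)
print(adj.shape)  # (5, 5) == (|vocab| + #Docs, |vocab| + #Docs)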
Example #7
def calc_objective_per_iter(w_i, feature_mat: sparse.csr_matrix, empirical_counts, num_h, true_tags, alpha):
    """
        Calculate the Max Entropy likelihood for an iterative optimization method.

        :param w_i: weights vector in iteration i
        :param feature_mat: sparse feature matrix with one row per (history, tag) pair
        :param empirical_counts: pre-computed empirical feature counts
        :param num_h: number of histories in the training data
        :param true_tags: index of the correct tag for each history
        :param alpha: the regularization coefficient

        The function returns the negated Max Entropy likelihood (objective) and the negated objective gradient.
    """
    scores = feature_mat.dot(w_i)
    scores = scores.reshape((num_h, -1))
    exp_scores = np.exp(scores)
    sum_exp = np.sum(exp_scores, axis=1)
    probs = exp_scores/sum_exp.reshape((num_h, 1))
    expected_counts = feature_mat.transpose().dot(probs.reshape(-1)).reshape(-1)
    # L2-regularized log-likelihood of the true tags
    likelihood = np.sum(scores[np.arange(num_h), true_tags] - np.log(sum_exp)) - (alpha / 2) * np.sum(w_i ** 2)
    grad = empirical_counts - expected_counts - alpha*w_i
    return (-1) * likelihood, (-1) * grad
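A toy call (all shapes and values below are assumptions; in the real pipeline empirical_counts is pre-computed from the training data):

# Usage sketch (assumption): 2 histories, 3 candidate tags and 4 features, so
# feature_mat has one row per (history, tag) pair.
import numpy as np
from scipy import sparse

num_h, num_tags, num_feats = 2, 3, 4
feature_mat = sparse.random(num_h * num_tags, num_feats, density=0.5, format="csr")
w_i = np.zeros(num_feats)
true_tags = np.array([0, 2])
empirical_counts = np.zeros(num_feats)  # placeholder for the pre-computed counts
neg_likelihood, neg_grad = calc_objective_per_iter(
    w_i, feature_mat, empirical_counts, num_h, true_tags, alpha=0.1)
print(neg_likelihood, neg_grad.shape)  # scalar objective and a gradient of shape (num_feats,)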
Example #8
def snn_dissimilarity_func(graph: csr_matrix, n_neighbors: int, *args,
                           **kwargs) -> csr_matrix:
    """Default SNN dissimilarity function

    Computes the dissimilarity between two points in terms of shared nearest neighbors

    Args:
        graph (scipy.sparse.csr_matrix): sparse matrix with dimensions (n_samples, n_samples),
            where the element ij represents the distance between point i and point j
        n_neighbors (int): number of neighbors in the k-neighborhood search

    Returns:
        scipy.sparse.csr_matrix: SNN dissimilarity matrix; lower values mean more shared neighbors
    """

    graph.data[graph.data > 0] = 1
    n_samples = graph.shape[0]

    # Add the point as its own neighbor
    graph += spdiags(np.ones(n_samples), diags=0, m=n_samples, n=n_samples)
    matrix = graph * graph.transpose()
    matrix.sort_indices()

    # The lower the value, the "closer" the two points are
    matrix.data = n_neighbors - matrix.data

    return matrix
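A usage sketch (scikit-learn and the toy 2-D data below are assumptions):

# Usage sketch (assumption): build a k-NN distance graph with scikit-learn and
# turn it into SNN dissimilarities with the function above.
import numpy as np
from sklearn.neighbors import kneighbors_graph

X = np.random.RandomState(0).rand(20, 2)
n_neighbors = 5
knn_graph = kneighbors_graph(X, n_neighbors=n_neighbors, mode="distance")
snn = snn_dissimilarity_func(knn_graph, n_neighbors)
print(snn.shape)  # (20, 20); lower values mean more shared neighbors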
Example #9
 def calculate(self, X: sp.csr_matrix, q_X: sp.csr_matrix):
     # L1-normalize the query row(s), then score them against every row of X
     # with a sparse dot product.
     q_X = normalize(q_X, norm="l1")
     sim = q_X * X.transpose()
     # sim = sim.transpose()
     return sim.toarray()[0]
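The core computation can be exercised directly (the toy data and imports below are assumptions about the surrounding module):

# Usage sketch (assumption): L1-normalize a query row and score it against
# every row of X with a sparse dot product, as the method above does.
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize

X = sp.csr_matrix(np.array([[1., 0., 2.],
                            [0., 1., 1.]]))
q_X = sp.csr_matrix(np.array([[2., 0., 2.]]))
q_X = normalize(q_X, norm="l1")
sim = q_X * X.transpose()
print(sim.toarray()[0])  # [1.5 0.5]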