Пример #1
0
    def test_euclidean_similarity_integer(self):
        """
        Compare Compute_Similarity_Euclidean on small integer vectors with scipy's euclidean distance.

        The similarity is built with normalize=False and similarity_from_distance_mode="lin".
        The test expects a zero diagonal and, for every other pair, the reciprocal of the
        similarity to match the scipy distance between the two rows within atol=1e-4.
        """

        from Base.Similarity.Compute_Similarity_Euclidean import Compute_Similarity_Euclidean
        from scipy.spatial.distance import euclidean

        data_matrix = np.array([[1, 1, 0, 1], [0, 1, 1, 1], [1, 0, 1, 0]])
        n_items = data_matrix.shape[0]

        # The rows of data_matrix become the columns of the transposed sparse input
        euclidean_similarity = Compute_Similarity_Euclidean(
            sps.csr_matrix(data_matrix).T,
            topK=100,
            normalize=False,
            similarity_from_distance_mode="lin")
        W_local = euclidean_similarity.compute_similarity()

        for row in range(n_items):
            for col in range(n_items):

                error_message = "W_local[{},{}] not matching control".format(row, col)

                expected_distance = euclidean(data_matrix[row, :], data_matrix[col, :])

                if row != col:
                    # Invert the similarity to get back a value comparable with the distance
                    recovered_distance = 1 / W_local[row, col]

                    assert np.allclose(
                        recovered_distance, expected_distance, atol=1e-4
                    ), error_message

                else:
                    assert W_local[row, col] == 0.0, error_message
Пример #2
0
    def __init__(self,
                 dataMatrix,
                 use_implementation="cython",
                 similarity=None,
                 **args):
        """
        Interface object that will call the appropriate similarity implementation
        :param dataMatrix:              data the similarity is computed on, passed unchanged to the selected implementation
        :param use_implementation:      "density" will choose the implementation automatically: "python" for a numpy array,
                                        for a sparse matrix with more than half of its cells stored and for an
                                        unrecognized matrix type, "cython" otherwise
                                        "cython" will use the cython implementation, if available. Most efficient for sparse matrix
                                        "python" will use the python implementation. Most efficient for dense matrix
        :param similarity:              the type of similarity to use, see SimilarityFunction enum.
                                        "euclidean" is handled by Compute_Similarity_Euclidean and ignores use_implementation
        :param args:                    other args required by the specific similarity implementation
        :raises ValueError:             if similarity is not "euclidean" and use_implementation is not one of
                                        "density", "cython", "python"
        """

        self.dense = False

        if similarity == "euclidean":
            # This is only available here
            self.compute_similarity_object = Compute_Similarity_Euclidean(
                dataMatrix, **args)

        else:

            if similarity is not None:
                args["similarity"] = similarity

            if use_implementation == "density":

                if isinstance(dataMatrix, np.ndarray):
                    self.dense = True

                elif sps.issparse(dataMatrix):
                    shape = dataMatrix.shape

                    num_cells = shape[0] * shape[1]

                    # Fraction of the cells that hold a stored value
                    density = dataMatrix.nnz / num_cells

                    self.dense = density > 0.5

                else:
                    print(
                        "Compute_Similarity: matrix type not recognized, calling default..."
                    )
                    use_implementation = "python"

                # Resolve the heuristic only for recognized matrix types, otherwise the
                # "python" fallback selected above would be overwritten with "cython"
                if use_implementation == "density":

                    if self.dense:
                        print("Compute_Similarity: detected dense matrix")
                        use_implementation = "python"
                    else:
                        use_implementation = "cython"

            if use_implementation == "cython":

                try:
                    from Base.Similarity.Cython.Compute_Similarity_Cython import Compute_Similarity_Cython
                    self.compute_similarity_object = Compute_Similarity_Cython(
                        dataMatrix, **args)

                except ImportError:
                    print(
                        "Unable to load Cython Compute_Similarity, reverting to Python"
                    )
                    self.compute_similarity_object = Compute_Similarity_Python(
                        dataMatrix, **args)

            elif use_implementation == "python":
                self.compute_similarity_object = Compute_Similarity_Python(
                    dataMatrix, **args)

            else:

                raise ValueError(
                    "Compute_Similarity: value for argument 'use_implementation' not recognized"
                )
Пример #3
0
    def __init__(self,
                 dataMatrix,
                 use_implementation="density",
                 similarity=None,
                 **args):
        """
        Interface object that will call the appropriate similarity implementation
        :param dataMatrix:              data the similarity is computed on, passed unchanged to the selected implementation.
                                        Its "data" attribute must contain only finite values
        :param use_implementation:      "density" will choose the implementation automatically: "python" for a numpy array,
                                        for a sparse matrix with more than half of its cells stored and for an
                                        unrecognized matrix type, "cython" otherwise
                                        "cython" will use the cython implementation, if available. Most efficient for sparse matrix
                                        "python" will use the python implementation. Most efficient for dense matrix
        :param similarity:              the type of similarity to use, see SimilarityFunction enum.
                                        "euclidean" is handled by Compute_Similarity_Euclidean and ignores use_implementation
        :param args:                    other args required by the specific similarity implementation
        :raises AssertionError:         if the data contains non finite values, or if similarity is not "euclidean" and
                                        the data has a single row with as many stored values as columns
        :raises ValueError:             if similarity is not "euclidean" and use_implementation is not one of
                                        "density", "cython", "python"
        """

        assert np.all(np.isfinite(dataMatrix.data)), \
            "Compute_Similarity: Data matrix contains {} non finite values".format(
                np.sum(np.logical_not(np.isfinite(dataMatrix.data))))

        self.dense = False

        if similarity == "euclidean":
            # This is only available here
            self.compute_similarity_object = Compute_Similarity_Euclidean(
                dataMatrix, **args)

        else:

            assert not (dataMatrix.shape[0] == 1 and dataMatrix.nnz == dataMatrix.shape[1]), \
                "Compute_Similarity: data has only 1 feature (shape: {}) with dense values," \
                " vector and set based similarities are not defined on 1-dimensional dense data," \
                " use Euclidean similarity instead.".format(dataMatrix.shape)

            if similarity is not None:
                args["similarity"] = similarity

            if use_implementation == "density":

                if isinstance(dataMatrix, np.ndarray):
                    self.dense = True

                elif sps.issparse(dataMatrix):
                    shape = dataMatrix.shape

                    num_cells = shape[0] * shape[1]

                    # Fraction of the cells that hold a stored value
                    density = dataMatrix.nnz / num_cells

                    self.dense = density > 0.5

                else:
                    print(
                        "Compute_Similarity: matrix type not recognized, calling default..."
                    )
                    use_implementation = "python"

                # Resolve the heuristic only for recognized matrix types, otherwise the
                # "python" fallback selected above would be overwritten with "cython"
                if use_implementation == "density":

                    if self.dense:
                        print("Compute_Similarity: detected dense matrix")
                        use_implementation = "python"
                    else:
                        use_implementation = "cython"

            if use_implementation == "cython":

                try:
                    from Base.Similarity.Cython.Compute_Similarity_Cython import Compute_Similarity_Cython
                    self.compute_similarity_object = Compute_Similarity_Cython(
                        dataMatrix, **args)

                except ImportError:
                    print(
                        "Unable to load Cython Compute_Similarity, reverting to Python"
                    )
                    self.compute_similarity_object = Compute_Similarity_Python(
                        dataMatrix, **args)

            elif use_implementation == "python":
                self.compute_similarity_object = Compute_Similarity_Python(
                    dataMatrix, **args)

            else:

                raise ValueError(
                    "Compute_Similarity: value for argument 'use_implementation' not recognized"
                )
Пример #4
0
    def __init__(self,
                 dataMatrix,
                 use_implementation="density",
                 similarity=None,
                 **args):
        """
        Interface object that will call the appropriate similarity implementation
        :param dataMatrix:              scipy sparse matrix |features|x|items| or |users|x|items|
                                        Its "data" attribute must contain only finite values
        :param use_implementation:      "density" will choose the implementation automatically: "python" for a numpy array,
                                        for a sparse matrix with more than half of its cells stored and for an
                                        unrecognized matrix type, "cython" otherwise
                                        "cython" will use the cython implementation, if available. Most efficient for sparse matrix
                                        "python" will use the python implementation. Most efficient for dense matrix
        :param similarity:              the type of similarity to use, see SimilarityFunction enum.
                                        "euclidean" is handled by Compute_Similarity_Euclidean and ignores use_implementation
        :param args:                    other args required by the specific similarity implementation
        :raises AssertionError:         if the data contains non finite values, or if similarity is not "euclidean" and
                                        the data has a single row with as many stored values as columns
        :raises ValueError:             if similarity is not "euclidean" and use_implementation is not one of
                                        "density", "cython", "python"
        """

        assert np.all(np.isfinite(dataMatrix.data)), \
            "Compute_Similarity: Data matrix contains {} non finite values".format(np.sum(np.logical_not(np.isfinite(dataMatrix.data))))

        self.dense = False

        if similarity == "euclidean":
            # This is only available here
            self.compute_similarity_object = Compute_Similarity_Euclidean(
                dataMatrix, **args)

        else:

            # Number of columns whose count of stored values equals the number of rows
            columns_with_full_features = np.sum(
                np.ediff1d(sps.csc_matrix(dataMatrix).indptr) ==
                dataMatrix.shape[0])

            # Both warnings below fire when at least half of the columns are completely filled
            if similarity in [
                    'dice', 'jaccard', 'tversky'
            ] and columns_with_full_features >= dataMatrix.shape[1] / 2:
                warnings.warn(
                    "Compute_Similarity: {:.2f}% of the columns have all features, "
                    "set-based similarity heuristics will not be able to discriminate between the columns."
                    .format(columns_with_full_features / dataMatrix.shape[1] *
                            100))

            if dataMatrix.shape[
                    0] == 1 and columns_with_full_features >= dataMatrix.shape[
                        1] / 2:
                warnings.warn(
                    "Compute_Similarity: {:.2f}% of the columns have a value for the single feature the data has, "
                    "most similarity heuristics will not be able to discriminate between the columns."
                    .format(columns_with_full_features / dataMatrix.shape[1] *
                            100))

            assert not (dataMatrix.shape[0] == 1 and dataMatrix.nnz == dataMatrix.shape[1]),\
                "Compute_Similarity: data has only 1 feature (shape: {}) with values in all columns," \
                " cosine and set-based similarities are not able to discriminate 1-dimensional dense data," \
                " use Euclidean similarity instead.".format(dataMatrix.shape)

            if similarity is not None:
                args["similarity"] = similarity

            if use_implementation == "density":

                if isinstance(dataMatrix, np.ndarray):
                    self.dense = True

                elif sps.issparse(dataMatrix):
                    shape = dataMatrix.shape

                    num_cells = shape[0] * shape[1]

                    # Fraction of the cells that hold a stored value
                    density = dataMatrix.nnz / num_cells

                    self.dense = density > 0.5

                else:
                    print(
                        "Compute_Similarity: matrix type not recognized, calling default..."
                    )
                    use_implementation = "python"

                # Resolve the heuristic only for recognized matrix types, otherwise the
                # "python" fallback selected above would be overwritten with "cython"
                if use_implementation == "density":

                    if self.dense:
                        print("Compute_Similarity: detected dense matrix")
                        use_implementation = "python"
                    else:
                        use_implementation = "cython"

            if use_implementation == "cython":

                try:
                    from Base.Similarity.Cython.Compute_Similarity_Cython import Compute_Similarity_Cython
                    self.compute_similarity_object = Compute_Similarity_Cython(
                        dataMatrix, **args)

                except ImportError:
                    print(
                        "Unable to load Cython Compute_Similarity, reverting to Python"
                    )
                    self.compute_similarity_object = Compute_Similarity_Python(
                        dataMatrix, **args)

            elif use_implementation == "python":
                self.compute_similarity_object = Compute_Similarity_Python(
                    dataMatrix, **args)

            else:

                raise ValueError(
                    "Compute_Similarity: value for argument 'use_implementation' not recognized"
                )