예제 #1
0
def test_lsi_helper_class():
    """Check the output shapes produced by ``_TruncatedSVD_LSI``.

    The helper is fitted on a random sparse matrix; the fitted components
    and both projections (``transform_lsi`` and ``transform_lsi_norm``) must
    have the expected dimensions.
    """
    import scipy.sparse

    n_samples, n_features, n_components = 100, 10000, 20

    X = scipy.sparse.rand(n_samples, n_features)
    lsi = _TruncatedSVD_LSI(n_components=n_components)
    lsi.fit(X)

    # compute both projections up front, then verify all shapes
    projections = [lsi.transform_lsi(X), lsi.transform_lsi_norm(X)]

    assert lsi.components_.shape == (n_components, X.shape[1])
    for projected in projections:
        assert projected.shape == (n_samples, n_components)
예제 #2
0
    def fit_transform(self, n_components=150, n_iter=5, alpha=0.33):
        """
        Perform the SVD decomposition

        The LSI model is fitted on ``self.pipeline.data``. The parameters,
        the fitted model and the projected dataset are dumped with joblib
        into the model directory (files ``pars``, ``model`` and ``data``),
        and ``self.mid`` is set to the id of that model.

        Parameters
        ----------
        n_components : int
           requested number of singular values (number of LSI dimensions);
           the number actually used by the SVD is the value returned by
           ``_compute_lsi_dimensionality`` for this dataset
        n_iter : int
           number of iterations for the stochastic SVD algorithm
        alpha : float
           forwarded, together with the dataset shape, to
           ``_compute_lsi_dimensionality`` when the number of components
           is determined

        Returns
        -------
        lsi : _TruncatedSVD_LSI
           the fitted TruncatedSVD object
        exp_var : float
           the sum of the explained variance ratios of the fitted
           decomposition

        Raises
        ------
        IOError
           if the dataset directory ``self.fe.dsid_dir`` does not exist
        """
        parent_id = self.pipeline.mid

        dsid_dir = self.fe.dsid_dir
        if not dsid_dir.exists():
            raise IOError('Dataset directory {} does not exist!'
                          .format(dsid_dir))

        # note: the requested (not the adjusted) number of components
        # is what gets persisted with the model
        pars = {'parent_id': parent_id, 'n_components': n_components}

        mid_dir_base = dsid_dir / self._wrapper_type

        mid, mid_dir = setup_model(mid_dir_base, mid=self.mid, mode=self.mode)

        ds = self.pipeline.data
        n_components_opt = _compute_lsi_dimensionality(n_components,
                                                       *ds.shape,
                                                       alpha=alpha)
        lsi = _TruncatedSVD_LSI(n_components=n_components_opt,
                                n_iter=n_iter,
                                random_state=self.random_state)
        lsi.fit(ds)

        ds_p = lsi.transform_lsi_norm(ds)

        joblib.dump(pars, str(mid_dir / 'pars'))
        joblib.dump(lsi, str(mid_dir / 'model'))
        joblib.dump(ds_p, str(mid_dir / 'data'))

        exp_var = lsi.explained_variance_ratio_.sum()
        self.mid = mid

        return lsi, exp_var
예제 #3
0
def test_lsi_book_example():
    """ LSI example taken from the "Information retrieval" (2004) book by Grossman & Frieder

    This illustrates the general principle of LSI using sklearn API with
    _TruncatedSVD_LSI: the projections computed by the helper are compared
    against a reference projection built from a full SVD.
    """
    import scipy.linalg
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # "a" is written as "aa" because the former seems to be ignored by
    # the CountVectorizer
    corpus = [
        "Shipment of gold damaged in aa fire.",
        "Delivery of silver arrived in aa silver truck.",
        "Shipment of gold arrived in aa truck.",
    ]
    query_text = "gold silver truck"

    vectorizer = CountVectorizer()
    vectorizer.fit(corpus)
    X = vectorizer.transform(corpus)

    n_terms = X.shape[1]
    assert n_terms == 11
    # total number of elements in the document matrix
    assert X.sum() == 22

    q = vectorizer.transform([query_text])

    lsi = _TruncatedSVD_LSI(n_components=2)
    lsi.fit(X)
    X_p = lsi.transform_lsi(X)
    q_p = lsi.transform_lsi(q)

    # reference projection: full SVD of the transposed document matrix,
    # with the last singular value / vector dropped
    U, s, _ = scipy.linalg.svd(X.todense().T, full_matrices=False)
    U_k = U[:, :-1]

    q_ref = q.dot(U_k).dot(np.diag(1. / s[:-1]))
    assert_allclose(np.abs(q_ref), np.array([[0.2140, 0.1821]]), rtol=1e-3)
    X_ref = X.dot(U_k).dot(np.diag(1. / s[:-1]))

    # only magnitudes are compared (absolute values on both sides)
    assert_allclose(np.abs(X_ref), np.abs(X_p))
    assert_allclose(np.abs(q_ref), np.abs(q_p))

    similarity = cosine_similarity(X_p, q_p)
    expected = np.array([-0.05, 0.9910, 0.9543])

    # only the first two documents are checked against the expected values
    assert_allclose(similarity[:2], expected[:2, None],
                    rtol=2e-2, atol=1e-2)
예제 #4
0
def test_search(kind):
    """Check that ``Search`` ranks the expected document first.

    Parameters
    ----------
    kind : str
       ``'semantic'`` fits an LSI model and searches on the normalized LSI
       projection; any other value searches on the vectorized matrix
       directly, without LSI
    """
    corpus = [
        "To be, or not to be; that is the question;",
        "Whether ‘tis nobler in the mind to suffer",
        "The slings and arrows of outrageous fortune,",
        "Or to take arms against a sea of troubles,",
        "And by opposing end them. To die: to sleep:",
        "Nor more; and by a sleep to say we end",
        "The heart-ache and the thousand natural shocks",
        "That flesh is heir to; ‘tis a consummation",
        "Devoutly to be wished. To die; to sleep;",
        "To sleep: perchance to dream: aye, there is the rub;",
        "For in that sleep of death what dreams may come,",
        "When we have shuffled off this mortal coil,",
        "Must give us pause: there’s the respect",
        "That makes calamity of so long life;"
    ]

    vect = CountVectorizer()
    term_counts = vect.fit_transform(corpus)
    idf = SmartTfidfTransformer('nnc')
    X_vect = idf.fit_transform(term_counts)

    # default: no LSI, search directly on the vectorized matrix
    lsi = None
    X = X_vect
    if kind == 'semantic':
        lsi = _TruncatedSVD_LSI(n_components=20)
        lsi.fit(X_vect)
        X = lsi.transform_lsi_norm(X_vect)

    searcher = Search(vect, idf, lsi)
    searcher.fit(X)

    # (query, index of the document expected to score highest)
    cases = [(corpus[2], 2), ('death dreams', 10)]
    for query, expected_best in cases:
        scores = searcher.search(query)
        assert scores.shape == (X.shape[0], )
        assert scores.argmax() == expected_best
        # every score must lie within [-1, 1], up to a small tolerance
        assert_array_less(scores, 1.001)
        assert_array_less(-1 - 1e-9, scores)