def test_build_matrix(self):
    """Should create the same csr matrix for master and dupe when only a master series is given"""
    grouper = StringGrouper(pd.Series(['foo', 'bar', 'baz']))
    tf_idf_master, tf_idf_dupe = grouper._get_tf_idf_matrices()

    # Dense form of the expected matrix, compared against both results.
    expected = csr_matrix([
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 1., 0.],
    ]).toarray()
    for actual in (tf_idf_master, tf_idf_dupe):
        np.testing.assert_array_equal(expected, actual.toarray())
# Example no. 2
# 0
    def test_build_matches(self):
        """Should create the cosine similarity matrix of two series"""
        master_strings = pd.Series(['foo', 'bar', 'baz'])
        duplicate_strings = pd.Series(['foo', 'bar', 'bop'])
        grouper = StringGrouper(master_strings, duplicate_strings)
        tf_idf_master, tf_idf_dupe = grouper._get_tf_idf_matrices()

        # 'foo' and 'bar' occur in both series at the same positions; the
        # third entries differ ('baz' vs 'bop'), so the last row is all zeros.
        expected = np.array([
            [1., 0., 0.],
            [0., 1., 0.],
            [0., 0., 0.],
        ])
        actual = grouper._build_matches(tf_idf_master, tf_idf_dupe).toarray()
        np.testing.assert_array_equal(expected, actual)
    def test_build_matrix_master_and_duplicates(self):
        """Should create one csr matrix for master and one for duplicates"""
        grouper = StringGrouper(
            pd.Series(['foo', 'bar', 'baz']),
            pd.Series(['foo', 'bar', 'bop']),
        )
        actual_master, actual_dupe = grouper._get_tf_idf_matrices()

        # Four columns are expected; the two matrices differ only in the
        # last row, where the input strings differ ('baz' vs 'bop').
        expected_master = csr_matrix([
            [0., 0., 0., 1.],
            [1., 0., 0., 0.],
            [0., 1., 0., 0.],
        ])
        expected_dupe = csr_matrix([
            [0., 0., 0., 1.],
            [1., 0., 0., 0.],
            [0., 0., 1., 0.],
        ])

        comparisons = (
            (expected_master, actual_master),
            (expected_dupe, actual_dupe),
        )
        for expected, actual in comparisons:
            np.testing.assert_array_equal(expected.toarray(), actual.toarray())