def test_gz_trie_match(self):
        """Compare matched spans against expected values for two texts.

        A single ``Matcher`` is built with ``gazetteer=self.gz_file`` (the
        fixture is set up outside this method).  Every match is reduced to a
        ``(surface_form, start_pos, end_pos)`` triple and the resulting list
        must equal the expected list, in order.
        """
        gz_matcher = Matcher(gazetteer=self.gz_file)

        # Each case pairs an input text with the spans it must produce.
        # In the expected data, end_pos is the index of the last matched
        # character (e.g. 'Foo bar Foo' occupies indices 3 through 13).
        cases = (
            (
                'ab Foo bar Foo foo chao foo\nBar    foo bar foo foo',
                [
                    ('Foo bar Foo', 3, 13),
                    ('chao', 19, 22),
                    ('foo\nBar    foo', 24, 37),
                    ('bar foo foo', 39, 49),
                ],
            ),
            (
                'ab Foo Foo foo foo\nBar     bar foo foo',
                [
                    ('foo\nBar', 15, 21),
                    ('bar foo foo', 27, 37),
                ],
            ),
        )

        for raw_text, expected_spans in cases:
            hits = list(gz_matcher.matching(raw_text))
            found_spans = [
                (hit.surface_form, hit.start_pos, hit.end_pos)
                for hit in hits
            ]
            self.assertEqual(found_spans, expected_spans)
class ParticalMatcherTestCases(TestCase):
    """Matcher tests that run against a gazetteer written to a temp file.

    ``setUp`` writes three entries ('dutch', 'dutch (flemish)', 'foo bar')
    to a named temporary file; ``tearDown`` deletes that file again.
    """

    def setUp(self):
        """Create the temporary gazetteer file used by every test."""
        entries = '''dutch
dutch (flemish)
foo bar'''
        # delete=False keeps the file on disk after the handle is closed so
        # that the tests can hand its path to Matcher; tearDown removes it.
        handle = tempfile.NamedTemporaryFile(mode='w', delete=False)
        self.gz_file = handle
        with handle:
            handle.write(entries)

    def tearDown(self):
        """Remove the temporary gazetteer file created in setUp."""
        pathlib.Path(self.gz_file.name).unlink()

    def _match_spans(self, text):
        """Build ``self.matcher`` from the temp gazetteer and match *text*.

        Returns a list of ``(surface_form, start_pos, end_pos)`` triples,
        one per match, in the order the matcher yields them.
        """
        self.matcher = Matcher(gazetteer=self.gz_file.name)
        return [
            (hit.surface_form, hit.start_pos, hit.end_pos)
            for hit in self.matcher.matching(text)
        ]

    def test_partial_trie_match_braket(self):
        """Only 'dutch' is reported when the text reads 'dutch (Current)'.

        The gazetteer holds both 'dutch' and 'dutch (flemish)'; the text
        shares the 'dutch (' prefix with the longer entry but not the rest.
        """
        # The backslash joins the two source lines into one string, so the
        # two leading spaces of the second line are part of the text.
        text = "November 1954 Place of Birth : Rotterdam Holland Passport : \
  dutch (Current) Domiciled in NZ : 47 years"

        expected = [
            ('dutch', 62, 66),
        ]
        self.assertEqual(self._match_spans(text), expected)

    def test_partial_trie_match_doc_end(self):
        """'foo bar' is matched when only a newline follows it in the text."""
        text = '''

foo bar
'''
        expected = [
            ('foo bar', 2, 8),
        ]
        self.assertEqual(self._match_spans(text), expected)
Пример #3
0
    def test_likelihood_from_nt(self):
        """the likelihood with the correct value"""

        taxonomy_matcher = Matcher(normtable=self.nt_file)
        print(taxonomy_matcher.trie_matcher.token_trie)
        text = '''try this: UI programming or Ui Programming, and math tools like
Mathématiques or mathematiques, or mathématiques.'''

        matched_phrases = list(taxonomy_matcher.matching(text))
        print(matched_phrases)
        self.assertEqual(len(matched_phrases), 5)
        self.assertEqual([
            matched_phrase.surface_form for matched_phrase in matched_phrases
        ], [
            'UI programming', 'Ui Programming', 'Mathématiques',
            'mathematiques', 'mathématiques'
        ], 'extracted phrases')
        self.assertEqual([
            matched_phrase.skill_likelihood
            for matched_phrase in matched_phrases
        ], [0.5, 0.5, 0.4, 0.4, 0.4], 'skill_likelihood')
Пример #4
0
    def test_nt_trie_match(self):
        taxonomy_matcher = Matcher(normtable=self.nt_file)
        text = '''A build script is required to do the UI programming or so
called user interface programming, the whole process can be managed by applying
Oracle agile product lifecycle management with the help of Orale Apache Mahout
expert (using Oracle agile PLM and MAHOUT).
'''

        self.assertEqual(
            [(match.surface_form, match.start_pos, match.end_pos,
              match.code_description)
             for match in list(taxonomy_matcher.matching(text))],
            [('build script', 2, 13, 'Build Script'),
             ('UI programming', 37, 50, 'User Interface Programming'),
             ('user interface programming', 65, 90,
              'User Interface Programming'),
             ('Oracle agile product lifecycle management', 138, 178,
              'Oracle Agile Product Lifecycle Management'),
             ('Apache Mahout', 203, 215, 'Mahout'),
             ('Oracle agile PLM', 231, 246,
              'Oracle Agile Product Lifecycle Management'),
             ('MAHOUT', 252, 257, 'Mahout')])