def test_gz_trie_match(self):
        """Match two sample texts against the gazetteer-backed matcher.

        Every match is compared as ``(surface_form, start_pos, end_pos)``.
        In the expected values below, ``end_pos`` is the index of the last
        character of the surface form, and a surface form may span a newline
        or a run of several spaces.
        """
        taxonomy_matcher = Matcher(gazetteer=self.gz_file)

        def spans_for(sample):
            # Materialize the matches first, then flatten them to tuples.
            found = list(taxonomy_matcher.matching(sample))
            return [
                (item.surface_form, item.start_pos, item.end_pos)
                for item in found
            ]

        first_sample = 'ab Foo bar Foo foo chao foo\nBar    foo bar foo foo'
        first_expected = [
            ('Foo bar Foo', 3, 13),
            ('chao', 19, 22),
            ('foo\nBar    foo', 24, 37),
            ('bar foo foo', 39, 49),
        ]
        self.assertEqual(spans_for(first_sample), first_expected)

        second_sample = 'ab Foo Foo foo foo\nBar     bar foo foo'
        second_expected = [
            ('foo\nBar', 15, 21),
            ('bar foo foo', 27, 37),
        ]
        self.assertEqual(spans_for(second_sample), second_expected)
    def test_partial_trie_match_braket(self):
        """Expect only the bare 'dutch' when it is followed by '(Current)'.

        The sample contains 'dutch (Current)'; the single expected match is
        'dutch' at offsets 62-66.
        """
        self.matcher = Matcher(gazetteer=self.gz_file.name)
        # Two adjacent literals: the second one starts with two spaces, so
        # 'dutch' begins at index 62 of the joined text.
        text = (
            "November 1954 Place of Birth : Rotterdam Holland Passport : "
            "  dutch (Current) Domiciled in NZ : 47 years"
        )

        found = list(self.matcher.matching(text))
        actual = [
            (item.surface_form, item.start_pos, item.end_pos)
            for item in found
        ]
        self.assertEqual(actual, [('dutch', 62, 66)])
    def test_partial_trie_match_doc_end(self):
        """Expect 'foo bar' to match when it is the last line of the text.

        The sample is two newlines, 'foo bar', then one trailing newline; the
        single expected match is 'foo bar' at offsets 2-8.
        """
        self.matcher = Matcher(gazetteer=self.gz_file.name)
        text = '\n\nfoo bar\n'

        found = list(self.matcher.matching(text))
        actual = [
            (item.surface_form, item.start_pos, item.end_pos)
            for item in found
        ]
        self.assertEqual(actual, [('foo bar', 2, 8)])
class ParticalMatcherTestCases(TestCase):
    """Matcher tests against a small gazetteer written to a temporary file.

    The gazetteer holds three entries: 'dutch', 'dutch (flemish)' and
    'foo bar'. Each test builds a ``Matcher`` from the file path and compares
    the matches as ``(surface_form, start_pos, end_pos)`` tuples.
    """

    def setUp(self):
        """Write the gazetteer entries, one per line, to a kept temp file."""
        entries = ('dutch', 'dutch (flemish)', 'foo bar')
        # delete=False keeps the file on disk after the handle is closed, so
        # the tests can hand its path to Matcher; tearDown removes it.
        self.gz_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
        with self.gz_file as handle:
            handle.write('\n'.join(entries))

    def test_partial_trie_match_braket(self):
        """Expect only the bare 'dutch' when it is followed by '(Current)'."""
        self.matcher = Matcher(gazetteer=self.gz_file.name)
        # Two adjacent literals: the second one starts with two spaces, so
        # 'dutch' begins at index 62 of the joined text.
        text = (
            "November 1954 Place of Birth : Rotterdam Holland Passport : "
            "  dutch (Current) Domiciled in NZ : 47 years"
        )

        found = list(self.matcher.matching(text))
        actual = [
            (item.surface_form, item.start_pos, item.end_pos)
            for item in found
        ]
        self.assertEqual(actual, [('dutch', 62, 66)])

    def test_partial_trie_match_doc_end(self):
        """Expect 'foo bar' to match when it is the last line of the text."""
        self.matcher = Matcher(gazetteer=self.gz_file.name)
        text = '\n\nfoo bar\n'

        found = list(self.matcher.matching(text))
        actual = [
            (item.surface_form, item.start_pos, item.end_pos)
            for item in found
        ]
        self.assertEqual(actual, [('foo bar', 2, 8)])

    def tearDown(self):
        """Remove the temporary gazetteer file created in setUp."""
        pathlib.Path(self.gz_file.name).unlink()
 def test_build_from_gz(self):
     """Compare the token trie built from the gazetteer with a literal.

     In the expected trie each nesting level is keyed by one lower-case
     token, and the 'xxENDxx' key holds a ``(phrase, None)`` tuple at the
     point where a phrase ends.
     """
     taxonomy_matcher = Matcher(gazetteer=self.gz_file)
     actual_trie = taxonomy_matcher.trie_matcher.token_trie

     expected_trie = {
         'abc': {'def': {'fed': {'xxENDxx': ('abc def fed', None)}}},
         'foo': {
             'bar': {
                 'xxENDxx': ('foo bar', None),
                 'foo': {'xxENDxx': ('foo bar foo', None)},
             },
         },
         'old': {'foo': {'xxENDxx': ('old foo', None)}},
         # The stored phrase keeps its double space.
         'new': {'foo': {'xxENDxx': ('new  foo', None)}},
         'bar': {'foo': {'foo': {'xxENDxx': ('bar foo foo', None)}}},
         'chao': {'xxENDxx': ('chao', None)},
     }
     self.assertEqual(actual_trie, expected_trie)
Пример #6
0
    def test_likelihood_from_nt(self):
        """the likelihood with the correct value"""

        taxonomy_matcher = Matcher(normtable=self.nt_file)
        print(taxonomy_matcher.trie_matcher.token_trie)
        text = '''try this: UI programming or Ui Programming, and math tools like
Mathématiques or mathematiques, or mathématiques.'''

        matched_phrases = list(taxonomy_matcher.matching(text))
        print(matched_phrases)
        self.assertEqual(len(matched_phrases), 5)
        self.assertEqual([
            matched_phrase.surface_form for matched_phrase in matched_phrases
        ], [
            'UI programming', 'Ui Programming', 'Mathématiques',
            'mathematiques', 'mathématiques'
        ], 'extracted phrases')
        self.assertEqual([
            matched_phrase.skill_likelihood
            for matched_phrase in matched_phrases
        ], [0.5, 0.5, 0.4, 0.4, 0.4], 'skill_likelihood')
Пример #7
0
    def test_nt_trie_match(self):
        taxonomy_matcher = Matcher(normtable=self.nt_file)
        text = '''A build script is required to do the UI programming or so
called user interface programming, the whole process can be managed by applying
Oracle agile product lifecycle management with the help of Orale Apache Mahout
expert (using Oracle agile PLM and MAHOUT).
'''

        self.assertEqual(
            [(match.surface_form, match.start_pos, match.end_pos,
              match.code_description)
             for match in list(taxonomy_matcher.matching(text))],
            [('build script', 2, 13, 'Build Script'),
             ('UI programming', 37, 50, 'User Interface Programming'),
             ('user interface programming', 65, 90,
              'User Interface Programming'),
             ('Oracle agile product lifecycle management', 138, 178,
              'Oracle Agile Product Lifecycle Management'),
             ('Apache Mahout', 203, 215, 'Mahout'),
             ('Oracle agile PLM', 231, 246,
              'Oracle Agile Product Lifecycle Management'),
             ('MAHOUT', 252, 257, 'Mahout')])
Пример #8
0
    def test_empty_surface_form(self):
        """Log error on empty surface form.

        Building a ``Matcher`` from the "empty" normalization resource must
        emit at least one ERROR-level log record, and the first one must
        mention "Empty surface form".
        """
        with self.assertLogs(level='ERROR') as log:
            Matcher("tests/resource/skills-normalization-EN-empty.json")

        # Inspect the captured output only after the block has exited: if
        # nothing was logged, assertLogs fails with its own clear message
        # instead of this line raising IndexError on an empty list.
        self.assertIn("Empty surface form", log.output[0])
Пример #9
0
 def test_build_from_nt(self):
     """Compare the token trie built from the normtable with a literal.

     In the expected trie each nesting level is keyed by one token, and the
     'xxENDxx' key holds a ``(phrase, code)`` tuple at the point where a
     phrase ends. Several phrases carry the same code.
     """
     taxonomy_matcher = Matcher(normtable=self.nt_file)
     actual_trie = taxonomy_matcher.trie_matcher.token_trie

     # Codes that appear under more than one phrase in the expected trie.
     ui_code = 'KSHI3HJOWVSR6PGHQ7CA'
     plm_code = 'KS0W11G2V8ETTQCKOV7S'
     mahout_code = 'KSRT0BE62KLQPF7WZC3O'

     expected_trie = {
         'linked': {
             'server': {
                 'xxENDxx': ('linked server', 'KSA8JE6A22KUR2OLU7RG'),
             },
         },
         'build': {
             'script': {
                 'xxENDxx': ('build script', 'KSFVUGQPCSO6RS0X07G8'),
             },
         },
         'user': {
             'interface': {
                 'programming': {
                     'xxENDxx': ('user interface programming', ui_code),
                 },
             },
         },
         'ui': {
             'programming': {
                 'xxENDxx': ('ui programming', ui_code),
             },
         },
         'oracle': {
             'agile': {
                 'product': {
                     'lifecycle': {
                         'management': {
                             'xxENDxx': (
                                 'oracle agile product lifecycle management',
                                 plm_code,
                             ),
                         },
                     },
                 },
                 'plm': {
                     'xxENDxx': ('oracle agile plm', plm_code),
                 },
             },
             'apache': {
                 'mahout': {
                     'xxENDxx': ('oracle apache mahout', mahout_code),
                 },
             },
         },
         'apache': {
             'mahout': {
                 'xxENDxx': ('apache mahout', mahout_code),
             },
         },
         'mahout': {
             'xxENDxx': ('Mahout', mahout_code),
         },
         # The key has no accent; the stored phrase keeps it.
         'mathematiques': {
             'xxENDxx': ('mathématiques', 'KS126706DPFD3354M7YK'),
         },
         'c++': {
             'xxENDxx': ('c++', 'KS126706DPFD3354M7Y0'),
         },
     }
     self.assertEqual(actual_trie, expected_trie)