Example #1
    def __init__(
            self,
            normtable=None,
            gazetteer=None,
            codetable=None,
            blacklist=None,
            regexp=None,
            with_context=False
    ):
        self.regexp = regexp
        self.tokenizer = Tokenizer(self.regexp)
        self.blacklist = dict()
        self.with_context = with_context

        if normtable:
            match_patterns = PatternsNT(self.tokenizer, normtable)
        elif gazetteer:
            match_patterns = PatternsGZ(self.tokenizer, gazetteer)
        elif codetable:
            match_patterns = PatternsCT(self.tokenizer, codetable)
        else:
            raise Exception(
                'source file is required to build a Matcher object'
            )

        self.code_property_mapping = match_patterns.codeid_description
        self.meta_info = match_patterns.meta_info

        if blacklist:
            self.blacklist = data_utils.from_file_to_list(blacklist)

        self.trie_matcher = TokenTrie(
            patterns=match_patterns.tokenized_pattern
        )
Example #2
def tokenize(string):
    '''tokenize string, and return the list of normalized tokens'''
    # declare TOKENIZER as module-level; without this, the assignment below
    # would create a new local Tokenizer on every call instead of caching it
    global TOKENIZER
    string = normalize_chars(string)
    try:
        TOKENIZER  # NOQA
    except NameError:
        TOKENIZER = Tokenizer()

    return [_norm_token(token) for token in TOKENIZER.tokenize(string)]
Example #3
def main():
    args = get_args()
    trxml_miner = TRXMLMiner(TEXT_FIELD)

    if os.path.isdir(args.input):
        files = [
            os.path.join(args.input, file) for file in os.listdir(args.input)
        ]
    elif os.path.isfile(args.input):
        files = [args.input]
    else:
        raise ValueError('Input does not exist')

    model, vocab_to_ids = _load_model_and_vocab(args)
    tokenizer = Tokenizer()

    for file in files:
        selected_value = list(trxml_miner.mine(file))
        input_text = selected_value[0]['values'][TEXT_FIELD]
        data = _input_text_to_pad_id(input_text, vocab_to_ids, tokenizer)
        result = model(data)
        probabilities = result['probabilities'][0]
        print(file, probabilities)
Example #4
class TokenTrieTestCases(TestCase):
    def setUp(self):
        self.gz_file = 'tests/resource/gazetteer.txt'
        self.text = 'ab Foo bar Foo foo chao foo\nBar    foo bar foo foo'
        self.tokenizer = Tokenizer()


    def test_append_list_to_local_trie(self):
        token_trie = TokenTrie(patterns=None)
        patterns = TokenizedPattern(["a", "b", "c", "d"])
        self.assertEqual(
            token_trie._append_token_list_to_trie(patterns),
            {'a': {'b': {'c': {'d': {'xxENDxx': ('', None)}}}}}
        )
        patterns = TokenizedPattern(["a"])
        self.assertEqual(
            token_trie._append_token_list_to_trie(patterns),
            {'a': {'xxENDxx': ('', None)}}
        )
        patterns = TokenizedPattern([])
        self.assertEqual(
            token_trie._append_token_list_to_trie(patterns),
            {'xxENDxx': ('', None)}
        )

    def test_build_from_patterns(self):
        patterns = ["foo bar",
                    "foo",
                    "foo bar foo",
                    "bar bar foo",
                    "bar foo foo"]
        tokenized_patterns = (
            TokenizedPattern(self.tokenizer.tokenize(normalize(pattern)), pattern)
            for pattern in patterns
        )

        token_trie = TokenTrie(patterns=tokenized_patterns)
        self.assertEqual(token_trie.token_trie,
            {'bar':
                {'bar':
                    {'foo':
                        {'xxENDxx': ('bar bar foo', None)}
                    },
                 'foo':
                    {'foo':
                        {'xxENDxx': ('bar foo foo', None)}
                    }
                 },
             'foo':
                {'bar':
                    {'foo':
                        {'xxENDxx': ('foo bar foo', None)},
                     'xxENDxx': ('foo bar', None)},
                 'xxENDxx': ('foo', None)}
            })

    def test_build_from_repeated_pattern(self):
        patterns = ['b tree', 'b - tree', 'B - tree', 'AS/400', 'as 400']
        tokenized_patterns = (
            TokenizedPattern(self.tokenizer.tokenize(normalize(pattern)), pattern)
            for pattern in patterns
        )
        token_trie = TokenTrie(patterns=tokenized_patterns)
        self.assertEqual(token_trie.token_trie,
            {'b':
                {'tree':
                    {'xxENDxx': ('b tree', None)},
                 '-':
                    {'tree':
                        {'xxENDxx': ('B - tree', None)}
                    }
                },
             'as':
                {'/':
                    {'400':
                        {'xxENDxx': ('AS/400', None)}
                    },
                '400':
                    {'xxENDxx': ('as 400', None)}
                }
            }
        )

    def test_match_at_position(self):
        patterns = PatternsGZ(self.tokenizer, self.gz_file)
        token_trie = TokenTrie(patterns=patterns.tokenized_pattern)
        tokens = self.tokenizer.tokenize_with_pos_info(normalize(self.text))

        self.assertEqual(
            [
                [token.text for token in match.tokens]
                for match in (token_trie.match_at_position(
                                token_trie.token_trie,
                                tokens[1:])
                             )
            ],
            [['foo', 'bar'], ['foo', 'bar', 'foo']]
        )

        self.assertEqual(
            [
                [token.text for token in match.tokens]
                for match in (token_trie.match_at_position(
                                token_trie.token_trie,
                                tokens[2:])
                             )
            ],
            [['bar', 'foo', 'foo']]
        )

        self.assertEqual(
            [
                [token.text for token in match.tokens]
                for match in (token_trie.match_at_position(
                                token_trie.token_trie,
                                tokens[3:])
                             )
            ],
            []
        )


    def test_longest_match_at_position(self):
        patterns = PatternsGZ(self.tokenizer, self.gz_file)
        token_trie = TokenTrie(patterns=patterns.tokenized_pattern)

        tokens = self.tokenizer.tokenize_with_pos_info(normalize(self.text))

        self.assertEqual(
            [
                token.text
                for token in token_trie.longest_match_at_position(
                                token_trie.token_trie,
                                tokens[1:]).tokens
            ],
            ['foo', 'bar', 'foo']
        )

        self.assertEqual(
            [
                token.text
                for token in token_trie.longest_match_at_position(
                                token_trie.token_trie,
                                tokens[2:]).tokens
            ],
            ['bar', 'foo', 'foo']
        )

        self.assertEqual(
            [
                token.text
                for token in token_trie.longest_match_at_position(
                                token_trie.token_trie,
                                tokens[3:])
            ],
            []
        )
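
A short illustrative sketch of the nested-dict trie shape asserted in the tests above: tokens are nested keys and 'xxENDxx' marks a complete pattern with its (surface form, code id) payload. This is not the TokenTrie implementation itself, only the data structure it produces.

def build_trie(token_lists):
    # illustrative only: reproduce the shape checked in the assertions above
    trie = {}
    for tokens in token_lists:
        node = trie
        for token in tokens:
            node = node.setdefault(token, {})
        node['xxENDxx'] = (' '.join(tokens), None)
    return trie

# build_trie([['foo', 'bar'], ['foo']]) returns
# {'foo': {'bar': {'xxENDxx': ('foo bar', None)}, 'xxENDxx': ('foo', None)}}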
Example #5
    def setUp(self):
        self.gz_file = 'tests/resource/gazetteer.txt'
        self.text = 'ab Foo bar Foo foo chao foo\nBar    foo bar foo foo'
        self.tokenizer = Tokenizer()
Example #6
class Matcher():
    '''
    Matcher finds all matched phrases from the input text. It contains
        - an internal tokenizer applied on both patterns and input text
        - a token trie structure created from a normtable, gazetteer or codetable
        - a dictionary to map codeID to code_description and code_category

    Params:
        - normtable: normalized table file in json format
        - gazetteer: gazetteer file
        - codetable: taxonomy codetable format file
        - blacklist: blacklist file
        - with_context: also output the context if set to True
    '''

    # the magic number is related to the average length of the context in the
    # training data
    CONTEXT_LENGTH = 14

    def __init__(
            self,
            normtable=None,
            gazetteer=None,
            codetable=None,
            blacklist=None,
            regexp=None,
            with_context=False
    ):
        self.regexp = regexp
        self.tokenizer = Tokenizer(self.regexp)
        self.blacklist = dict()
        self.with_context = with_context

        if normtable:
            match_patterns = PatternsNT(self.tokenizer, normtable)
        elif gazetteer:
            match_patterns = PatternsGZ(self.tokenizer, gazetteer)
        elif codetable:
            match_patterns = PatternsCT(self.tokenizer, codetable)
        else:
            raise Exception(
                'source file is required to build a Matcher object'
            )

        self.code_property_mapping = match_patterns.codeid_description
        self.meta_info = match_patterns.meta_info

        if blacklist:
            self.blacklist = data_utils.from_file_to_list(blacklist)

        self.trie_matcher = TokenTrie(
            patterns=match_patterns.tokenized_pattern
        )

    def matching(self, text):
        '''
        find all matching phrases from the input text

        params:
            - text: string

        output:
            - all matching phrases as MatchedPhrase object
        '''
        tokens = self.tokenizer.tokenize_with_pos_info(text)
        for token in tokens:
            token.text = data_utils.normalize(token.text)
        idx = 0
        nr_tokens = len(tokens)
        while idx < nr_tokens:
            local_match = self.trie_matcher.longest_match_at_position(
                self.trie_matcher.token_trie, tokens[idx:])

            if local_match:
                start_pos, end_pos = local_match.text_range()
                left_context, right_context = self.prepare_context(tokens,
                                                                   local_match,
                                                                   idx,
                                                                   text)
                surface_form = local_match.surface_form
                matched_text = text[start_pos:end_pos]
                yield MatchedPhrase(
                    surface_form,
                    matched_text,
                    start_pos,
                    end_pos - 1,  # prepare for the entity framework (in Perl)
                    local_match.code_id,
                    self.code_id_property_lookup(local_match.code_id, 'desc'),
                    self.code_id_property_lookup(local_match.code_id, 'type'),
                    left_context,
                    right_context,
                    self.code_id_property_lookup(
                            local_match.code_id,
                            'skill_likelihoods',
                            dict()
                    ).get(data_utils.normalize(surface_form), None)
                )
                idx += len(local_match.tokens)
            else:
                idx += 1

    def prepare_context(self, tokens, local_match, idx, text):
        l_context = ''
        r_context = ''
        if self.with_context:
            nr_matched_tokens = len(local_match.tokens)
            l_context_begin = max(0, idx - self.CONTEXT_LENGTH)
            l_context_end = idx
            r_context_begin = idx + nr_matched_tokens
            r_context_end = min(
                len(tokens),
                r_context_begin + self.CONTEXT_LENGTH
            )
            if l_context_begin < l_context_end:
                l_context = text[tokens[l_context_begin].start_pos:
                                 tokens[l_context_end - 1].end_pos]
            if r_context_begin < r_context_end:
                r_context = text[tokens[r_context_begin].start_pos:
                                 tokens[r_context_end - 1].end_pos]
        return l_context, r_context

    def code_id_property_lookup(self, code_id, property_name, default=None):
        code_property = default
        if code_id is not None:
            if code_id in self.code_property_mapping:
                code_property = self.code_property_mapping[code_id].get(
                    property_name, default)
            else:
                LOGGER.warning(
                    'code_id %s not found in code property mapping, '
                    'cannot look up property %s', code_id, property_name
                )
        return code_property
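
A minimal usage sketch for the Matcher class above, reusing the gazetteer path and sample text from the tests; the MatchedPhrase attribute names are not shown in these snippets, so the matches are simply printed whole.

# sketch only: build a Matcher from a gazetteer and iterate over its matches
matcher = Matcher(gazetteer='tests/resource/gazetteer.txt', with_context=True)
for phrase in matcher.matching('ab Foo bar Foo foo chao foo'):
    print(phrase)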
Example #7
import re
from easy_tokenizer.normalizer import normalize_chars
from easy_tokenizer.tokenizer import Tokenizer
from easy_tokenizer.patterns import Patterns

HAS_TOKEN_REGEXP = re.compile(r'\w')
TOKEN_REGEXP = re.compile(r'\w+|[^\w\s]+')
TOKENIZER = Tokenizer()


def tokenize(string):
    '''tokenize string, and return the list of normalized tokens'''
    # reuse the module-level TOKENIZER created above; without the global
    # declaration the assignment below would shadow it with a local name
    global TOKENIZER
    string = normalize_chars(string)
    try:
        TOKENIZER  # NOQA
    except NameError:
        TOKENIZER = Tokenizer()

    return [_norm_token(token) for token in TOKENIZER.tokenize(string)]


def _norm_token(token):
    norm_token = token.upper()
    if Patterns.URL_RE.fullmatch(token):
        norm_token = 'xxURLxx'
    elif Patterns.EMAIL_RE.fullmatch(token):
        norm_token = 'xxEMAILxx'
    elif Patterns.DOMAIN_RE.fullmatch(token):
        norm_token = 'xxAT_DOMAINxx'
    elif Patterns.YEAR_RE.fullmatch(token):
        norm_token = 'xxYEARxx'
    return norm_token
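
An indicative usage example for tokenize(); the exact placeholder substitutions depend on the Patterns regexes used by _norm_token.

# indicative only: URLs, e-mails, domains and years become placeholders,
# all other tokens are upper-cased
print(tokenize('Visit https://example.com in 1999'))
# e.g. ['VISIT', 'xxURLxx', 'IN', 'xxYEARxx']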
Example #8
    def setUp(self):
        self.tokenizer = Tokenizer()
Example #9
class TokenizerTestCases(TestCase):
    def setUp(self):
        self.tokenizer = Tokenizer()

    def test_simple_sentence(self):
        text = ['this is a normal sentence', 'this is a normal-sentence']
        expected_tokens = [['this', 'is', 'a', 'normal', 'sentence'],
                           ['this', 'is', 'a', 'normal', '-', 'sentence']]
        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_char_with_hat(self):
        text = [
            'Alp NADİ\nCoordination potential bidders.',
            'Hurmoğlu, Botaş BTC, 2002\nCoordination of packages.'
        ]
        expected_tokens = [[
            'Alp', 'NADİ', 'Coordination', 'potential', 'bidders', '.'
        ],
                           [
                               'Hurmoğlu', ',', 'Botaş', 'BTC', ',', '2002',
                               'Coordination', 'of', 'packages', '.'
                           ]]
        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_tokenize_hyphen(self):
        text = [
            'this is a normal-sentence', 'he is my ex-boyfriend',
            'Sociale Option PME-PMI'
        ]
        expected_tokens = [['this', 'is', 'a', 'normal', '-', 'sentence'],
                           ['he', 'is', 'my', 'ex-boyfriend'],
                           ['Sociale', 'Option', 'PME', '-', 'PMI']]

        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_tokenize_slash(self):
        text = ['Programmation C/C++/Java/PHP/Perl/JSP.']
        expected_tokens = [[
            'Programmation', 'C', '/', 'C++', '/', 'Java', '/', 'PHP', '/',
            'Perl', '/', 'JSP.'
        ]]
        for index in range(len(expected_tokens)):
            tokens = list(self.tokenizer.tokenize(text[index]))
            self.assertEqual(tokens, expected_tokens[index])

    def test_tokenize_phrase(self):
        text = [
            'office(XP)', '(ex)girlfriend', 'Happy(12)Birthday',
            'een#exception', 'AS/400'
        ]
        expected_tokens = [["office", "(", "XP", ")"],
                           ["(", "ex", ")", "girlfriend"],
                           ["Happy", "(", "12", ")", "Birthday"],
                           ['een#exception'], ['AS', '/', '400']]

        for index in range(len(expected_tokens)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_old_tokenizer_cases(self):
        text = [
            "een mooie test zin.", "foo...", "foo...bar", "l'agence",
            "d'origine", "foo,bar", "foo.bar", "Aug. foo.",
            '[email protected] [email protected] [email protected]',
            'http://www.viadeo.com/profile/34mies', '..... -----',
            '.net c++ F#', '$100,000.99 10.12 10,12 10.123 10,123 10.000,12',
            '1st', 'S.I.M.    S I M    S. I. M.    SIM.', 'end of "phrase".'
        ]
        expected_tokens = [
            ['een', 'mooie', 'test', 'zin', '.'], ['foo', '.', '.', '.'],
            ['foo', '.', '.', '.', 'bar'], ["l", "'", "agence"],
            ["d", "'", "origine"], ['foo', ',', 'bar'], ['foo.bar'],
            ['Aug.', 'foo', '.'],
            ['*****@*****.**', '*****@*****.**', '*****@*****.**'],
            ['http://www.viadeo.com/profile/34mies'], ['.....', '-----'],
            ['.net', 'c++', 'F#'],
            [
                '$', '100,000.99', '10.12', '10,12', '10.123', '10,123',
                '10.000,12'
            ], ['1st'], ['S.I.M.', 'S', 'I', 'M', 'S.', 'I.', 'M.', 'SIM.'],
            ['end', 'of', '"', 'phrase', '"', '.']
        ]
        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_digits_extraction(self):
        text = ['BSS (S4d0, S4D1), VOIP']
        expected_tokens = [['BSS', '(', 'S4d0', ',', 'S4D1', ')', ',', 'VOIP']]
        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_dot_comma_extraction(self):
        text = ['2000 a dec.01', 'HP-UX, C++, BASIC', 'java, c++, python']
        expected_tokens = [['2000', 'a', 'dec.', '01'],
                           ['HP-UX', ',', 'C++', ',', 'BASIC'],
                           ['java', ',', 'c++', ',', 'python']]
        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_url_email(self):
        text = '''http://live.textkernel.nl", [email protected] and this is
another one, here https://www.damienpontifex.com/ch/05/06/image-classifier/ or
try this one https://docs.google.com/document/d/1pd/edit?ts=5da580bc'''
        self.assertEqual(self.tokenizer.tokenize(text), [
            'http://live.textkernel.nl', '"', ',', '*****@*****.**', 'and',
            'this', 'is', 'another', 'one', ',', 'here',
            'https://www.damienpontifex.com/ch/05/06/image-classifier/', 'or',
            'try', 'this', 'one',
            'https://docs.google.com/document/d/1pd/edit?ts=5da580bc'
        ])

    def test_email(self):
        text = ["e-mail : [email protected]\ngsm"]
        self.assertEqual(
            [self.tokenizer.tokenize(phrase) for phrase in text],
            [['e-mail', ':', '*****@*****.**', 'gsm']])

    def test_url_extraction(self):
        text = self.get_url_text()
        expected_tokens = [[
            'REFERENCES', 'http://www.erdfdistribution.fr',
            'http://www.idtgv.com', 'http://www.edf-bleuciel.fr',
            'idnight.idtgv.com', 'http://investisseurs.edf.com',
            'http://www.metaxa.com', 'ina.edf.com', 'http://www.hutchinson.fr',
            'http://www.lagardere.com', 'http://www.direxi.fr',
            'http://www.fondation-jeanluclagardere.com',
            'http://www.blog-des-astucieuses.fr', 'http://www.investir.fr',
            'http://www.clubnewsparis.com', 'http://www.cointreau.fr'
        ],
                           [
                               '1', '/', '2', 'Cecile', 'NGUYEN',
                               'http://cecile.nguyen.site.voila.fr/', '7'
                           ],
                           [
                               'lien', 'suivant',
                               'http://www.gemo.tm.fr/recrutement/formulaire',
                               'Gemo'
                           ], ['http://www.01podcast.com', '-'],
                           ['http://bit.ly/j2JoOL'],
                           [
                               '2', 'ENFANTS', '-',
                               'http://caroline-podevin.com/'
                           ],
                           ['(', 'http://www.3ds.com/customer-stories/', ')'],
                           [
                               '4D', 'S.A', 'http://www.4D.fr', '.', 'ITK',
                               'http://www.internet-toolkit.com', '(', 'outil',
                               'TCP', '/', 'IP'
                           ],
                           [
                               'this', 'is', 'a', 'url', ':',
                               'http://live.textkernel.nl', ',', 'and', 'you',
                               'can', 'chat', 'with', 'it'
                           ]]
        for index in range(len(expected_tokens)):
            tokens = list(self.tokenizer.tokenize(text[index]))
            self.assertEqual(tokens, expected_tokens[index])

    def test_long_url(self):
        text = self.get_long_url_text()
        tokens = list(self.tokenizer.tokenize(text))
        #print("\n\n".join(tokens))
        self.assertEqual(len(tokens), 26)

    def test_word_concatenate_nonword(self):
        text = [
            "**Français : Niveau élevé", "*Français", "*Anglais. (Scolaire)",
            "Arabe...", "Arabe..", "#net", "??Arabe", "'abc', why!!!"
        ]

        expected_tokens = [['*', '*', 'Français', ':', 'Niveau', 'élevé'],
                           ['*', 'Français'],
                           ['*', 'Anglais', '.', '(', 'Scolaire', ')'],
                           ['Arabe', '.', '.', '.'], ['Arabe', '.', '.'],
                           ['#net'], ['?', '?', 'Arabe'],
                           ['\'', 'abc', '\'', ',', 'why', '!', '!', '!']]
        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    @staticmethod
    def get_url_text():
        text = [
            '''REFERENCES http://www.erdfdistribution.fr
http://www.idtgv.com http://www.edf-bleuciel.fr
idnight.idtgv.com http://investisseurs.edf.com
http://www.metaxa.com ina.edf.com http://www.hutchinson.fr
http://www.lagardere.com http://www.direxi.fr
http://www.fondation-jeanluclagardere.com http://www.blog-des-astucieuses.fr
http://www.investir.fr http://www.clubnewsparis.com http://www.cointreau.fr''',
            '1 / 2 Cecile NGUYEN http://cecile.nguyen.site.voila.fr/ 7',
            'lien suivant http://www.gemo.tm.fr/recrutement/formulaire Gemo',
            'http://www.01podcast.com  -', 'http://bit.ly/j2JoOL',
            '2 ENFANTS - http://caroline-podevin.com/',
            '(http://www.3ds.com/customer-stories/)',
            '4D S.A\n    http://www.4D.fr.\n     ITK http://www.internet-toolkit.com (outil TCP/IP',
            '''this is   a url: http://live.textkernel.nl,
and you can chat with it''', '''following address:
http://eurlex.europa.eu/LexUriServ/LexUriServ.do?uri=CONSLEG:1962R0031:20140101:EN:PDF.
salary table please refer to CEOS:
http://eurlex.europa.eu/LexUriServ/LexUriServ.do?uri=CONSLEG:1962R0031:2014 0101:EN:PDF.
'''
        ]
        return text

    @staticmethod
    def get_long_url_text():
        text = '''following links:http://jovecat.gencat.cat/ca/temes/habitatge/allotjament_per_a_estudiants/http://www.resahousing.com/enghttps://www.uniplaces.com/accommodation/barcelonahttps://www.spotahome.com/barcelona?utm_source=adwords&utm_medium=cpc&gclid=CjwKCAiAwojkBRBbEiwAeRcJZGPFbPd5kVp8nEuA24G2t5wEjruvBW1yLQz1X2On_RP5KZwLTE2gAhoCVjEQAvD_BwEhttps://erasmusu.com/en/erasmus-barcelona/student-housing
  http://www.barcelona-student-housing.com/https://www.barcelonacheckin.com/en/r/barcelona_tourism_guide/articles/top-5-student-apartmentshttps://barcelona-home.com/blog/student-housing-barcelona/https://www.nestpick.com/student-apartments-barcelona/https://www.beroomers.com/blog/tag/student-accommodation-barcelona/https://www.rentbyowner.com/listing?q=Barcelona,%20Spain&gclid=CjwKCAiAwojkBRBbEiwAeRcJZFK7fahPsYmJtxqMu5JzZT4lxBVc8kbM-QrmsE37ri0Q7zijO6HEOxoC2zsQAvD_BwEhttp://ajuntament.barcelona.cat/joves/ca/noticia/nou-impuls-al-programa-viure-i-conviure place: http://osiresidences.com/ following links:https://expanish.com/social-activities-in-barcelona-spain/ https://www.timeout.es/barcelona/es http://guia.barcelona.cat/es http://lameva.barcelona.cat/barcelonacultura/es/home http://barcelona.lecool.com/

  https://www.moveo.cat/en/diploma-course/info-international-students/
  Compensation'''
        return text