def main():
    args = get_args()
    trxml_miner = TRXMLMiner(TEXT_FIELD)

    if os.path.isdir(args.input):
        files = [
            os.path.join(args.input, file)
            for file in os.listdir(args.input)
        ]
    elif os.path.isfile(args.input):
        files = [args.input]
    else:
        raise ValueError('Input does not exist')

    model, vocab_to_ids = _load_model_and_vocab(args)
    tokenizer = Tokenizer()

    for file in files:
        selected_value = list(trxml_miner.mine(file))
        input_text = selected_value[0]['values'][TEXT_FIELD]
        data = _input_text_to_pad_id(input_text, vocab_to_ids, tokenizer)
        result = model(data)
        probabilities = result['probabilities'][0]
        print(file, probabilities)
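The helpers _load_model_and_vocab and _input_text_to_pad_id are not shown in this section. The sketch below is only one plausible shape of the padding step, with a hypothetical name and hypothetical max_len/pad/OOV parameters, not the project's actual implementation.

# Hypothetical sketch of a padding helper (name, max_len, pad_id and oov_id
# are assumptions; the real _input_text_to_pad_id is not shown here):
# tokenize, map tokens to vocabulary ids with an OOV fallback, then pad or
# truncate to a fixed length.
def text_to_padded_ids(text, vocab_to_ids, tokenizer, max_len=512,
                       pad_id=0, oov_id=1):
    token_ids = [vocab_to_ids.get(token, oov_id)
                 for token in tokenizer.tokenize(text)]
    token_ids = token_ids[:max_len]
    return token_ids + [pad_id] * (max_len - len(token_ids))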
class TokenTrieTestCases(TestCase):
    def setUp(self):
        self.gz_file = 'tests/resource/gazetteer.txt'
        self.text = 'ab Foo bar Foo foo chao foo\nBar foo bar foo foo'
        self.tokenizer = Tokenizer()

    def test_append_list_to_local_trie(self):
        token_trie = TokenTrie(patterns=None)

        patterns = TokenizedPattern(["a", "b", "c", "d"])
        self.assertEqual(
            token_trie._append_token_list_to_trie(patterns),
            {'a': {'b': {'c': {'d': {'xxENDxx': ('', None)}}}}}
        )

        patterns = TokenizedPattern(["a"])
        self.assertEqual(
            token_trie._append_token_list_to_trie(patterns),
            {'a': {'xxENDxx': ('', None)}}
        )

        patterns = TokenizedPattern([])
        self.assertEqual(
            token_trie._append_token_list_to_trie(patterns),
            {'xxENDxx': ('', None)}
        )

    def test_build_from_patterns(self):
        patterns = ["foo bar", "foo", "foo bar foo", "bar bar foo",
                    "bar foo foo"]
        tokenized_patterns = (
            TokenizedPattern(self.tokenizer.tokenize(normalize(pattern)),
                             pattern)
            for pattern in patterns
        )
        token_trie = TokenTrie(patterns=tokenized_patterns)
        self.assertEqual(
            token_trie.token_trie,
            {
                'bar': {
                    'bar': {'foo': {'xxENDxx': ('bar bar foo', None)}},
                    'foo': {'foo': {'xxENDxx': ('bar foo foo', None)}}
                },
                'foo': {
                    'bar': {'foo': {'xxENDxx': ('foo bar foo', None)},
                            'xxENDxx': ('foo bar', None)},
                    'xxENDxx': ('foo', None)
                }
            }
        )

    def test_build_from_repeated_pattern(self):
        patterns = ['b tree', 'b - tree', 'B - tree', 'AS/400', 'as 400']
        tokenized_patterns = (
            TokenizedPattern(self.tokenizer.tokenize(normalize(pattern)),
                             pattern)
            for pattern in patterns
        )
        token_trie = TokenTrie(patterns=tokenized_patterns)
        self.assertEqual(
            token_trie.token_trie,
            {
                'b': {
                    'tree': {'xxENDxx': ('b tree', None)},
                    '-': {'tree': {'xxENDxx': ('B - tree', None)}}
                },
                'as': {
                    '/': {'400': {'xxENDxx': ('AS/400', None)}},
                    '400': {'xxENDxx': ('as 400', None)}
                }
            }
        )

    def test_match_at_position(self):
        patterns = PatternsGZ(self.tokenizer, self.gz_file)
        token_trie = TokenTrie(patterns=patterns.tokenized_pattern)
        tokens = self.tokenizer.tokenize_with_pos_info(normalize(self.text))

        self.assertEqual(
            [
                [token.text for token in match.tokens]
                for match in token_trie.match_at_position(
                    token_trie.token_trie, tokens[1:])
            ],
            [['foo', 'bar'], ['foo', 'bar', 'foo']]
        )
        self.assertEqual(
            [
                [token.text for token in match.tokens]
                for match in token_trie.match_at_position(
                    token_trie.token_trie, tokens[2:])
            ],
            [['bar', 'foo', 'foo']]
        )
        self.assertEqual(
            [
                [token.text for token in match.tokens]
                for match in token_trie.match_at_position(
                    token_trie.token_trie, tokens[3:])
            ],
            []
        )

    def test_longest_match_at_position(self):
        patterns = PatternsGZ(self.tokenizer, self.gz_file)
        token_trie = TokenTrie(patterns=patterns.tokenized_pattern)
        tokens = self.tokenizer.tokenize_with_pos_info(normalize(self.text))

        self.assertEqual(
            [
                token.text
                for token in token_trie.longest_match_at_position(
                    token_trie.token_trie, tokens[1:]).tokens
            ],
            ['foo', 'bar', 'foo']
        )
        self.assertEqual(
            [
                token.text
                for token in token_trie.longest_match_at_position(
                    token_trie.token_trie, tokens[2:]).tokens
            ],
            ['bar', 'foo', 'foo']
        )
        self.assertEqual(
            [
                token.text
                for token in token_trie.longest_match_at_position(
                    token_trie.token_trie, tokens[3:])
            ],
            []
        )
class Matcher():
    '''
    Matcher finds all matched phrases from the input text.

    It contains
    - an internal tokenizer applied on both patterns and input text
    - a token trie structure created from a normtable, gazetteer or codetable
    - a dictionary to map codeID to code_description and code_category

    Params:
    - normtable: normalized table file in json format
    - gazetteer: gazetteer file
    - codetable: taxonomy codetable format file
    - blacklist: blacklist file
    - with_context: also output the context if set to True
    '''

    # this magic number reflects the average context length in the
    # training data
    CONTEXT_LENGTH = 14

    def __init__(
            self,
            normtable=None,
            gazetteer=None,
            codetable=None,
            blacklist=None,
            regexp=None,
            with_context=False
    ):
        self.regexp = regexp
        self.tokenizer = Tokenizer(self.regexp)
        self.blacklist = dict()
        self.with_context = with_context

        if normtable:
            match_patterns = PatternsNT(self.tokenizer, normtable)
        elif gazetteer:
            match_patterns = PatternsGZ(self.tokenizer, gazetteer)
        elif codetable:
            match_patterns = PatternsCT(self.tokenizer, codetable)
        else:
            raise Exception(
                'a source file is required to build a Matcher object')

        self.code_property_mapping = match_patterns.codeid_description
        self.meta_info = match_patterns.meta_info

        if blacklist:
            self.blacklist = data_utils.from_file_to_list(blacklist)

        self.trie_matcher = TokenTrie(
            patterns=match_patterns.tokenized_pattern
        )

    def matching(self, text):
        '''
        find all matching phrases from the input text

        params:
        - text: string

        output:
        - all matching phrases as MatchedPhrase objects
        '''
        tokens = self.tokenizer.tokenize_with_pos_info(text)
        for token in tokens:
            token.text = data_utils.normalize(token.text)

        idx = 0
        nr_tokens = len(tokens)
        while idx < nr_tokens:
            local_match = self.trie_matcher.longest_match_at_position(
                self.trie_matcher.token_trie, tokens[idx:])
            if local_match:
                start_pos, end_pos = local_match.text_range()
                left_context, right_context = self.prepare_context(
                    tokens, local_match, idx, text)
                surface_form = local_match.surface_form
                matched_text = text[start_pos:end_pos]
                yield MatchedPhrase(
                    surface_form,
                    matched_text,
                    start_pos,
                    end_pos - 1,  # inclusive end, for the entity framework (in Perl)
                    local_match.code_id,
                    self.code_id_property_lookup(local_match.code_id, 'desc'),
                    self.code_id_property_lookup(local_match.code_id, 'type'),
                    left_context,
                    right_context,
                    self.code_id_property_lookup(
                        local_match.code_id,
                        'skill_likelihoods',
                        dict()
                    ).get(data_utils.normalize(surface_form), None)
                )
                idx += len(local_match.tokens)
            else:
                idx += 1

    def prepare_context(self, tokens, local_match, idx, text):
        l_context = ''
        r_context = ''
        if self.with_context:
            nr_matched_tokens = len(local_match.tokens)
            l_context_begin = max(0, idx - self.CONTEXT_LENGTH)
            l_context_end = idx
            r_context_begin = idx + nr_matched_tokens
            r_context_end = min(
                len(tokens),
                r_context_begin + self.CONTEXT_LENGTH
            )
            if l_context_begin < l_context_end:
                l_context = text[tokens[l_context_begin].start_pos:
                                 tokens[l_context_end - 1].end_pos]
            if r_context_begin < r_context_end:
                r_context = text[tokens[r_context_begin].start_pos:
                                 tokens[r_context_end - 1].end_pos]
        return l_context, r_context

    def code_id_property_lookup(self, code_id, property_name, default=None):
        code_property = default
        if code_id is not None:
            if code_id in self.code_property_mapping:
                code_property = self.code_property_mapping[code_id].get(
                    property_name, default)
            else:
                LOGGER.warning(
                    'WARNING: no property {} for codeid: {}'.format(
                        property_name, code_id)
                )
        return code_property
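A minimal usage sketch for the Matcher above, assuming the gazetteer fixture already used by the tests in this section; MatchedPhrase's field names are defined elsewhere in the project, so the example just prints each yielded match.

# Minimal usage sketch (assumes the test gazetteer shipped with this repo).
# matching() is a generator, so iterate to drain it; each item is a
# MatchedPhrase whose exact fields are defined elsewhere in the project.
matcher = Matcher(gazetteer='tests/resource/gazetteer.txt', with_context=True)
text = 'ab Foo bar Foo foo chao foo\nBar foo bar foo foo'
for matched_phrase in matcher.matching(text):
    print(matched_phrase)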
import re

from easy_tokenizer.normalizer import normalize_chars
from easy_tokenizer.tokenizer import Tokenizer
from easy_tokenizer.patterns import Patterns

HAS_TOKEN_REGEXP = re.compile(r'\w')
TOKEN_REGEXP = re.compile(r'\w+|[^\w\s]+')
TOKENIZER = Tokenizer()


def tokenize(string):
    '''tokenize string, and return the list of normalized tokens'''
    # declare TOKENIZER as global: the fallback assignment below would
    # otherwise make it a local name and rebuild the tokenizer on every call
    global TOKENIZER
    string = normalize_chars(string)
    try:
        TOKENIZER  # NOQA
    except NameError:
        TOKENIZER = Tokenizer()
    return [_norm_token(token) for token in TOKENIZER.tokenize(string)]


def _norm_token(token):
    norm_token = token.upper()
    if Patterns.URL_RE.fullmatch(token):
        norm_token = 'xxURLxx'
    elif Patterns.EMAIL_RE.fullmatch(token):
        norm_token = 'xxEMAILxx'
    elif Patterns.DOMAIN_RE.fullmatch(token):
        norm_token = 'xxAT_DOMAINxx'
    elif Patterns.YEAR_RE.fullmatch(token):
        norm_token = 'xxYEARxx'
    return norm_token
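A small usage sketch of tokenize(); the exact placeholder substitutions depend on the regular expressions in easy_tokenizer.patterns, so the behaviour described in the comment is indicative rather than guaranteed.

# Usage sketch: tokens are upper-cased, and URLs, e-mail addresses, domains
# and years are collapsed into xxURLxx / xxEMAILxx / xxAT_DOMAINxx / xxYEARxx
# placeholders (exact behaviour depends on the Patterns regexes).
if __name__ == '__main__':
    print(tokenize('Visit http://live.textkernel.nl for more details'))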
class TokenizerTestCases(TestCase):
    def setUp(self):
        self.tokenizer = Tokenizer()

    def test_simple_sentence(self):
        text = ['this is a normal sentence', 'this is a normal-sentence']
        expected_tokens = [['this', 'is', 'a', 'normal', 'sentence'],
                           ['this', 'is', 'a', 'normal', '-', 'sentence']]
        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_char_with_hat(self):
        text = [
            'Alp NADİ\nCoordination potential bidders.',
            'Hurmoğlu, Botaş BTC, 2002\nCoordination of packages.'
        ]
        expected_tokens = [[
            'Alp', 'NADİ', 'Coordination', 'potential', 'bidders', '.'
        ], [
            'Hurmoğlu', ',', 'Botaş', 'BTC', ',', '2002', 'Coordination',
            'of', 'packages', '.'
        ]]
        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_tokenize_hyphen(self):
        text = [
            'this is a normal-sentence',
            'he is my ex-boyfriend',
            'Sociale Option PME-PMI'
        ]
        expected_tokens = [['this', 'is', 'a', 'normal', '-', 'sentence'],
                           ['he', 'is', 'my', 'ex-boyfriend'],
                           ['Sociale', 'Option', 'PME', '-', 'PMI']]
        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_tokenize_slash(self):
        text = ['Programmation C/C++/Java/PHP/Perl/JSP.']
        expected_tokens = [[
            'Programmation', 'C', '/', 'C++', '/', 'Java', '/', 'PHP', '/',
            'Perl', '/', 'JSP.'
        ]]
        for index in range(len(expected_tokens)):
            tokens = list(self.tokenizer.tokenize(text[index]))
            self.assertEqual(tokens, expected_tokens[index])

    def test_tokenize_phrase(self):
        text = [
            'office(XP)', '(ex)girlfriend', 'Happy(12)Birthday',
            'een#exception', 'AS/400'
        ]
        expected_tokens = [["office", "(", "XP", ")"],
                           ["(", "ex", ")", "girlfriend"],
                           ["Happy", "(", "12", ")", "Birthday"],
                           ['een#exception'],
                           ['AS', '/', '400']]
        for index in range(len(expected_tokens)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_old_tokenizer_cases(self):
        text = [
            "een mooie test zin.",
            "foo...",
            "foo...bar",
            "l'agence",
            "d'origine",
            "foo,bar",
            "foo.bar",
            "Aug. foo.",
            '[email protected] [email protected] [email protected]',
            'http://www.viadeo.com/profile/34mies',
            '..... -----',
            '.net c++ F#',
            '$100,000.99 10.12 10,12 10.123 10,123 10.000,12',
            '1st',
            'S.I.M. S I M S. I. M. SIM.',
            'end of "phrase".'
        ]
        expected_tokens = [
            ['een', 'mooie', 'test', 'zin', '.'],
            ['foo', '.', '.', '.'],
            ['foo', '.', '.', '.', 'bar'],
            ["l", "'", "agence"],
            ["d", "'", "origine"],
            ['foo', ',', 'bar'],
            ['foo.bar'],
            ['Aug.', 'foo', '.'],
            ['*****@*****.**', '*****@*****.**', '*****@*****.**'],
            ['http://www.viadeo.com/profile/34mies'],
            ['.....', '-----'],
            ['.net', 'c++', 'F#'],
            [
                '$', '100,000.99', '10.12', '10,12', '10.123', '10,123',
                '10.000,12'
            ],
            ['1st'],
            ['S.I.M.', 'S', 'I', 'M', 'S.', 'I.', 'M.', 'SIM.'],
            ['end', 'of', '"', 'phrase', '"', '.']
        ]
        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_digits_extraction(self):
        text = ['BSS (S4d0, S4D1), VOIP']
        expected_tokens = [['BSS', '(', 'S4d0', ',', 'S4D1', ')', ',',
                            'VOIP']]
        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_dot_comma_extraction(self):
        text = ['2000 a dec.01', 'HP-UX, C++, BASIC', 'java, c++, python']
        expected_tokens = [['2000', 'a', 'dec.', '01'],
                           ['HP-UX', ',', 'C++', ',', 'BASIC'],
                           ['java', ',', 'c++', ',', 'python']]
        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    def test_url_email(self):
        text = '''http://live.textkernel.nl", [email protected] and this is another one, here https://www.damienpontifex.com/ch/05/06/image-classifier/ or try this one https://docs.google.com/document/d/1pd/edit?ts=5da580bc'''
        self.assertEqual(self.tokenizer.tokenize(text), [
            'http://live.textkernel.nl', '"', ',', '*****@*****.**', 'and',
            'this', 'is', 'another', 'one', ',', 'here',
            'https://www.damienpontifex.com/ch/05/06/image-classifier/',
            'or', 'try', 'this', 'one',
            'https://docs.google.com/document/d/1pd/edit?ts=5da580bc'
        ])

    def test_email(self):
        text = ["e-mail : [email protected]\ngsm"]
        self.assertEqual(
            [self.tokenizer.tokenize(phrase) for phrase in text],
            [['e-mail', ':', '*****@*****.**', 'gsm']])

    def test_url_extraction(self):
        text = self.get_url_text()
        expected_tokens = [[
            'REFERENCES', 'http://www.erdfdistribution.fr',
            'http://www.idtgv.com', 'http://www.edf-bleuciel.fr',
            'idnight.idtgv.com', 'http://investisseurs.edf.com',
            'http://www.metaxa.com', 'ina.edf.com',
            'http://www.hutchinson.fr', 'http://www.lagardere.com',
            'http://www.direxi.fr',
            'http://www.fondation-jeanluclagardere.com',
            'http://www.blog-des-astucieuses.fr', 'http://www.investir.fr',
            'http://www.clubnewsparis.com', 'http://www.cointreau.fr'
        ], [
            '1', '/', '2', 'Cecile', 'NGUYEN',
            'http://cecile.nguyen.site.voila.fr/', '7'
        ], [
            'lien', 'suivant',
            'http://www.gemo.tm.fr/recrutement/formulaire', 'Gemo'
        ], [
            'http://www.01podcast.com', '-'
        ], [
            'http://bit.ly/j2JoOL'
        ], [
            '2', 'ENFANTS', '-', 'http://caroline-podevin.com/'
        ], [
            '(', 'http://www.3ds.com/customer-stories/', ')'
        ], [
            '4D', 'S.A', 'http://www.4D.fr', '.', 'ITK',
            'http://www.internet-toolkit.com', '(', 'outil', 'TCP', '/', 'IP'
        ], [
            'this', 'is', 'a', 'url', ':', 'http://live.textkernel.nl', ',',
            'and', 'you', 'can', 'chat', 'with', 'it'
        ]]
        for index in range(len(expected_tokens)):
            tokens = list(self.tokenizer.tokenize(text[index]))
            self.assertEqual(tokens, expected_tokens[index])

    def test_long_url(self):
        text = self.get_long_url_text()
        tokens = list(self.tokenizer.tokenize(text))
        # print("\n\n".join(tokens))
        self.assertEqual(len(tokens), 26)

    def test_word_concatenate_nonword(self):
        text = [
            "**Français : Niveau élevé",
            "*Français",
            "*Anglais. (Scolaire)",
            "Arabe...",
            "Arabe..",
            "#net",
            "??Arabe",
            "'abc', why!!!"
        ]
        expected_tokens = [['*', '*', 'Français', ':', 'Niveau', 'élevé'],
                           ['*', 'Français'],
                           ['*', 'Anglais', '.', '(', 'Scolaire', ')'],
                           ['Arabe', '.', '.', '.'],
                           ['Arabe', '.', '.'],
                           ['#net'],
                           ['?', '?', 'Arabe'],
                           ['\'', 'abc', '\'', ',', 'why', '!', '!', '!']]
        for index in range(len(text)):
            self.assertEqual(list(self.tokenizer.tokenize(text[index])),
                             expected_tokens[index])

    @staticmethod
    def get_url_text():
        text = [
            '''REFERENCES http://www.erdfdistribution.fr http://www.idtgv.com http://www.edf-bleuciel.fr idnight.idtgv.com http://investisseurs.edf.com http://www.metaxa.com ina.edf.com http://www.hutchinson.fr http://www.lagardere.com http://www.direxi.fr http://www.fondation-jeanluclagardere.com http://www.blog-des-astucieuses.fr http://www.investir.fr http://www.clubnewsparis.com http://www.cointreau.fr''',
            '1 / 2 Cecile NGUYEN http://cecile.nguyen.site.voila.fr/ 7',
            'lien suivant http://www.gemo.tm.fr/recrutement/formulaire Gemo',
            'http://www.01podcast.com -',
            'http://bit.ly/j2JoOL',
            '2 ENFANTS - http://caroline-podevin.com/',
            '(http://www.3ds.com/customer-stories/)',
            '4D S.A\n http://www.4D.fr.\n ITK http://www.internet-toolkit.com (outil TCP/IP',
            '''this is a url: http://live.textkernel.nl, and you can chat with it''',
            '''following address: http://eurlex.europa.eu/LexUriServ/LexUriServ.do?uri=CONSLEG:1962R0031:20140101:EN:PDF. salary table please refer to CEOS: http://eurlex.europa.eu/LexUriServ/LexUriServ.do?uri=CONSLEG:1962R0031:2014 0101:EN:PDF. '''
        ]
        return text

    @staticmethod
    def get_long_url_text():
        text = '''following links:http://jovecat.gencat.cat/ca/temes/habitatge/allotjament_per_a_estudiants/http://www.resahousing.com/enghttps://www.uniplaces.com/accommodation/barcelonahttps://www.spotahome.com/barcelona?utm_source=adwords&utm_medium=cpc&gclid=CjwKCAiAwojkBRBbEiwAeRcJZGPFbPd5kVp8nEuA24G2t5wEjruvBW1yLQz1X2On_RP5KZwLTE2gAhoCVjEQAvD_BwEhttps://erasmusu.com/en/erasmus-barcelona/student-housing http://www.barcelona-student-housing.com/https://www.barcelonacheckin.com/en/r/barcelona_tourism_guide/articles/top-5-student-apartmentshttps://barcelona-home.com/blog/student-housing-barcelona/https://www.nestpick.com/student-apartments-barcelona/https://www.beroomers.com/blog/tag/student-accommodation-barcelona/https://www.rentbyowner.com/listing?q=Barcelona,%20Spain&gclid=CjwKCAiAwojkBRBbEiwAeRcJZFK7fahPsYmJtxqMu5JzZT4lxBVc8kbM-QrmsE37ri0Q7zijO6HEOxoC2zsQAvD_BwEhttp://ajuntament.barcelona.cat/joves/ca/noticia/nou-impuls-al-programa-viure-i-conviure place: http://osiresidences.com/ following links:https://expanish.com/social-activities-in-barcelona-spain/ https://www.timeout.es/barcelona/es http://guia.barcelona.cat/es http://lameva.barcelona.cat/barcelonacultura/es/home http://barcelona.lecool.com/ https://www.moveo.cat/en/diploma-course/info-international-students/ Compensation'''
        return text