def tag_phrase(self, phrase, skip_sentences_longer_than):
    """Tokenize *phrase* and tag every word, expanding ``{m.xxx}`` entity markers.

    A marker is the three-token sequence ``{``, ``m.xxx[:...]``, ``}`` produced by
    the tokenizer.  For each marker the entity is looked up by its MID, its name
    is tokenized and each name token is tagged with the entity's (comma-joined)
    type tags; every other token is tagged ``'O'``.  The marker text is also
    substituted with the entity name inside *phrase* itself.

    :param phrase: raw sentence text, possibly containing ``{m.xxx}`` markers.
    :param skip_sentences_longer_than: token-count threshold; exceeding it raises.
    :returns: tuple ``(tagged_words, phrase)`` where ``tagged_words`` is a list of
        ``(token, tag)`` pairs and ``phrase`` has entity markers replaced by names.
    :raises TooLongException: if the tagged sentence exceeds the threshold.
    :raises ValueError: propagated from the entity lookup when a MID is unknown.
    """
    tagged_words = []
    skip_token_mid = False  # skip the 'm.xxx' token after a consumed marker
    skip_token_end = False  # skip the closing '}' token after a consumed marker
    words = tokenizer.tokenize_in_words(phrase)
    words_len = len(words)
    for idx, word in enumerate(words):
        if skip_token_mid:
            skip_token_mid = False
            continue
        if skip_token_end:
            skip_token_end = False
            continue
        if word == '{':
            if idx + 1 < words_len and words[idx + 1].startswith('m.'):
                mid = words[idx + 1].split(':')[0]
                if idx + 2 < words_len and words[idx + 2] == '}':
                    entity_mid, entity_name, entity_types = get_all_entity_properties_by_id(
                        mid)
                    entity_tag = self.tagger.tag(entity_types)
                    entity_tag = ','.join(entity_tag)
                    entity_name_list = tokenizer.tokenize_in_words(
                        entity_name)
                    for entity_part in entity_name_list:
                        tagged_words.append((entity_part, entity_tag))
                    word_to_replace = '{' + words[idx + 1] + '}'
                    # BUG FIX: str.replace returns a new string; the original
                    # code discarded the result, so the returned phrase still
                    # contained the raw {m.xxx} marker.  Rebind phrase here.
                    phrase = phrase.replace(word_to_replace, entity_name)
                    skip_token_mid = True
                    skip_token_end = True
                else:
                    # rare case: '{ m.xxx' without a closing '}' — tag literally
                    tagged_words.append((word, 'O'))
            else:
                tagged_words.append((word, 'O'))
        else:
            tagged_words.append((word, 'O'))
    tagged_words_len = len(tagged_words)
    if tagged_words_len > skip_sentences_longer_than:
        raise TooLongException(
            'Too long sentence. Found {} tokens.'.format(tagged_words_len))
    return tagged_words, phrase
def tag_triple(e1, relation, e2):
    """Tag a (subject, relation, object) triple with BIO labels.

    Both entities get ``'B'`` on their first token and ``'I'`` on the rest;
    every relation token gets ``'O'``.

    :returns: list of ``(token, tag)`` pairs covering e1, relation, e2 in order.
    """
    def _entity_bio(text):
        # First token of an entity opens the span ('B'), the rest continue it ('I').
        tokens = tokenizer.tokenize_in_words(text)
        return [(tok, 'B' if pos == 0 else 'I') for pos, tok in enumerate(tokens)]

    tagged = _entity_bio(e1)
    tagged.extend((tok, 'O') for tok in tokenizer.tokenize_in_words(relation))
    tagged.extend(_entity_bio(e2))
    return tagged
def predict_sentence(self, sentence):
    """Split *sentence* into word tokens and delegate to the tokenized predictor."""
    word_tokens = tokenizer.tokenize_in_words(sentence)
    return self.predict_tokenized_sentence(word_tokens)
def __iter__(self):
    """Yield one token list per line of ``self.filepath``.

    Uses the project tokenizer when ``self.use_tokenizer`` is set, otherwise a
    plain whitespace split (faster but less accurate).

    BUG FIX: the original opened the file without ever closing it, leaking the
    handle until garbage collection; a ``with`` block closes it deterministically
    when the generator is exhausted or closed.
    """
    with open(self.filepath) as source:
        for line in source:
            if self.use_tokenizer:
                yield tokenizer.tokenize_in_words(line)
            else:
                yield line.split()  # faster but not accurate