# Deferred annotation evaluation lets the Token/Sentence sketches defined
# further down appear in signatures above their definitions.
from __future__ import annotations

import glob
import os
import re
from typing import List

# The original snippet omits its imports; the package choices below are
# assumptions that fit the era (stanfordnlp's CoreNLP client and the
# pytorch_pretrained_bert tokenizers). `config` is a project module whose
# `bert_model` attribute names the BERT checkpoint.
from pytorch_pretrained_bert.tokenization import BasicTokenizer, BertTokenizer
from stanfordnlp.server import CoreNLPClient

import config


# Initial Tokenizer: trusts CoreNLP's character offsets for sentence spans,
# then aligns BERT basic tokens to those spans.
class Tokenizer:
    def __init__(self) -> None:
        os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(
            os.environ['HOME'])
        self.client = CoreNLPClient(annotators=['ssplit'])
        self.client.ensure_alive()
        # Uncased BERT checkpoints expect lower-cased input.
        self.do_lower_case = '-cased' not in config.bert_model
        self.basic_tokenizer: BasicTokenizer = BertTokenizer.from_pretrained(
            config.bert_model, do_lower_case=self.do_lower_case).basic_tokenizer

    def tokenize(self, doc: str) -> List[List[Token]]:
        corenlp_annotation = self.client.annotate(doc)
        sentences = []
        for sentence in corenlp_annotation.sentence:
            text = doc[sentence.characterOffsetBegin:sentence.characterOffsetEnd]
            if self.do_lower_case:
                text = text.lower()
            offset = sentence.characterOffsetBegin
            bert_tokens = self.basic_tokenizer.tokenize(text)
            begin = 0
            tokens = []
            for bert_token in bert_tokens:
                word = bert_token
                # Re-find each BERT token in the sentence text to recover its
                # document-level character span; index() raises if the basic
                # tokenizer altered characters (e.g. stripped accents).
                begin = text.index(word, begin)
                end = begin + len(word)
                tokens.append(Token(word, begin + offset, end + offset))
                begin = end
            if len(tokens) > 0:
                sentences.append(tokens)
        return sentences
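# Both Tokenizer variants in this file construct `Token` objects, but the
# class itself is defined elsewhere in the project. The definition below is a
# minimal sketch of what the call sites assume (a word plus its character span
# in the original document), not the project's actual class.
class Token:
    def __init__(self, word: str, begin: int, end: int) -> None:
        self.word = word    # token text (lower-cased when the model is uncased)
        self.begin = begin  # offset of the first character in the document
        self.end = end      # offset one past the last character

    def __repr__(self) -> str:
        return 'Token({!r}, {}, {})'.format(self.word, self.begin, self.end)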
# Revised Tokenizer (supersedes the version above): recomputes sentence spans
# from each token's originalText, repairs bad sentence splits, and only then
# aligns BERT basic tokens.
class Tokenizer:
    def __init__(self) -> None:
        os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(
            os.environ['HOME'])
        self.client = CoreNLPClient()
        self.client.ensure_alive()
        # Uncased BERT checkpoints expect lower-cased input.
        self.do_lower_case = '-cased' not in config.bert_model
        self.basic_tokenizer: BasicTokenizer = BertTokenizer.from_pretrained(
            config.bert_model, do_lower_case=self.do_lower_case).basic_tokenizer

    def __del__(self) -> None:
        # The client leaves per-server property files behind; clean them up.
        for p in glob.glob('corenlp_server-*.props'):
            if os.path.isfile(p):
                os.remove(p)

    def tokenize(self, doc: str) -> List[Sentence]:
        # Disable PTB escaping and keep the annotation invertible so that each
        # token's originalText can be matched verbatim in `doc`.
        splitter_annotation = self.client.annotate(
            doc, annotators=['ssplit'],
            properties={'tokenize.options': 'ptb3Escaping=false,invertible=true'})
        # Recover each sentence's character span from its first and last token.
        end = 0
        sentences = []
        for sentence in splitter_annotation.sentence:
            begin = doc.index(sentence.token[0].originalText, end)
            for token in sentence.token:
                end = doc.index(token.originalText, end) + len(token.originalText)
            text = doc[begin:end]
            sentences.append(Sentence(text, begin, end))
        sentences = self.fix_split(sentences)
        # Align BERT basic tokens with character offsets inside each sentence.
        for sentence in sentences:
            text = sentence.text
            if self.do_lower_case:
                text = text.lower()
            bert_tokens = self.basic_tokenizer.tokenize(text)
            end = 0
            tokens = []
            for bert_token in bert_tokens:
                word = bert_token
                begin = text.index(word, end)
                end = begin + len(word)
                tokens.append(
                    Token(word, sentence.begin + begin, sentence.begin + end))
            assert len(tokens) > 0
            sentence.tokens = tokens
        return sentences

    @staticmethod
    def fix_split(sentences: List[Sentence]) -> List[Sentence]:
        # CoreNLP sometimes splits in the wrong place: break sentences at blank
        # lines, and merge consecutive sentences that touch with no gap.
        result = []
        i = 0
        while i < len(sentences):
            sentence = sentences[i]
            while True:
                next_sentence = sentences[
                    i + 1] if i < len(sentences) - 1 else None
                if '\n\n' in sentence.text:
                    # Split the sentence at the paragraph break.
                    index = sentence.text.index('\n\n')
                    new_sentence = Sentence(sentence.text[:index],
                                            sentence.begin,
                                            sentence.begin + index)
                    result.append(new_sentence)
                    # Skip the whitespace run that follows the break.
                    index += re.search(r'[\n\t ]+', sentence.text[index:]).end()
                    sentence.text = sentence.text[index:]
                    sentence.begin += index
                elif next_sentence is not None and next_sentence.begin == sentence.end:
                    # Merge sentences that were split without any gap between them.
                    sentence.text += next_sentence.text
                    sentence.end = next_sentence.end
                    i += 1
                else:
                    result.append(sentence)
                    break
            i += 1
        return result
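# `Sentence` is likewise defined elsewhere in the project; the revised
# tokenize() requires a mutable record with the sentence text, its character
# span, and a tokens list filled in after fix_split(). A minimal sketch of
# that assumption:
class Sentence:
    def __init__(self, text: str, begin: int, end: int) -> None:
        self.text = text    # sentence text sliced from the document
        self.begin = begin  # offset of the sentence start in the document
        self.end = end      # offset one past the sentence end
        self.tokens: List[Token] = []


# Hypothetical usage, assuming a stanford-corenlp-full-2018-10-05 install
# under $HOME and a config.bert_model such as 'bert-base-cased':
if __name__ == '__main__':
    tokenizer = Tokenizer()
    doc = 'Alice met Bob in Paris.\n\nThey talked for an hour.'
    for sentence in tokenizer.tokenize(doc):
        print(sentence.begin, sentence.end, repr(sentence.text))
        for token in sentence.tokens:
            print('  ', token)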