def test_treebank_span_tokenizer(self):
    """
    Test TreebankWordTokenizer.span_tokenize function
    """
    tokenizer = TreebankWordTokenizer()

    # Test case in the docstring (note: two spaces follow "(York).",
    # which the expected spans rely on)
    test1 = "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks)."
    expected = [
        (0, 4), (5, 12), (13, 17), (18, 19), (19, 23), (24, 26), (27, 30),
        (31, 32), (32, 36), (36, 37), (37, 38), (40, 46), (47, 48), (48, 51),
        (51, 52), (53, 55), (56, 59), (60, 62), (63, 68), (69, 70), (70, 76),
        (76, 77), (77, 78),
    ]
    # span_tokenize yields (start, end) pairs, so materialise the generator
    # before comparing against the expected list
    result = list(tokenizer.span_tokenize(test1))
    self.assertEqual(result, expected)

    # Test case with double quotation
    test2 = "The DUP is similar to the \"religious right\" in the United States and takes a hardline stance on social issues"
    expected = [
        (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
        (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
        (65, 68), (69, 74), (75, 76), (77, 85), (86, 92), (93, 95), (96, 102),
        (103, 109),
    ]
    result = list(tokenizer.span_tokenize(test2))
    self.assertEqual(result, expected)

    # Test case with double quotation as well as converted quotations
    test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
    expected = [
        (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
        (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
        (65, 68), (69, 74), (75, 76), (77, 79), (79, 87), (87, 89), (90, 96),
        (97, 99), (100, 106), (107, 113),
    ]
    result = list(tokenizer.span_tokenize(test3))
    self.assertEqual(result, expected)
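# Illustrative sketch (not part of the original test): the expected spans can
# be sanity-checked by slicing the input text, which reproduces the surface
# tokens produced by TreebankWordTokenizer.tokenize.
from nltk.tokenize import TreebankWordTokenizer

text = "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks)."
spans = list(TreebankWordTokenizer().span_tokenize(text))
print([text[s:e] for s, e in spans])
# ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', '(', 'York', ')', '.',
#  'Please', '(', 'buy', ')', 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']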
def test_document(self):
    """
    Test basic use of Tipster's Document class; detailed tests TODO.
    """
    # We use a naive text sample here and a plain PunktSentenceTokenizer to
    # split the sentences; a more advanced sentence splitter would be needed
    # to handle abbreviations.
    sample_txt = 'This is a sample text. It has two sentences.'
    true_toks = 'This is a sample text . It has two sentences .'.split()

    # add sentence annotations
    doc = Document(sample_txt)
    senttokenizer = PunktSentenceTokenizer()
    sents = senttokenizer.tokenize(sample_txt)
    sent_spans = senttokenizer.span_tokenize(sample_txt)
    for sent, sent_span in zip(sents, sent_spans):
        doc.annotate(sent_span[0], sent_span[1], 'sentence')

    # add token annotations
    treebanktokenizer = TreebankWordTokenizer()
    # use the sentence annotations to retrieve sentences
    for sent in doc.get_annotations_by_type('sentence'):
        toks = treebanktokenizer.tokenize(doc.get_text_by_ann(sent))
        spans = treebanktokenizer.span_tokenize(doc.get_text_by_ann(sent))
        sent_base = sent.span.start
        for tok, tok_span in zip(toks, spans):
            span = Span(tok_span[0] + sent_base, tok_span[1] + sent_base)
            doc.annotate(span.start, span.end, 'token')

    # check that all tokens are correct
    for tok, true_tok in zip(doc.get_annotations_by_type('token'), true_toks):
        self.assertEqual(doc.get_text_by_ann(tok), true_tok)
class SimpleTokenizer(BaseTokenizer):

    def __init__(self, doc, tok_anno_name='token', sent_anno_name='sentence'):
        self.doc = doc
        self.tok_anno_name = tok_anno_name
        super(SimpleTokenizer, self).__init__(self.doc, self.tok_anno_name)
        self.sent_anno_name = sent_anno_name
        self.sentTokenizer = PunktSentenceTokenizer()
        self.treebankTokenizer = TreebankWordTokenizer()

    def tokenize(self, start_pos=0, text=None):
        if text is None:
            # if text not given, assume it spans to the end of doc
            text = self.doc.get_text_by_span(
                Span(start_pos, self.doc.get_length()))

        # add sentence annotations
        sents = self.sentTokenizer.tokenize(text)
        sent_spans = self.sentTokenizer.span_tokenize(text)
        for sent, sent_span in zip(sents, sent_spans):
            self.doc.annotate(start_pos + sent_span[0],
                              start_pos + sent_span[1],
                              self.sent_anno_name)

        # use sentence annotation to retrieve sentences
        for sent in self.doc.get_annotations_by_type(self.sent_anno_name):
            toks = self.treebankTokenizer.tokenize(
                self.doc.get_text_by_ann(sent))
            spans = self.treebankTokenizer.span_tokenize(
                self.doc.get_text_by_ann(sent))
            sent_base = sent.span.start
            for tok, tok_span in zip(toks, spans):
                span = Span(tok_span[0] + sent_base, tok_span[1] + sent_base)
                self.doc.annotate(span.start, span.end, self.tok_anno_name)
class Parse(Service):

    def __init__(self, langs=[]):
        Service.__init__(self, 'parse', 'nltk', [])
        # PunktSentenceTokenizer for sentences, TreebankWordTokenizer for tokens
        self.punktSentenceTokenizer = PunktSentenceTokenizer()
        self.treebankWordTokenizer = TreebankWordTokenizer()

    def run(self, request, response):
        if request['lang'] == 'en':
            text = request['text']
            debug = request.get('debug', False)
            result = []
            for sent_s, sent_e in self.punktSentenceTokenizer.span_tokenize(text):
                tokens = []
                sentence = text[sent_s:sent_e]
                for token_s, token_e in self.treebankWordTokenizer.span_tokenize(sentence):
                    item = {'start': token_s + sent_s, 'end': token_e + sent_s}
                    if debug:
                        item['text'] = sentence[token_s:token_e]
                    tokens.append(item)
                result.append(tokens)
            return result
        else:
            raise MissingLanguage(request['lang'])

    def describe(self):
        result = super().describe()
        result['langs'] = ['en']
        result['models'] = {'en': {'pretrained': True}}
        return result
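# Illustrative standalone sketch (not from the original source; Service and
# MissingLanguage belong to the surrounding project): it replicates the nesting
# run() above produces, one list of token dicts per sentence, with token
# offsets shifted back into full-text coordinates.
from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

text = "Hello world."
result = []
for sent_s, sent_e in PunktSentenceTokenizer().span_tokenize(text):
    sentence = text[sent_s:sent_e]
    result.append([
        {'start': tok_s + sent_s, 'end': tok_e + sent_s, 'text': sentence[tok_s:tok_e]}
        for tok_s, tok_e in TreebankWordTokenizer().span_tokenize(sentence)
    ])
print(result)
# [[{'start': 0, 'end': 5, 'text': 'Hello'},
#   {'start': 6, 'end': 11, 'text': 'world'},
#   {'start': 11, 'end': 12, 'text': '.'}]]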
class SpanTokenizer:
    """
    A custom, more detailed tokenizer built on top of the NLTK toolkit.
    """

    def __init__(self):
        self.tokenizer_big = PunktSentenceTokenizer()
        self.tokenizer_small = TreebankWordTokenizer()

    def tokenize(self, text):
        result = []
        sentences_span = self.tokenizer_big.span_tokenize(text)
        for start, end in sentences_span:
            sentence = text[start:end]
            tokens_span = self.tokenizer_small.span_tokenize(sentence)
            for token_start, token_end in tokens_span:
                result.append([start + token_start, start + token_end])
        return result
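# Minimal usage sketch (assumed, not from the original source): the document-
# level spans returned by SpanTokenizer.tokenize slice the original text back
# into its tokens, even across sentence boundaries.
text = "This is a sample text. It has two sentences."
spans = SpanTokenizer().tokenize(text)
print([text[s:e] for s, e in spans])
# ['This', 'is', 'a', 'sample', 'text', '.', 'It', 'has', 'two', 'sentences', '.']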
class Tokenizer:

    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

    def tokenize(self, text):
        return self.tokenizer.tokenize(text)

    def get_token_spans(self, text):
        return self.tokenizer.span_tokenize(text)

    def tokenize_document(self, out_dir, doc_text):
        token_set = AnnotationSet("tokens")
        tokens = self.tokenize(doc_text)
        spans = self.get_token_spans(doc_text)
        ann_id = 0
        for token, span in zip(tokens, spans):
            token_set.add(Annotation(ann_id, token, "token", span[0], span[1]))
            ann_id += 1
        token_set.write_annotation_file(out_dir)
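# Minimal usage sketch (assumed; AnnotationSet and Annotation belong to the
# surrounding project): tokenize and get_token_spans are aligned pairwise, so
# zipping them pairs each surface token with its character offsets, exactly as
# tokenize_document does before writing the annotation file.
tok = Tokenizer()
doc_text = "Good muffins cost $3.88."
for token, (start, end) in zip(tok.tokenize(doc_text), tok.get_token_spans(doc_text)):
    print(token, start, end, doc_text[start:end])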
def tokenize(cls, content: str, lang="en_GB") -> List["Word"]:
    language = languages[lang]
    symbols = [",", ".", ":", "?", '"', "-", "!", "(", ")", ";"]
    words = []
    t = TreebankWordTokenizer()
    spans = t.span_tokenize(content)
    for start, end in spans:
        word_content = content[start:end]
        if word_content not in symbols:
            # split hyphenated tokens into sub-words, keeping their offsets
            sub_words = word_content.split("-")
            sub_word_start = start
            for s in sub_words:
                sub_word_end = sub_word_start + len(s)
                words.append(Word(s, sub_word_start, sub_word_end, lang))
                # advance past the hyphen so the next sub-word offset is correct
                sub_word_start = sub_word_end + 1
    return words
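# Standalone sketch of the hyphen handling above (Word, languages, and List
# belong to the snippet's own module; this check only exercises the sub-word
# offset arithmetic, assuming the hyphen-skipping advance used above):
from nltk.tokenize import TreebankWordTokenizer

content = "A well-known example."
for start, end in TreebankWordTokenizer().span_tokenize(content):
    piece = content[start:end]
    sub_start = start
    for sub in piece.split("-"):
        sub_end = sub_start + len(sub)
        print(sub, sub_start, sub_end, content[sub_start:sub_end])
        sub_start = sub_end + 1  # skip the hyphen between sub-words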
class SemEval13InductionLoader(AbstractLoader):

    def __init__(self, fpath, n_surrounding=1):
        """
        Parameters
        ----------
        fpath : str
            The file path containing the XML file
        n_surrounding : int
            The number of surrounding sentences to capture context
        """
        super(SemEval13InductionLoader, self).__init__(fpath, n_surrounding)
        self.tokenizer = TreebankWordTokenizer()
        self.sent_segmenter = nltk.data.load('tokenizers/punkt/english.pickle')

        fnames = sorted(glob.glob(fpath + "/*.xml"))
        for fname in fnames:
            tree = ET.parse(fname)
            root = tree.getroot()
            assert root.tag.lower() == "instances", "Root element must be named 'instances'"
            for instance in root:
                lexel = instance.get("lemma") + '.' + instance.get("partOfSpeech")
                iid = instance.get("id")
                start_offset = int(instance.get("tokenStart"))
                end_offset = int(instance.get("tokenEnd"))
                sentence = instance.text

                # tokenize sentences
                tok_sentence, char2tok = self.tokenize(sentence)

                head_instance = []
                tok_offset = char2tok[start_offset]
                tok_end_offset = char2tok[end_offset - 1]
                head_instance += [(t, '#', '#') for t in tok_sentence[:tok_offset]]
                head_instance.append((tok_sentence[tok_offset], lexel, iid))
                head_instance += [(t, '*', '*') for t in tok_sentence[tok_offset + 1:tok_end_offset + 1]]
                head_instance += [(t, '#', '#') for t in tok_sentence[tok_end_offset + 1:]]

                self.sent_instances.append([[], head_instance, []])

    @classmethod
    def record_keys(cls, fpath):
        keys = {}
        with open_file(fpath, 'r') as f:
            for line in f:
                l = line.strip()
                toks = l.split(' ')
                key_str = toks[1]
                keys[toks[0]] = key_str
        return keys

    @classmethod
    def write_output(cls, iid, sense):
        return "{0} {1}".format(iid, sense)

    def remove_control_characters(self, s):
        return ''.join(ch for ch in s
                       if category(ch)[0] != 'C' or category(ch) == 'Cf')

    def tokenize(self, sentence):
        tokspans = self.tokenizer.span_tokenize(sentence)
        char2tok = {}
        tokens = []
        for i, (s, e) in enumerate(tokspans):
            tokens.append(sentence[s:e])
            for j in range(s, e):
                char2tok[j] = i
        return (tokens, char2tok)
class Senseval2LSLoader(AbstractLoader):

    def __init__(self, fpath, n_surrounding=1):
        """
        Parameters
        ----------
        fpath : str
            The file path containing the XML file
        n_surrounding : int
            The number of surrounding sentences to capture context
        """
        super(Senseval2LSLoader, self).__init__(fpath, n_surrounding)

        parser = etree.XMLParser(dtd_validation=True)
        tree = ET.parse(fpath, parser)
        root = tree.getroot()
        assert root.tag.lower() == "corpus", "Root element must be named 'corpus'"

        self.tokenizer = TreebankWordTokenizer()
        self.sent_segmenter = nltk.data.load('tokenizers/punkt/english.pickle')

        for lexelt in root:
            lexel_orig = lexelt.get("item")
            print(lexel_orig, file=sys.stderr, flush=True)
            for instance in lexelt:
                # record instance ID
                iid = instance.get("id")
                context = instance.find('context')

                sentences = []
                leftstr = context.text     # everything before the head
                headelem = context[0]      # head element
                headstr = headelem.text
                # left part of sentence: left context + head
                leftstr += headstr
                sentences += [self.remove_control_characters(s)
                              for s in self.sent_segmenter.tokenize(leftstr.replace('\n', ' '))]
                sent_offset = len(sentences) - 1
                tok_char_end_offset = len(sentences[-1])  # offset after the last char of the head
                tok_char_offset = tok_char_end_offset - len(headstr)

                sats_attr = headelem.get("sats")
                rightstr = ''
                if headelem.tail is not None:
                    rightstr += headelem.tail
                lexel_sat = None
                if sats_attr is not None:
                    # satellite lexelt keeps the PoS suffix of the original item
                    lexel_sat = sats_attr.split(' ')[0].split('.')[0] + '.' + lexel_orig.split('.')[-1]
                for i in range(1, len(context)):
                    rightstr += context[i].text
                    rightstr += context[i].tail
                righttoks = [self.remove_control_characters(s)
                             for s in self.sent_segmenter.tokenize(rightstr.replace('\n', ' '))]
                if len(righttoks) > 0:
                    sentences[-1] += righttoks[0]
                    sentences += righttoks[1:]

                # tokenize sentences
                tok_sentences, char2toks = zip(*[self.tokenize(sen) for sen in sentences])

                # create instances
                left_instance = []
                left_begin = max(0, sent_offset - n_surrounding) if n_surrounding >= 0 else 0
                for i in range(left_begin, sent_offset):
                    left_instance += [(t, '#', '#') for t in tok_sentences[i]] + [("[SEP]", '#', '#')]

                head_instance = []
                tok_offset = char2toks[sent_offset][tok_char_offset]
                tok_end_offset = char2toks[sent_offset][tok_char_end_offset - 1]
                head_instance += [(t, '#', '#') for t in tok_sentences[sent_offset][:tok_offset]]
                lexel = lexel_sat if lexel_sat is not None else lexel_orig
                head_instance.append((tok_sentences[sent_offset][tok_offset], lexel, iid))
                head_instance += [(t, '*', '*') for t in tok_sentences[sent_offset][tok_offset + 1:tok_end_offset + 1]]
                head_instance += [(t, '#', '#') for t in tok_sentences[sent_offset][tok_end_offset + 1:]]

                right_instance = []
                right_end = min(sent_offset + n_surrounding + 1, len(sentences))
                for i in range(sent_offset + 1, right_end):
                    right_instance += [("[SEP]", '#', '#')] + [(t, '#', '#') for t in tok_sentences[i]]

                self.sent_instances.append([left_instance, head_instance, right_instance])

    @classmethod
    def record_keys(cls, fpath):
        keys = {}
        with open_file(fpath, 'r') as f:
            for line in f:
                l = line.strip()
                toks = l.split(' ')
                iid = toks[1]
                key_strs = [t for t in toks[2:] if (t != 'P' and t != 'U')]
                key_str = 'U' if len(key_strs) == 0 else key_strs[0]
                keys[iid] = key_str
        return keys

    @classmethod
    def write_output(cls, iid, sense):
        return "{0} {1} {2}".format(iid.split('.')[0], iid, sense)

    def remove_control_characters(self, s):
        return ''.join(ch for ch in s
                       if category(ch)[0] != 'C' or category(ch) == 'Cf')

    def tokenize(self, sentence):
        tokspans = self.tokenizer.span_tokenize(sentence)
        char2tok = {}
        tokens = []
        for i, (s, e) in enumerate(tokspans):
            tokens.append(sentence[s:e])
            for j in range(s, e):
                char2tok[j] = i
        return (tokens, char2tok)
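# Standalone sketch (not from the original source) of the char2tok mapping that
# both loaders build in tokenize(): every character position inside a token
# maps to that token's index, so a character offset of a head word can be
# converted into a token offset.
from nltk.tokenize import TreebankWordTokenizer

sentence = "The bank approved the loan."
char2tok, tokens = {}, []
for i, (s, e) in enumerate(TreebankWordTokenizer().span_tokenize(sentence)):
    tokens.append(sentence[s:e])
    for j in range(s, e):
        char2tok[j] = i
print(tokens)       # ['The', 'bank', 'approved', 'the', 'loan', '.']
print(char2tok[4])  # 1 -> character 4 ('b') lies inside token 'bank'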