Example #1
    def test_treebank_span_tokenizer(self):
        """
        Test TreebankWordTokenizer.span_tokenize function
        """

        tokenizer = TreebankWordTokenizer()

        # Test case in the docstring
        test1 = "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks)."
        expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), (24, 26),
                    (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), (40, 46),
                    (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), (60, 62),
                    (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
        result = list(tokenizer.span_tokenize(test1))  # list() so the comparison works when span_tokenize returns a generator
        self.assertEqual(result, expected)

        # Test case with double quotation
        test2 = "The DUP is similar to the \"religious right\" in the United States and takes a hardline stance on social issues"
        expected = [(0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25),
                    (26, 27), (27, 36), (37, 42), (42, 43), (44, 46), (47, 50),
                    (51, 57), (58, 64), (65, 68), (69, 74), (75, 76), (77, 85),
                    (86, 92), (93, 95), (96, 102), (103, 109)]
        result = list(tokenizer.span_tokenize(test2))
        self.assertEqual(result, expected)

        # Test case with double quotation as well as converted quotations
        test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
        expected = [(0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25),
                    (26, 27), (27, 36), (37, 42), (42, 43), (44, 46), (47, 50),
                    (51, 57), (58, 64), (65, 68), (69, 74), (75, 76), (77, 79),
                    (79, 87), (87, 89), (90, 96), (97, 99), (100, 106),
                    (107, 113)]
        result = list(tokenizer.span_tokenize(test3))
        self.assertEqual(result, expected)
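
Each (start, end) pair yielded by span_tokenize is a character offset into the original string, which is what the expected tuples above encode. A minimal standalone sketch (not part of the original test) that slices the tokens back out:

from nltk.tokenize import TreebankWordTokenizer

text = "Good muffins cost $3.88\nin New (York)."
tokenizer = TreebankWordTokenizer()
# Each span slices the original text back out; quote characters are the one
# case where the emitted token string can differ from the slice.
for start, end in tokenizer.span_tokenize(text):
    print(start, end, repr(text[start:end]))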
Example #2
    def test_document(self):
        """
        Test basic use of Tipster's Document class; detailed tests TODO.
        """
        # We use a naive text sample and a plain PunktSentenceTokenizer to split sentences;
        # a more advanced, practical sentence splitter would be needed to handle abbreviations.
        sample_txt = 'This is a sample text. It has two sentences.'
        true_toks = 'This is a sample text . It has two sentences .'.split()

        # add sentence annotations
        doc = Document(sample_txt)
        senttokenizer = PunktSentenceTokenizer()
        sents = senttokenizer.tokenize(sample_txt)
        sent_spans = senttokenizer.span_tokenize(sample_txt)
        for sent, sent_span in zip(sents, sent_spans):
            doc.annotate(sent_span[0], sent_span[1], 'sentence')

        # add token annotations
        treebanktokenizer = TreebankWordTokenizer()
        # use sentence annotation to retrieve sentences
        for sent in doc.get_annotations_by_type('sentence'):
            toks = treebanktokenizer.tokenize(doc.get_text_by_ann(sent))
            spans = treebanktokenizer.span_tokenize(doc.get_text_by_ann(sent))
            sent_base = sent.span.start
            for tok, tok_span in zip(toks, spans):
                span = Span(tok_span[0] + sent_base, tok_span[1] + sent_base)
                doc.annotate(span.start, span.end, 'token')

        # check if all tokens are correct
        for tok, true_tok in zip(doc.get_annotations_by_type('token'),
                                 true_toks):
            self.assertEqual(doc.get_text_by_ann(tok), true_tok)
Example #3
class SimpleTokenizer(BaseTokenizer):
    def __init__(self, doc, tok_anno_name='token', sent_anno_name='sentence'):
        self.doc = doc
        self.tok_anno_name = tok_anno_name
        super(SimpleTokenizer, self).__init__(self.doc, self.tok_anno_name)
        self.sent_anno_name = sent_anno_name
        self.sentTokenizer = PunktSentenceTokenizer()
        self.treebankTokenizer = TreebankWordTokenizer()

    def tokenize(self, start_pos=0, text=None):
        if text is None:  # if text not given, assume it spans to the end of doc
            text = self.doc.get_text_by_span(
                Span(start_pos, self.doc.get_length()))

        # add sentence annotations
        sents = self.sentTokenizer.tokenize(text)
        sent_spans = self.sentTokenizer.span_tokenize(text)
        for sent, sent_span in zip(sents, sent_spans):
            self.doc.annotate(start_pos + sent_span[0],
                              start_pos + sent_span[1], self.sent_anno_name)

        # use sentence annotation to retrieve sentences
        for sent in self.doc.get_annotations_by_type(self.sent_anno_name):
            toks = self.treebankTokenizer.tokenize(
                self.doc.get_text_by_ann(sent))
            spans = self.treebankTokenizer.span_tokenize(
                self.doc.get_text_by_ann(sent))
            sent_base = sent.span.start
            for tok, tok_span in zip(toks, spans):
                span = Span(tok_span[0] + sent_base, tok_span[1] + sent_base)
                self.doc.annotate(span.start, span.end, self.tok_anno_name)
Example #4
class Parse(Service):
    def __init__(self, langs=[]):
        Service.__init__(self, 'parse', 'nltk', [])
        self.punktSentenceTokenizer = PunktSentenceTokenizer()
        self.treebankWordTokenizer = TreebankWordTokenizer()
        # PunktSentenceTokenizer, TreebankWordTokenizer

    def run(self, request, response):
        if request['lang'] == 'en':
            text = request['text']
            debug = request.get('debug', False)
            result = []
            for sent_s, sent_e in self.punktSentenceTokenizer.span_tokenize(
                    text):
                tokens = []
                sentence = text[sent_s:sent_e]
                for token_s, token_e in self.treebankWordTokenizer.span_tokenize(
                        sentence):
                    item = {'start': token_s + sent_s, 'end': token_e + sent_s}
                    if debug:
                        item['text'] = sentence[token_s:token_e]
                    tokens.append(item)
                result.append(tokens)
            return result
        else:
            raise MissingLanguage(request['lang'])

    def describe(self):
        result = super().describe()
        result['langs'] = ['en']
        result['models'] = {'en': {'pretrained': True}}
        return result
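
Service and MissingLanguage above come from the surrounding project, so here is a hedged, standalone sketch of just the offset arithmetic that run() performs: Punkt yields sentence spans over the document, Treebank yields token spans relative to each sentence, and adding sent_s converts them back to document offsets.

from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

text = "Good muffins cost $3.88 in New York. Please buy me two of them."
sent_tok = PunktSentenceTokenizer()
word_tok = TreebankWordTokenizer()
result = []
for sent_s, sent_e in sent_tok.span_tokenize(text):
    sentence = text[sent_s:sent_e]
    # Token spans are sentence-relative; shift by sent_s to get document offsets.
    result.append([{'start': s + sent_s, 'end': e + sent_s, 'text': sentence[s:e]}
                   for s, e in word_tok.span_tokenize(sentence)])
print(result[0][:3])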
Example #5
    def test_treebank_span_tokenizer(self):
        """
        Test TreebankWordTokenizer.span_tokenize function
        """

        tokenizer = TreebankWordTokenizer()

        # Test case in the docstring
        test1 = "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks)."
        expected = [
            (0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)
        ]
        result = list(tokenizer.span_tokenize(test1))  # list() so the comparison works when span_tokenize returns a generator
        self.assertEqual(result, expected)

        # Test case with double quotation
        test2 = "The DUP is similar to the \"religious right\" in the United States and takes a hardline stance on social issues"
        expected = [
            (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
            (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
            (65, 68), (69, 74), (75, 76), (77, 85), (86, 92), (93, 95), (96, 102),
            (103, 109)
        ]
        result = list(tokenizer.span_tokenize(test2))
        self.assertEqual(result, expected)

        # Test case with double quotation as well as converted quotations
        test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
        expected = [
            (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
            (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
            (65, 68), (69, 74), (75, 76), (77, 79), (79, 87), (87, 89), (90, 96),
            (97, 99), (100, 106), (107, 113)
        ]
        result = list(tokenizer.span_tokenize(test3))
        self.assertEqual(result, expected)
Example #6
class SpanTokenizer:
    """
    A custom, more detailed tokenizer built on the NLTK toolkit.
    """
    def __init__(self):
        self.tokenizer_big = PunktSentenceTokenizer()
        self.tokenizer_small = TreebankWordTokenizer()

    def tokenize(self, text):
        result = []
        sentences_span = self.tokenizer_big.span_tokenize(text)
        for start, end in sentences_span:
            sentence = text[start:end]
            tokens_span = self.tokenizer_small.span_tokenize(sentence)
            for token_start, token_end in tokens_span:
                result.append([start + token_start, start + token_end])
        return result
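
A minimal usage sketch (not in the original snippet): each returned pair is a document-level character offset, composed from the sentence span and the sentence-relative token span.

tokenizer = SpanTokenizer()
text = "This is a sample text. It has two sentences."
for start, end in tokenizer.tokenize(text):
    print(start, end, text[start:end])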
Example #7
class Tokenizer:
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

    def tokenize(self, text):
        return self.tokenizer.tokenize(text)

    def get_token_spans(self, text):
        return self.tokenizer.span_tokenize(text)

    def tokenize_document(self, out_dir, doc_text):
        token_set = AnnotationSet("tokens")
        tokens = self.tokenize(doc_text)
        spans = self.get_token_spans(doc_text)
        ann_id = 0
        for token, span in zip(tokens, spans):
            token_set.add(Annotation(ann_id, token, "token", span[0], span[1]))
            ann_id += 1
        token_set.write_annotation_file(out_dir)
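
AnnotationSet and Annotation above come from the surrounding project and are not shown here. A hedged note on the zip in tokenize_document: tokenize() and get_token_spans() are driven by the same TreebankWordTokenizer, so the n-th token lines up with the n-th span; the one caveat is that Treebank converts straight double quotes to `` and '', so the token string can differ from the text the span points at.

tok = Tokenizer()
text = 'She said "hello" to me.'
for token, (start, end) in zip(tok.tokenize(text), tok.get_token_spans(text)):
    print(token, (start, end), repr(text[start:end]))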
Example #8
    def tokenize(cls, content: str, lang="en_GB") -> List["Word"]:
        language = languages[lang]
        symbols = [",", ".", ":", "?", '"', "-", "!", "(", ")", ";"]
        words = []

        t = TreebankWordTokenizer()
        spans = t.span_tokenize(content)

        for start, end in spans:
            word_content = content[start:end]
            if word_content not in symbols:
                sub_words = word_content.split("-")
                sub_word_start = start
                sub_word_end = start
                for s in sub_words:
                    sub_word_end = sub_word_start + len(s)
                    words.append(Word(s, sub_word_start, sub_word_end, lang))
                    sub_word_start = sub_word_end + 1  # step over the "-" separator

        return words
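
Word and the languages mapping above are project-specific. A sketch of the same hyphen-splitting span arithmetic using plain tuples instead, assuming the hyphen separator is skipped when computing sub-word offsets:

from nltk.tokenize import TreebankWordTokenizer

content = "A well-known example."
for start, end in TreebankWordTokenizer().span_tokenize(content):
    piece = content[start:end]
    sub_start = start
    for part in piece.split("-"):
        sub_end = sub_start + len(part)
        print(part, (sub_start, sub_end), repr(content[sub_start:sub_end]))
        sub_start = sub_end + 1  # step over the hyphen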
Example #9
class SemEval13InductionLoader(AbstractLoader):
    def __init__(self, fpath, n_surrounding=1):
        """
        Parameters
        ----------
        fpath : str
            The file path containing the XML file
        n_surrounding : int
            The number of surrounding sentences to capture context
        """
        super(SemEval13InductionLoader, self).__init__(fpath, n_surrounding)
        self.tokenizer = TreebankWordTokenizer()
        self.sent_segmenter = nltk.data.load('tokenizers/punkt/english.pickle')
        fnames = sorted(glob.glob(fpath + "/*.xml"))
        for fname in fnames:
            tree = ET.parse(fname)
            root = tree.getroot()
            assert root.tag.lower() == "instances", "Root element must be named 'instances'"
            for instance in root:
                lexel = instance.get("lemma") + '.' + instance.get("partOfSpeech")
                iid = instance.get("id")
                start_offset = int(instance.get("tokenStart"))
                end_offset = int(instance.get("tokenEnd"))
                sentence = instance.text
                
                # tokenize sentences
                tok_sentence, char2tok = self.tokenize(sentence)
                
                head_instance = []
                tok_offset = char2tok[start_offset]
                tok_end_offset = char2tok[end_offset-1]

                head_instance += [(t, '#', '#') for t in tok_sentence[:tok_offset]]
                head_instance.append((tok_sentence[tok_offset], lexel, iid))
                head_instance += [(t, '*', '*') for t in tok_sentence[tok_offset+1:tok_end_offset+1]]
                head_instance += [(t, '#', '#') for t in tok_sentence[tok_end_offset+1:]]

                self.sent_instances.append([[], head_instance, []])
    
    @classmethod
    def record_keys(cls, fpath):
        keys = {}
        with open_file(fpath, 'r') as f:
            for line in f:
                l = line.strip()
                toks = l.split(' ')
                key_str = toks[1]
                keys[toks[0]] = key_str
        return keys

    @classmethod
    def write_output(cls, iid, sense):
        return "{0} {1}".format(iid, sense)

    def remove_control_characters(self, s):
        return ''.join(ch for ch in s if category(ch)[0] != 'C' or category(ch) == 'Cf')

    def tokenize(self, sentence):
        tokspans = self.tokenizer.span_tokenize(sentence)
        char2tok = {}
        tokens = []
        for i, (s, e) in enumerate(tokspans):
            tokens.append(sentence[s:e])
            for j in range(s, e):
                char2tok[j] = i
        return (tokens, char2tok)
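
The tokenize() helper above builds a character-to-token index (char2tok) so that the character offsets read from the XML attributes can be mapped onto token positions. A minimal sketch of that idea in isolation:

from nltk.tokenize import TreebankWordTokenizer

sentence = "The DUP takes a hardline stance."
tokenizer = TreebankWordTokenizer()
tokens, char2tok = [], {}
for i, (s, e) in enumerate(tokenizer.span_tokenize(sentence)):
    tokens.append(sentence[s:e])
    char2tok.update({j: i for j in range(s, e)})
print(tokens[char2tok[16]])  # character 16 falls inside "hardline"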
Example #10
class Senseval2LSLoader(AbstractLoader):
    def __init__(self, fpath, n_surrounding=1):
        """
        Parameters
        ----------
        fpath : str
            The file path containing the XML file
        n_surrounding : int
            The number of surrounding sentences to capture context
        """
        super(Senseval2LSLoader, self).__init__(fpath, n_surrounding)
        parser = etree.XMLParser(dtd_validation=True)
        tree = ET.parse(fpath, parser)
        root = tree.getroot()
        assert root.tag.lower() == "corpus", "Root element must be named 'corpus'"
        self.tokenizer = TreebankWordTokenizer()
        self.sent_segmenter = nltk.data.load('tokenizers/punkt/english.pickle')
        for lexelt in root:
            lexel_orig = lexelt.get("item")
            print(lexel_orig, file=sys.stderr, flush=True)
            for instance in lexelt:
                # record instance ID
                iid = instance.get("id")
                context = instance.find('context')
                sentences = []
                leftstr = context.text  # everything before the head
                headelem = context[0]  # head element
                headstr = headelem.text

                # left part of sentence: left context + head
                leftstr += headstr
                sentences += [self.remove_control_characters(s) for s in self.sent_segmenter.tokenize(leftstr.replace('\n', ' '))]
                sent_offset = len(sentences) - 1
                tok_char_end_offset = len(sentences[-1])  # offset after the last char of head
                tok_char_offset = tok_char_end_offset - len(headstr)

                sats_attr = headelem.get("sats")
                rightstr = ''
                if headelem.tail is not None:
                    rightstr += headelem.tail
                lexel_sat = None
                if sats_attr is not None:
                    lexel_sat = sats_attr.split(' ')[0].split('.')[0] + '.' + lexel_orig.split('.')[-1]
                    for i in range(1, len(context)):
                        rightstr += context[i].text
                        rightstr += context[i].tail
                righttoks = [self.remove_control_characters(s) for s in self.sent_segmenter.tokenize(rightstr.replace('\n', ' '))]
                if len(righttoks) > 0:
                    sentences[-1] += righttoks[0]
                    sentences += righttoks[1:]

                # tokenize sentences
                tok_sentences, char2toks = zip(*[self.tokenize(sen) for sen in sentences])

                # create instances
                left_instance = []
                left_begin = max(0, sent_offset - n_surrounding) if n_surrounding >= 0 else 0
                for i in range(left_begin, sent_offset):
                    left_instance += [(t, '#', '#') for t in tok_sentences[i]] + [("[SEP]", '#', '#')]

                head_instance = []
                tok_offset = char2toks[sent_offset][tok_char_offset]
                tok_end_offset = char2toks[sent_offset][tok_char_end_offset-1]

                head_instance += [(t, '#', '#') for t in tok_sentences[sent_offset][:tok_offset]]
                lexel = lexel_sat if lexel_sat is not None else lexel_orig
                head_instance.append((tok_sentences[sent_offset][tok_offset], lexel, iid))
                head_instance += [(t, '*', '*') for t in tok_sentences[sent_offset][tok_offset+1:tok_end_offset+1]]
                head_instance += [(t, '#', '#') for t in tok_sentences[sent_offset][tok_end_offset+1:]]

                right_instance = []
                right_end = min(sent_offset + n_surrounding + 1, len(sentences))
                for i in range(sent_offset + 1, right_end):
                    right_instance += [("[SEP]", '#', '#')] + [(t, '#', '#') for t in tok_sentences[i]]

                self.sent_instances.append([left_instance, head_instance, right_instance])
    
    @classmethod
    def record_keys(cls, fpath):
        keys = {}
        with open_file(fpath, 'r') as f:
            for line in f:
                l = line.strip()
                toks = l.split(' ')
                iid = toks[1]
                key_strs = [t for t in toks[2:] if (t != 'P' and t != 'U')]
                key_str = 'U' if len(key_strs) == 0 else key_strs[0]
                keys[iid] = key_str
        return keys

    @classmethod
    def write_output(cls, iid, sense):
        return "{0} {1} {2}".format(iid.split('.')[0], iid, sense)

    def remove_control_characters(self, s):
        return ''.join(ch for ch in s if category(ch)[0] != 'C' or category(ch) == 'Cf')

    def tokenize(self, sentence):
        tokspans = self.tokenizer.span_tokenize(sentence)
        char2tok = {}
        tokens = []
        for i, (s, e) in enumerate(tokspans):
            tokens.append(sentence[s:e])
            for j in range(s, e):
                char2tok[j] = i
        return (tokens, char2tok)