Example #1
    def test_document(self):
        """
    test basic use of Tipster's Document class, detailed tests TODO
    """
        # we use a naive text sample here and a plain PunktSentenceTokenizer to split the sentences;
        # a more advanced sentence splitter would be needed to handle abbreviations in practice
        sample_txt = 'This is a sample text. It has two sentences.'
        true_toks = 'This is a sample text . It has two sentences .'.split()

        # add sentence annotations
        doc = Document(sample_txt)
        senttokenizer = PunktSentenceTokenizer()
        sents = senttokenizer.tokenize(sample_txt)
        sent_spans = senttokenizer.span_tokenize(sample_txt)
        for sent, sent_span in zip(sents, sent_spans):
            doc.annotate(sent_span[0], sent_span[1], 'sentence')

        # add token annotations
        treebanktokenizer = TreebankWordTokenizer()
        # use sentence annotation to retrieve sentences
        for sent in doc.get_annotations_by_type('sentence'):
            toks = treebanktokenizer.tokenize(doc.get_text_by_ann(sent))
            spans = treebanktokenizer.span_tokenize(doc.get_text_by_ann(sent))
            sent_base = sent.span.start
            for tok, tok_span in zip(toks, spans):
                span = Span(tok_span[0] + sent_base, tok_span[1] + sent_base)
                doc.annotate(span.start, span.end, 'token')

        # check if all tokens are correct
        for tok, true_tok in zip(doc.get_annotations_by_type('token'),
                                 true_toks):
            self.assertEqual(doc.get_text_by_ann(tok), true_tok)
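
As a standalone sanity check (a sketch added here, not part of the original test: it bypasses the Document class and works directly on the string), the same expectation can be verified with just the two NLTK tokenizers by shifting each sentence-relative token span by its sentence start:

from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

sample_txt = 'This is a sample text. It has two sentences.'
true_toks = 'This is a sample text . It has two sentences .'.split()
toks = []
for s, e in PunktSentenceTokenizer().span_tokenize(sample_txt):
    sent = sample_txt[s:e]
    # token spans are sentence-relative, so shift them by the sentence start
    toks += [sample_txt[s + ts:s + te]
             for ts, te in TreebankWordTokenizer().span_tokenize(sent)]
assert toks == true_toks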
Example #2
def sentence_tokenize(text: str, include_spans: bool = False) -> List:
    """
    Args:
        text (str): a unicode string
    Returns:
        list: tokenized sentence. If include_spans is True, then each item is a Span object  with both text and a start_index. Otherwise, only text is returned.
    """
    lines = re.split('\n', text)
    #lines = list(filter(None, sentences))
    sentences = []

    tokenizer = PunktSentenceTokenizer()
    total_offset = 0
    for line in lines:
        line_start = total_offset
        for sentence_start, sentence_end in tokenizer.span_tokenize(line):
            sentence = line[sentence_start:sentence_end]
            if include_spans:
                sentences.append(
                    Span(text=sentence,
                         start_index=line_start + sentence_start))
            else:
                sentences.append(sentence)
        # advance past the whole line plus the '\n' removed by the split, so
        # offsets stay aligned even when a line has trailing whitespace
        total_offset = line_start + len(line) + 1
    return sentences
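
A standalone check of the same offset bookkeeping (a sketch, not part of the module: it prints plain tuples instead of building the project's Span objects):

from nltk.tokenize import PunktSentenceTokenizer

text = "First sentence. Second sentence.\nThird sentence on a new line."
tokenizer = PunktSentenceTokenizer()
offset = 0
for line in text.split('\n'):
    for start, end in tokenizer.span_tokenize(line):
        # shift line-relative spans back into document coordinates
        print(offset + start, offset + end, text[offset + start:offset + end])
    offset += len(line) + 1  # +1 for the '\n' removed by the split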
Example #3
class Parse(Service):
    def __init__(self, langs=None):
        Service.__init__(self, 'parse', 'nltk', [])
        self.punktSentenceTokenizer = PunktSentenceTokenizer()
        self.treebankWordTokenizer = TreebankWordTokenizer()
        # PunktSentenceTokenizer, TreebankWordTokenizer

    def run(self, request, response):
        if request['lang'] == 'en':
            text = request['text']
            debug = request.get('debug', False)
            result = []
            for sent_s, sent_e in self.punktSentenceTokenizer.span_tokenize(
                    text):
                tokens = []
                sentence = text[sent_s:sent_e]
                for token_s, token_e in self.treebankWordTokenizer.span_tokenize(
                        sentence):
                    item = {'start': token_s + sent_s, 'end': token_e + sent_s}
                    if debug:
                        item['text'] = sentence[token_s:token_e]
                    tokens.append(item)
                result.append(tokens)
            return result
        else:
            raise MissingLanguage(request['lang'])

    def describe(self):
        result = super().describe()
        result['langs'] = ['en']
        result['models'] = {'en': {'pretrained': True}}
        return result
Example #4
class SimpleTokenizer(BaseTokenizer):
    def __init__(self, doc, tok_anno_name='token', sent_anno_name='sentence'):
        self.doc = doc
        self.tok_anno_name = tok_anno_name
        super(SimpleTokenizer, self).__init__(self.doc, self.tok_anno_name)
        self.sent_anno_name = sent_anno_name
        self.sentTokenizer = PunktSentenceTokenizer()
        self.treebankTokenizer = TreebankWordTokenizer()

    def tokenize(self, start_pos=0, text=None):
        if text is None:  # if text not given, assume it spans to the end of doc
            text = self.doc.get_text_by_span(
                Span(start_pos, self.doc.get_length()))

        # add sentence annotations
        sents = self.sentTokenizer.tokenize(text)
        sent_spans = self.sentTokenizer.span_tokenize(text)
        for sent, sent_span in zip(sents, sent_spans):
            self.doc.annotate(start_pos + sent_span[0],
                              start_pos + sent_span[1], self.sent_anno_name)

        # use sentence annotation to retrieve sentences
        for sent in self.doc.get_annotations_by_type(self.sent_anno_name):
            toks = self.treebankTokenizer.tokenize(
                self.doc.get_text_by_ann(sent))
            spans = self.treebankTokenizer.span_tokenize(
                self.doc.get_text_by_ann(sent))
            sent_base = sent.span.start
            for tok, tok_span in zip(toks, spans):
                span = Span(tok_span[0] + sent_base, tok_span[1] + sent_base)
                self.doc.annotate(span.start, span.end, self.tok_anno_name)
Example #5
    def __get_setences_boundaries(self):
        """
        Tokenize self.text into sentences and return the (start, end)
        character boundaries of each sentence.
        :return: list of (start, end) span tuples
        """
        tokenizer = PunktSentenceTokenizer()
        sentences = list(tokenizer.span_tokenize(self.text))
        return sentences
Example #6
class DefaultSentenceTokenizer(object):
    def __init__(self):
        self.tokenizer = PunktSentenceTokenizer()

    def tokenize_sents(self, text):
        """
        Returns spans
        """
        return self.tokenizer.span_tokenize(text)
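
Note that PunktSentenceTokenizer.span_tokenize returns a lazy generator, so callers that need the spans more than once should materialize them. A usage sketch (the sample text is made up; it assumes the class above and the NLTK import are in scope):

text = "One sentence here. Another one there."
spans = list(DefaultSentenceTokenizer().tokenize_sents(text))  # materialize the lazy generator
print(spans)
print([text[start:end] for start, end in spans])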
Example #7
def nltkSentenceSplit(document):
    sentenceTokenizer = PunktSentenceTokenizer()
    sentences = sentenceTokenizer.span_tokenize(document)
    sentenceDict = dict()

    for beginSpan, endSpan in sentences:
        # print(beginSpan, endSpan, document[beginSpan:endSpan])
        sentenceDict[beginSpan] = document[beginSpan:endSpan]

    return sentenceDict
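
A usage sketch (the sample document is made up; it assumes the function above and the PunktSentenceTokenizer import are in scope). Because the keys are character offsets, each sentence can be located again in the original document, and the dict preserves document order on Python 3.7+:

document = "First sentence of the note. Second sentence follows."
sentences = nltkSentenceSplit(document)
for begin, sentence in sentences.items():
    # the key is the sentence's start offset in the original document
    print(begin, repr(sentence))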
Example #8
class SpanTokenizer:
    """
    基于NLTK工具包,自定义一个详细版本的Tokenizer
    """
    def __init__(self):
        self.tokenizer_big = PunktSentenceTokenizer()
        self.tokenizer_small = TreebankWordTokenizer()

    def tokenize(self, text):
        result = []
        sentences_span = self.tokenizer_big.span_tokenize(text)
        for start, end in sentences_span:
            sentence = text[start:end]
            tokens_span = self.tokenizer_small.span_tokenize(sentence)
            for token_start, token_end in tokens_span:
                result.append([start + token_start, start + token_end])
        return result
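
A usage sketch (the sample text is made up; it assumes the class above and the NLTK imports are in scope). Each result entry is an absolute [start, end] pair into the original string, so slicing recovers the token even across sentence boundaries:

text = "Spans are absolute. Slicing recovers every token."
for start, end in SpanTokenizer().tokenize(text):
    print(start, end, text[start:end])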
Example #9
def main():
    conn = get_connection(UNICODE=True)
    curr = conn.cursor()
    tokenizer = TreebankWordTokenizer()

    while True:
        curr.execute("""SELECT id, text, language FROM documents 
                WHERE
                --guid='tw:122144569302323201'
                EXISTS ( SELECT 1 FROM instances WHERE item_id=documents.id AND begintoken IS NULL)
                LIMIT 1""")
        data = curr.fetchone()
        if data is None:
            print "sleep"
            timer.sleep_minute(30)
            continue
        id, text, lang = data
        print "id", id
        curr.execute("""SELECT * FROM instances
                WHERE item_id = %s
                AND begintoken IS NULL""", (id,))
        # throw away `confidence`
        instances = [list(x)[:-1] for x in curr]
        if not len(instances):
            continue
        instance_ = []
        for ins in instances:
            ins[-1] = None
            ins[-2] = None
            ins[-3] = None
            instance_.append(ins)
        instances = instance_
        #print instances

        sent_tok = PunktSentenceTokenizer()

        for sid, sentidx in enumerate(sent_tok.span_tokenize(text)):
            #print '++++'
            sentence = text[sentidx[0]:sentidx[1]]
            #print sentence
            #print '----'
            for pos, indexes in enumerate(WhitespaceTokenizer().span_tokenize(sentence)):
                # TODO: these indexes are only relative to the sentence,
                # but the instances store them from the start of the document!
                indexes = list(indexes)
                indexes[0] = sentidx[0] + indexes[0]
                indexes[1] = sentidx[0] + indexes[1]
                word = text[indexes[0]:indexes[1]]
                #print pos, word, indexes

                for i, instance in enumerate(instances):
                    id, entity_id, item_id, exact, offset, length, sid_, begin, end = instance
                    #print i,instance
                    if sid_ is None:
                        if begin is None:
                            if offset >= indexes[0] and offset <= indexes[1]:
                                instances[i][-2] = begin = pos
                                instances[i][-3] = sid_ = sid
                    if sid_ == sid:
                        if end is None and begin is not None:
                            off = offset + length
                            if off <= indexes[1] and off >= indexes[0]:
                                instances[i][-1] = pos
                                if off == indexes[0]:
                                    instances[i][-1] = pos - 1
        for instance in instances:
            print(instance)
            id, entity_id, item_id, exact, offset, length, sid, begin, end =instance
            #print exact, ">>", sid, begin, end
            if end is None:
                if not " " in exact:
                    end = begin
                else:
                    end = -1
            curr.execute("""UPDATE instances
                    SET sid=%s, begintoken=%s, endtoken=%s
                    WHERE id=%s""", (sid, begin, end, id))