Example No. 1
    def process(self, document):
        # Run the GENIA tagger over each raw sentence in the document and
        # attach tokenized, POS-tagged, and chunk-annotated views to it.
        assert isinstance(document, Document)

        tokenizedSentences = []
        taggedSentences = []
        parsedSentences = []
        for sentence in document.rawSentences:
            # Truncate words longer than MAX_WORDLEN characters so the
            # tagger is not fed overlong tokens, then rebuild the sentence.
            words = [w[:MAX_WORDLEN] for w in sentence.split()]
            sentence = ' '.join(words)

            tokenized = []
            tokenizedTagged = []
            parsed = []

            # The GENIA tagger expects a byte string (Python 2), so encode
            # unicode input as UTF-8 before tagging.
            if isinstance(sentence, unicode):
                sentence = sentence.encode('utf-8')
            geniaResult = geniatagger.tag_sentence(sentence)

            # Each result row is (word, base form, POS tag, chunk tag, NE tag).
            for word, base, POStag, chunktag, NEtag in geniaResult:
                tokenized.append(word)
                tokenizedTagged.append((word, POStag))
                parsed.append((word, POStag, chunktag))
            tokenizedSentences.append(tokenized)
            taggedSentences.append(tokenizedTagged)
            # The conll-style triples could instead be converted to an nltk
            # Tree via nltk.chunk.conlltags2tree(parsed); see the sketch
            # after Example No. 2.
            parsedSentences.append(parsed)
        document.tokenizedSentences = tokenizedSentences
        document.taggedSentences = taggedSentences
        document.parsedSentences = parsedSentences
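
For context, here is a minimal sketch of how this `process` method might be driven. The module name `preprocess`, the class name `Preprocessor`, and a `Document` constructor taking a list of raw sentences are all assumptions about the surrounding project, which the excerpt does not show; the GENIA tagger itself must be installed separately.

    # Hypothetical driver; the module name 'preprocess' and the class name
    # 'Preprocessor' are assumptions, not shown in the excerpt.
    from preprocess import Document, Preprocessor

    doc = Document([u'p53 activates transcription.'])
    pipeline = Preprocessor()
    pipeline.process(doc)
    print doc.taggedSentences[0]  # e.g. [('p53', 'NN'), ('activates', 'VBZ'), ...]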
Example No. 2
    def processSentence(self, sentence):
        # Tag a single sentence and return its tokenized, POS-tagged, and
        # chunk-annotated forms.
        tokenized = []
        tokenizedTagged = []
        parsed = []
        geniaResult = geniatagger.tag_sentence(sentence)
        # Each result row is (word, base form, POS tag, chunk tag, NE tag).
        for word, base, POStag, chunktag, NEtag in geniaResult:
            tokenized.append(word)
            tokenizedTagged.append((word, POStag))
            parsed.append((word, POStag, chunktag))
        # The parsed triples could instead be returned as an nltk Tree via
        # nltk.chunk.conlltags2tree(parsed); see the sketch below.
        return tokenized, tokenizedTagged, parsed
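
Both examples leave the nltk conversion commented out. As a minimal sketch of that final step, assuming NLTK is installed: `nltk.chunk.conlltags2tree` takes exactly the (word, POS tag, chunk tag) triples that `parsed` accumulates and assembles a chunk tree from the IOB labels. The example triples below are illustrative, not taken from the source.

    # Minimal sketch of the commented-out conversion, assuming NLTK is
    # installed; the triples mirror what the tagger loop stores in `parsed`.
    import nltk

    parsed = [('p53', 'NN', 'B-NP'), ('activates', 'VBZ', 'B-VP'),
              ('transcription', 'NN', 'B-NP'), ('.', '.', 'O')]

    tree = nltk.chunk.conlltags2tree(parsed)
    print tree
    # (S (NP p53/NN) (VP activates/VBZ) (NP transcription/NN) ./.)

Returning the flat triples, as both examples do, defers the choice of tree representation to the caller.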