예제 #1
0
def prepareSentence2(sentence):
    """Parse *sentence* and return a list of ``Word`` objects with lemma and POS set.

    The sentence is parsed once with ``parse_text``; the parse result is then
    passed to both ``lemmatize`` and ``posTag``.  For every lemmatized entry a
    combined row is built from all of that entry's fields followed by field 3
    of the POS entry at the same position.

    Each combined row becomes one ``Word`` constructed as
    ``Word(row[1] - 1, row[2])`` with ``lemma = row[3]`` and ``pos = row[4]``.
    NOTE(review): this presumably relies on lemmatized entries having exactly
    four fields, so that the appended POS value lands at index 4 -- confirm
    against ``lemmatize``.
    """
    parsed = parse_text(sentence)
    lemma_rows = lemmatize(parsed)
    pos_rows = posTag(parsed)

    # Build every combined row before creating any Word, pairing the two
    # result lists by position.
    merged_rows = [
        list(lemma_row) + [pos_rows[position][3]]
        for position, lemma_row in enumerate(lemma_rows)
    ]

    words = []
    for row in merged_rows:
        entry = Word(row[1] - 1, row[2])
        entry.lemma = row[3]
        entry.pos = row[4]
        words.append(entry)
    return words
예제 #2
0
 def _get_words(raw_sentence):
     """Build a list of ``Word`` objects from ``raw_sentence['words']``.

     Every entry of ``raw_sentence['words']`` is indexed as
     ``entry[0]`` (passed to ``Word`` as its second argument) and ``entry[1]``
     (a mapping read for the keys ``'Lemma'``, ``'PartOfSpeech'`` and
     ``'NamedEntityTag'``).  Words are numbered from 1 in input order, and the
     part-of-speech value is lower-cased before being stored.
     """
     collected = []
     # Number entries starting at 1 rather than adding 1 to a 0-based index.
     for position, entry in enumerate(raw_sentence['words'], 1):
         token = Word(position, entry[0])
         attributes = entry[1]
         token.lemma = attributes['Lemma']
         token.pos = attributes['PartOfSpeech'].lower()
         token.ner = attributes['NamedEntityTag']
         collected.append(token)
     return collected
예제 #3
0
    def load(path_input):
        """Read a tab-separated token file and return a list of sentences.

        The file at *path_input* is decoded as UTF-8.  Every non-blank line is
        split on tabs and turned into one ``Word``:

        * columns 0 and 1 are passed to ``Word(...)`` as its two arguments,
        * column 2 -> ``lemma``, column 4 -> ``pos``,
        * column 6 -> ``head``, column 7 -> ``dep``.

        All values are stored as the strings read from the file; nothing is
        converted to ``int``.  (Presumably a CoNLL-style layout -- confirm
        against the data files.)

        A blank line closes the current sentence.  Lines that are empty after
        stripping whitespace (including ``'\\r\\n'`` from CRLF files, which
        ``codecs.open`` does not translate) count as blank; previously only a
        bare ``'\\n'`` was recognised and other blank lines raised.

        Returns:
            A list of sentences, each a list of ``Word`` objects.  The sentence
            being accumulated when the file ends is always appended, so a file
            that ends with a blank line yields a trailing empty list.

        Raises:
            IndexError: if a non-blank line has fewer than 8 tab-separated
                columns.
        """
        with codecs.open(path_input, 'r', 'utf8') as f:
            lines = f.readlines()

        sentences = []
        sentence = []
        for line in lines:
            stripped = line.strip()

            # Blank (or whitespace-only / CRLF) line: sentence boundary.
            if not stripped:
                sentences.append(sentence)
                sentence = []
                continue

            parts = stripped.split('\t')
            # NOTE(review): the original comment here read "punctuation head
            # is root"; its meaning is not evident from this code.
            word = Word(parts[0], parts[1])
            word.lemma = parts[2]
            word.pos = parts[4]
            word.dep = parts[7]
            word.head = parts[6]
            sentence.append(word)

        # Flush whatever was accumulated after the last separator (may be empty).
        sentences.append(sentence)
        return sentences