Example #1
0
    def parsed_sentences(input_path):
        """Read a UTF-8 Stanford-parse file and process each sentence.

        Args:
            input_path: path to a UTF-8 encoded file containing parser output
                accepted by ParsedSentencesLoader.load().

        Returns:
            A list with one processed parse result per sentence.
        """
        with codecs.open(input_path, 'r', 'utf8') as f:
            text = f.read()

        loader = ParsedSentencesLoader()
        sentences = loader.load(text)
        # The original kept an unused running counter alongside this loop;
        # it has been removed and the loop collapsed into a comprehension.
        return [StanfordParseLoader._process_parse_result(sentence)
                for sentence in sentences['sentences']]
Example #2
0
def parse_text(sentences):
    """Parse text and merge a multi-sentence parse result into one sentence.

    Dependency endpoints look like ``'word-3'`` (word plus 1-based index).
    When several sentences are concatenated, every dependency index in
    sentence ``i`` must be shifted by the total word count of sentences
    ``0..i-1`` so indices stay unique in the merged result.

    Args:
        sentences: raw parser output accepted by ParsedSentencesLoader.load().

    Returns:
        The parse result with exactly one merged entry in ``'sentences'``.
    """
    loader = ParsedSentencesLoader()
    parseResult = loader.load(sentences)

    # A single sentence needs no renumbering or merging.
    if len(parseResult['sentences']) == 1:
        return parseResult

    wordOffset = 0
    for i, sentence in enumerate(parseResult['sentences']):
        if i > 0:
            for dependency in sentence['dependencies']:
                # Positions 1 and 2 hold the governor and dependent tokens.
                for k in range(1, 3):
                    tokens = dependency[k].split('-')
                    if tokens[0] == 'ROOT':
                        newWordIndex = 0
                    else:
                        # Forced to do this because of entries like
                        # u"lost-8'" in parseResult: skip tokens whose
                        # trailing part is not a plain integer index.
                        if not tokens[-1].isdigit():
                            continue
                        newWordIndex = int(tokens[-1]) + wordOffset
                    # Re-join the word part (which may itself contain
                    # hyphens) with the shifted index.
                    dependency[k] = '-'.join(tokens[:-1]) + '-' + str(newWordIndex)

        wordOffset += len(sentence['words'])

    # Merge text, dependencies and words of all sentences into the first one.
    first = parseResult['sentences'][0]
    for sentence in parseResult['sentences'][1:]:
        first['text'] += ' ' + sentence['text']
        first['dependencies'].extend(sentence['dependencies'])
        first['words'].extend(sentence['words'])

    # Keep only the merged first entry.
    parseResult['sentences'] = parseResult['sentences'][0:1]

    return parseResult
Example #3
0
import codecs
import sys

from utils.parsed_sentences_loader import ParsedSentencesLoader
from utils.stanford_format import StanfordParseLoader
from utils.conll_format import CONNL
from alignment.context_evidence import ContextEvidence

# Smoke-test driver: load a Stanford parse file and process every sentence.
with codecs.open('data_test/test.parse', 'r', 'utf8') as f:
    text = f.read()

loader = ParsedSentencesLoader()
sentences = loader.load(text)
parsed = []

for sentence in sentences['sentences']:
    parsed.append(StanfordParseLoader.process_parse_result(sentence))


# NOTE(review): everything below is intentionally disabled by sys.exit();
# it is kept as a debugging path that dumps CoNLL-loaded parses to a file.
sys.exit()
parsed = CONNL.load('/home/marina/workspace/data/TRJuly/txtfile.output.tok.parse')
print(len(parsed))

with codecs.open('data_test/test.parse.out', 'w', 'utf8') as o:
    for i, sentence in enumerate(parsed):
        o.write('Sentence: {}\n'.format(i + 1))
        for word in sentence:
            # head index -1 marks a word with no governor (root).
            o.write('{}\t{}\t{}\t{}\n'.format(word.index, word.form, word.dep, -1 if word.head is None else word.head.index))
        o.write('\n')
# Removed the redundant o.close(): the with-statement already closed the file.