def parsed_sentences(input_path):
    """Read a Stanford parse file and return one processed parse per sentence.

    Args:
        input_path: path to a UTF-8 file in the format expected by
            ParsedSentencesLoader.

    Returns:
        list of per-sentence results from
        StanfordParseLoader._process_parse_result.
    """
    with codecs.open(input_path, 'r', 'utf8') as f:
        text = f.read()
    loader = ParsedSentencesLoader()
    sentences = loader.load(text)
    # Fix: the original kept a counter `i` that was incremented but never
    # used; the append loop is now a comprehension.
    return [StanfordParseLoader._process_parse_result(sentence)
            for sentence in sentences['sentences']]
def parse_text(sentences):
    """Parse `sentences` and merge the result into a single sentence entry.

    Runs ParsedSentencesLoader over the input; when it yields more than one
    sentence, renumbers the word indices embedded in each dependency token
    (``word-<index>``) by the cumulative word count of the preceding
    sentences, then concatenates text, dependencies and words of all
    sentences into the first entry and drops the rest.

    Returns:
        the loader's result dict with exactly one entry in 'sentences'.
    """
    loader = ParsedSentencesLoader()
    parseResult = loader.load(sentences)
    allSents = parseResult['sentences']
    if len(allSents) == 1:
        return parseResult

    offset = 0
    for sentIdx, sent in enumerate(allSents):
        if sentIdx > 0:
            for dep in sent['dependencies']:
                # slots 1 and 2 hold governor/dependent tokens of the form
                # "word-<index>"; slot 0 is the relation label.
                for slot in (1, 2):
                    tokens = dep[slot].split('-')
                    if tokens[0] == 'ROOT':
                        shifted = 0  # ROOT always stays at index 0
                    elif tokens[-1].isdigit():
                        shifted = int(tokens[-1]) + offset
                    else:
                        # forced to do this because of entries like
                        # u"lost-8'" in parseResult
                        continue
                    # '-'.join over all but the trailing index restores the
                    # word even when it contains hyphens itself.
                    dep[slot] = '-'.join(tokens[:-1]) + '-' + str(shifted)
        offset += len(sent['words'])

    # merge information of all sentences into one
    first = allSents[0]
    for extra in allSents[1:]:
        first['text'] += ' ' + extra['text']
        first['dependencies'].extend(extra['dependencies'])
        first['words'].extend(extra['words'])

    # remove all but the first entry
    parseResult['sentences'] = allSents[0:1]
    return parseResult
import codecs
import sys

from utils.parsed_sentences_loader import ParsedSentencesLoader
from utils.stanford_format import StanfordParseLoader
from utils.conll_format import CONNL
from alignment.context_evidence import ContextEvidence

# Smoke-test script: load a sample Stanford parse file and process each
# sentence through StanfordParseLoader.
with codecs.open('data_test/test.parse', 'r', 'utf8') as f:
    text = f.read()
loader = ParsedSentencesLoader()
sentences = loader.load(text)
parsed = []
for sentence in sentences['sentences']:
    parsed.append(StanfordParseLoader.process_parse_result(sentence))

# NOTE(review): this sys.exit() makes everything below unreachable —
# presumably a debugging toggle; confirm before removing either half.
sys.exit()

# Unreachable section: loads a CoNLL-format parse from a hard-coded,
# machine-specific absolute path and dumps a tab-separated summary.
parsed = CONNL.load('/home/marina/workspace/data/TRJuly/txtfile.output.tok.parse')
print(str(len(parsed)))
with codecs.open('data_test/test.parse.out', 'w', 'utf8') as o:
    for i, sentence in enumerate(parsed):
        o.write('Sentence: {}\n'.format(i + 1))
        for word in sentence:
            # columns: index, surface form, dependency label, head index
            # (-1 when the word has no head)
            o.write('{}\t{}\t{}\t{}\n'.format(word.index, word.form, word.dep, -1 if word.head is None else word.head.index))
        o.write('\n')
# NOTE(review): redundant — the `with` block already closed the file;
# original (collapsed) formatting made the intended indent ambiguous.
o.close()