def parse_parser_results(text):
    """
    This is the nasty bit of code to interact with the command-line
    interface of the CoreNLP tools.  Takes a string of the parser results
    and returns a single Data object holding every parsed sentence.
    """
    data = Data()
    state = STATE_START
    #for line in re.split("\r\n(?![^\[]*\])", text):
    for line in re.split("\r\n", text):
        line = line.strip()

        if line == 'NLP>':
            break
        if line.startswith("Sentence #"):
            state = STATE_TEXT
        elif state == STATE_TEXT:
            Data.newSen()
            data.addText(line)
            state = STATE_WORDS
        elif state == STATE_WORDS:
            if len(line) == 0:
                continue
            if not line.startswith("[Text="):
                raise Exception('Parse error. Could not find "[Text=" in: %s' % line)
            for s in WORD_PATTERN.findall(line):
                t = parse_bracketed(s)
                if t[0] == '':
                    continue  # skip empty tokens
                data.addToken(t[0], t[1][u'CharacterOffsetBegin'],
                              t[1][u'CharacterOffsetEnd'], t[1][u'Lemma'],
                              t[1][u'PartOfSpeech'], t[1][u'NamedEntityTag'])
            state = STATE_TREE
            parsed = []
        elif state == STATE_TREE:
            if len(line) == 0:
                state = STATE_DEPENDENCY
                parsed = " ".join(parsed)
                #data.addTree(Tree.parse(parsed))
            else:
                parsed.append(line)
        elif state == STATE_DEPENDENCY:
            if len(line) == 0:
                state = STATE_COREFERENCE
            else:
                pass
                ''' # not needed here
                split_entry = re.split("\(|, ", line[:-1])
                if len(split_entry) == 3:
                    rel, l_lemma, r_lemma = split_entry
                    m = re.match(r'(?P<lemma>.+)-(?P<index>[^-]+)', l_lemma)
                    l_lemma, l_index = m.group('lemma'), m.group('index')
                    m = re.match(r'(?P<lemma>.+)-(?P<index>[^-]+)', r_lemma)
                    r_lemma, r_index = m.group('lemma'), m.group('index')
                    data.addDependency(rel, l_lemma, r_lemma, l_index, r_index)
                '''
        elif state == STATE_COREFERENCE:
            if "Coreference set" in line:
                ##if 'coref' not in results:
                ##    results['coref'] = []
                coref_set = []
                data.addCoref(coref_set)
            else:
                for (src_i, src_pos, src_l, src_r,
                     sink_i, sink_pos, sink_l, sink_r,
                     src_word, sink_word) in CR_PATTERN.findall(line):
                    src_i, src_pos, src_l, src_r = \
                        int(src_i), int(src_pos), int(src_l), int(src_r)
                    sink_i, sink_pos, sink_l, sink_r = \
                        int(sink_i), int(sink_pos), int(sink_l), int(sink_r)
                    coref_set.append(((src_word, src_i, src_pos, src_l, src_r),
                                      (sink_word, sink_i, sink_pos, sink_l, sink_r)))
    return data
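
# Illustrative driver for parse_parser_results(). The transcript below is
# schematic, not verbatim CoreNLP output: it only exercises the markers the
# state machine keys on ("Sentence #", the text line, a "[Text=...]" token
# record, the blank lines that close the tree and dependency sections, and
# the terminating "NLP>" prompt). It assumes WORD_PATTERN and
# parse_bracketed() (defined elsewhere in this module) accept the usual
# "Text=... CharacterOffsetBegin=..." attribute format. The helper name is
# hypothetical and not part of the original API.
def _demo_parse_parser_results():
    sample = "\r\n".join([
        "Sentence #1 (1 tokens):",
        "Hello.",
        "[Text=Hello CharacterOffsetBegin=0 CharacterOffsetEnd=5"
        " PartOfSpeech=UH Lemma=hello NamedEntityTag=O]",
        "(ROOT (INTJ (UH Hello)))",
        "",      # blank line ends the tree section
        "",      # blank line ends the (skipped) dependency section
        "NLP>",
    ])
    return parse_parser_results(sample)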
def parse_parser_results_new(text):
    """
    This is the nasty bit of code to interact with the command-line
    interface of the CoreNLP tools.  Takes a string of the parser results
    and returns a list of Data objects, one for each parsed sentence.

    Updated for the newer (2015) version of Stanford CoreNLP, whose
    interactive output sometimes wraps a record across two physical lines;
    the STATE_SENT_ERROR / STATE_TEXT_ERROR states and the
    WORD_ERROR_PATTERN branch stitch those split records back together.
    """
    data_list = []
    data = None
    lastline = None
    following_line = None
    state = STATE_START
    #seqs = re.split("\r\n(?![^\[]*\])", text)
    seqs = re.split("\r\n", text)
    i = 0
    while i < len(seqs):
        line = seqs[i].strip()

        if line.startswith('NLP>'):  # end of output
            if data:
                data_list.append(data)  # add the last sentence
            break
        if line.startswith("Sentence #"):
            if data:
                data_list.append(data)
            data = Data()
            if SENTENCE_NO_PATTERN.match(line):
                state = STATE_TEXT
            else:
                # the sentence header was split across two lines
                lastline = line
                state = STATE_SENT_ERROR
            i += 1
        elif state == STATE_SENT_ERROR:
            line = lastline + line
            assert SENTENCE_NO_PATTERN.match(line) is not None
            state = STATE_TEXT
            i += 1
        elif state == STATE_TEXT_ERROR:
            # the text line was split in two: join both halves and record them
            line = line + following_line
            data.addText(line)
            state = STATE_WORDS
            i += 2
        elif state == STATE_TEXT:
            Data.newSen()
            data.addText(line)
            state = STATE_WORDS
            i += 1
        elif state == STATE_WORDS:
            if len(line) == 0:
                i += 1  # must advance, or an empty line here loops forever
                continue
            if not line.startswith("[Text="):
                #raise Exception('Parse error. Could not find "[Text=" in: %s' % line)
                print >> sys.stderr, 'Parse error. Could not find "[Text=" in: %s' % line
                print >> sys.stderr, 'Attempting to fix the error.'
                following_line = line
                state = STATE_TEXT_ERROR
                i -= 1  # back up to the truncated text line
                continue
            wline = line
            while WORD_PATTERN.match(wline):
                t = parse_bracketed(wline[1:-1])
                if t[0] == '':
                    i += 1
                    wline = seqs[i]
                    continue  # skip empty tokens
                data.addToken(t[0], t[1][u'CharacterOffsetBegin'],
                              t[1][u'CharacterOffsetEnd'], t[1][u'Lemma'],
                              t[1][u'PartOfSpeech'], t[1][u'NamedEntityTag'])
                i += 1
                wline = seqs[i]
            if WORD_ERROR_PATTERN.match(wline):
                # a token record was split across two lines: rejoin and parse
                wline = (wline + seqs[i + 1]).strip()
                t = parse_bracketed(wline[1:-1])
                data.addToken(t[0], t[1][u'CharacterOffsetBegin'],
                              t[1][u'CharacterOffsetEnd'], t[1][u'Lemma'],
                              t[1][u'PartOfSpeech'], t[1][u'NamedEntityTag'])
                i += 2
                state = STATE_WORDS
                continue
            state = STATE_TREE
            parsed = []
        elif state == STATE_TREE:
            if len(line) == 0:
                state = STATE_DEPENDENCY
                parsed = " ".join(parsed)
                #data.addTree(Tree.parse(parsed))
            else:
                parsed.append(line)
            i += 1
        elif state == STATE_DEPENDENCY:
            if len(line) == 0:
                state = STATE_COREFERENCE
            else:
                pass
                ''' # not needed here
                split_entry = re.split("\(|, ", line[:-1])
                if len(split_entry) == 3:
                    rel, l_lemma, r_lemma = split_entry
                    m = re.match(r'(?P<lemma>.+)-(?P<index>[^-]+)', l_lemma)
                    l_lemma, l_index = m.group('lemma'), m.group('index')
                    m = re.match(r'(?P<lemma>.+)-(?P<index>[^-]+)', r_lemma)
                    r_lemma, r_index = m.group('lemma'), m.group('index')
                    data.addDependency(rel, l_lemma, r_lemma, l_index, r_index)
                '''
            i += 1
        elif state == STATE_COREFERENCE:
            if "Coreference set" in line:
                #if 'coref' not in results:
                #    results['coref'] = []
                coref_set = []
                data.addCoref(coref_set)
            else:
                for (src_i, src_pos, src_l, src_r,
                     sink_i, sink_pos, sink_l, sink_r,
                     src_word, sink_word) in CR_PATTERN.findall(line):
                    src_i, src_pos, src_l, src_r = \
                        int(src_i), int(src_pos), int(src_l), int(src_r)
                    sink_i, sink_pos, sink_l, sink_r = \
                        int(sink_i), int(sink_pos), int(sink_l), int(sink_r)
                    coref_set.append(((src_word, src_i, src_pos, src_l, src_r),
                                      (sink_word, sink_i, sink_pos, sink_l, sink_r)))
            i += 1
        else:
            i += 1
    return data_list
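
# Sketch of how a caller might obtain the raw transcript these parsers
# consume, by driving CoreNLP's interactive shell with pexpect. Hedged: the
# command line is a placeholder supplied by the caller, and the prompt
# handling is an assumption based on the "NLP>" sentinel the parsers look
# for, not a verbatim copy of this wrapper's launcher. The helper name is
# hypothetical.
def _parse_sentence_interactively(corenlp_cmd, sentence):
    import pexpect  # third-party; assumed available

    child = pexpect.spawn(corenlp_cmd, timeout=600)
    child.expect("NLP> ")   # wait for CoreNLP to finish loading
    child.sendline(sentence)
    child.expect("NLP> ")   # everything printed so far is in child.before
    # Re-append the prompt so the parser's end-of-output check fires and the
    # last sentence gets added to the returned list.
    return parse_parser_results_new(child.before + "\r\nNLP>")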