def preprocess(input_file, START_SNLP=True, INPUT_AMR='amr'):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None

    if INPUT_AMR == 'amr':  # the input file is AMR annotation
        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            comments, amr_strings = readAMR(aligned_amr_file)
        else:
            comments, amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments]  # here should be 'snt'

        # write sentences (one per line)
        tmp_sent_filename = amr_file + '.sent'
        if not os.path.exists(tmp_sent_filename):  # no cache found
            _write_sentences(tmp_sent_filename, sentences)

        tmp_prp_filename = tmp_sent_filename + '.prp'
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP..."
            proc1.setup()

        print >> log, 'Read token, lemma, named entity file %s...' % (tmp_prp_filename)
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename):  # write tokenized AMR file
            _write_tok_amr(tok_amr_filename, amr_file, instances)

        SpanGraph.graphID = 0
        for i in xrange(len(instances)):
            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment, s2c_alignment = Aligner.readJAMRAlignment(
                    amr, comments[i]['alignments'])
                # use verbalization list to fix the unaligned tokens
                if constants.FLAG_VERB:
                    Aligner.postProcessVerbList(amr, comments[i]['tok'], alignment)
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr, alignment, s2c_alignment,
                                                      instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)

    elif INPUT_AMR == 'amreval':
        eval_file = input_file
        comments = readAMREval(eval_file)
        sentences = [c['snt'] for c in comments]

        # write sentences (one per line)
        tmp_sent_filename = eval_file + '.sent'
        if not os.path.exists(tmp_sent_filename):  # no cache found
            _write_sentences(tmp_sent_filename, sentences)

        tmp_prp_filename = tmp_sent_filename + '.prp'
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP ..."
            proc1.setup()
            instances = proc1.parse(tmp_sent_filename)
        elif os.path.exists(tmp_prp_filename):  # found cache file
            print >> log, 'Read token, lemma, named entity file %s...' % (tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        else:
            raise Exception('No cache file %s has been found. Set START_SNLP=True to start CoreNLP.' % (tmp_prp_filename))

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        for i in xrange(len(instances)):
            instances[i].addComment(comments[i])

    else:  # input file is plain sentences
        tmp_sent_filename = input_file

        tmp_prp_filename = tmp_sent_filename + '.prp'
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP ..."
            proc1.setup()
            instances = proc1.parse(tmp_sent_filename)
        elif os.path.exists(tmp_prp_filename):  # found cache file
            print >> log, 'Read token, lemma, named entity file %s...' % (tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        else:
            raise Exception('No cache file %s has been found. Set START_SNLP=True to start CoreNLP.' % (tmp_prp_filename))

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

    # preprocess 2: dependency parsing
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename + '.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = codecs.open(dep_filename, 'r', encoding='utf-8').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = codecs.open(dep_filename, 'w', encoding='utf-8')
            output_dep.write(dep_result)
            output_dep.close()
        _add_dependency(instances, dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename + '.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = codecs.open(dep_filename, 'r', encoding='utf-8').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        if constants.FLAG_ONTO == 'onto':
            dep_filename = tok_sent_filename + '.charniak.onto.parse.dep'
        elif constants.FLAG_ONTO == 'onto+bolt':
            dep_filename = tok_sent_filename + '.charniak.onto+bolt.parse.dep'
        else:
            dep_filename = tok_sent_filename + '.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not found' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = codecs.open(dep_filename, 'r', encoding='utf-8').read()
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename + '.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename + '.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename + '.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    else:
        #pass
        raise Exception('Unknown dependency parse type %s' % (constants.FLAG_DEPPARSER))

    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop' if constants.FLAG_ONTO != 'onto+bolt' \
            else tok_sent_filename + '.onto+bolt.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances, prop_filename, dep_filename, FIX_PROP_HEAD=True)
            else:
                _add_prop(instances, prop_filename, dep_filename)
        else:
            raise IOError('Semantic role labeling file %s not found!' % (prop_filename))

    if constants.FLAG_RNE:
        print >> log, "Using rich named entities instead..."
        rne_filename = tok_sent_filename + '.rne'
        if os.path.exists(rne_filename):
            _substitute_rne(instances, rne_filename)
        else:
            raise IOError('Rich named entity file %s not found!' % (rne_filename))

    return instances
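# Hypothetical usage sketch (not part of the original module). It only calls
# the preprocess() defined above; the file names are made-up examples, and any
# INPUT_AMR value other than 'amr'/'amreval' falls through to the
# plain-sentence branch.
#
#   # AMR training data (optionally with a pre-aligned .amr.tok.aligned file):
#   train_instances = preprocess('amr-release-train.txt', START_SNLP=True, INPUT_AMR='amr')
#
#   # AMR evaluation data:
#   eval_instances = preprocess('amr-release-dev.eval', START_SNLP=True, INPUT_AMR='amreval')
#
#   # Plain one-sentence-per-line input:
#   test_instances = preprocess('test.sent', START_SNLP=True, INPUT_AMR='sent')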
def preprocess(input_file, START_SNLP=True, INPUT_AMR=True):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None

    if INPUT_AMR:  # the input file is AMR annotation
        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            comments, amr_strings = readAMR(aligned_amr_file)
        else:
            comments, amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments]  # here should be 'snt'

        tmp_sent_filename = amr_file + '.sent'
        if not os.path.exists(tmp_sent_filename):  # write sentences into file
            _write_sentences(tmp_sent_filename, sentences)

        print >> log, "Start Stanford CoreNLP..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP:
            proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename):  # write tokenized AMR file
            _write_tok_amr(tok_amr_filename, amr_file, instances)

        SpanGraph.graphID = 0
        for i in range(len(instances)):
            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment, s2c_alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr, alignment, s2c_alignment, instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)
    else:  # input file is plain sentences
        tmp_sent_filename = input_file

        print >> log, "Start Stanford CoreNLP ..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP:
            proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

    # preprocess 2: dependency parsing
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename + '.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = open(dep_filename, 'w')
            output_dep.write(dep_result)
            output_dep.close()
        _add_dependency(instances, dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename + '.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        dep_filename = tok_sent_filename + '.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not found' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = open(dep_filename, 'r').read()
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename + '.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename + '.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename + '.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    else:
        pass

    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances, prop_filename, dep_filename, FIX_PROP_HEAD=True)
            else:
                _add_prop(instances, prop_filename, dep_filename)
        else:
            # IOError instead of Python 3's FileNotFoundError, for consistency
            # with the Python 2 idioms used throughout this module
            raise IOError('Semantic role labeling file %s not found!' % (prop_filename))

    return instances
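# Hypothetical configuration sketch (assumption, not original code): the
# dependency-parsing and SRL steps above are selected through module-level
# flags on `constants`; a driver script might set them before calling
# preprocess(), e.g.:
#
#   constants.FLAG_DEPPARSER = 'stdconv+charniak'   # Charniak parses converted to Stanford dependencies
#   constants.FLAG_PROP = False                     # skip adding SRL information
#   instances = preprocess('dev.sent', START_SNLP=True, INPUT_AMR=False)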