Example #1
def preprocess(input_file, START_SNLP=True, INPUT_AMR='amr'):
    '''Preprocess the input (AMR annotations, an AMR eval file, or raw
    sentences): tokenize, POS-tag and NER-tag with Stanford CoreNLP,
    attach dependency parses, and optionally add SRL and rich named
    entity information. Returns the list of preprocessed instances.'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None

    if INPUT_AMR == 'amr':  # the input file is an AMR annotation file

        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            comments, amr_strings = readAMR(aligned_amr_file)
        else:
            comments, amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments]  # sentences are stored under the 'snt' comment key

        # write sentences (one per line)
        tmp_sent_filename = amr_file + '.sent'
        if not os.path.exists(tmp_sent_filename):  # no cache found
            _write_sentences(tmp_sent_filename, sentences)

        tmp_prp_filename = tmp_sent_filename + '.prp'

        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition with Stanford CoreNLP

        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP..."
            proc1.setup()

        print >> log, 'Read token, lemma, named entity file %s...' % (
            tmp_prp_filename)
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename):  # write tokenized amr file
            _write_tok_amr(tok_amr_filename, amr_file, instances)

        SpanGraph.graphID = 0
        for i in xrange(len(instances)):

            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment, s2c_alignment = Aligner.readJAMRAlignment(
                    amr, comments[i]['alignments'])
                # use verbalization list to fix the unaligned tokens
                if constants.FLAG_VERB:
                    Aligner.postProcessVerbList(amr, comments[i]['tok'],
                                                alignment)
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr, alignment,
                                                      s2c_alignment,
                                                      instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)

    elif INPUT_AMR == 'amreval':
        eval_file = input_file
        comments = readAMREval(eval_file)
        sentences = [c['snt'] for c in comments]

        # write sentences (one per line)
        tmp_sent_filename = eval_file + '.sent'
        if not os.path.exists(tmp_sent_filename):  # no cache found
            _write_sentences(tmp_sent_filename, sentences)

        tmp_prp_filename = tmp_sent_filename + '.prp'

        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition with Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP ..."
            proc1.setup()
            instances = proc1.parse(tmp_sent_filename)
        elif os.path.exists(tmp_prp_filename):  # found cache file
            print >> log, 'Read token, lemma, named entity file %s...' % (
                tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        else:
            raise Exception(
                'No cache file %s found. Set START_SNLP=True to start CoreNLP.'
                % (tmp_prp_filename))

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        for i in xrange(len(instances)):
            instances[i].addComment(comments[i])

    else:  # the input file contains raw sentences
        tmp_sent_filename = input_file
        tmp_prp_filename = tmp_sent_filename + '.prp'

        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition with Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP ..."
            proc1.setup()
            instances = proc1.parse(tmp_sent_filename)
        elif os.path.exists(tmp_prp_filename):  # found cache file
            print >> log, 'Read token, lemma, named entity file %s...' % (
                tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        else:
            raise Exception(
                'No cache file %s found. Set START_SNLP=True to start CoreNLP.'
                % (tmp_prp_filename))

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

    # preprocess 2: dependency parsing
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename + '.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = codecs.open(dep_filename, 'r',
                                     encoding='utf-8').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = codecs.open(dep_filename, 'w', encoding='utf-8')
            output_dep.write(dep_result)
            output_dep.close()

        _add_dependency(instances, dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename + '.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)

            dep_result = codecs.open(dep_filename, 'r',
                                     encoding='utf-8').read()
        else:
            raise IOError('Converted dependency file %s not found' %
                          (dep_filename))

        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        if constants.FLAG_ONTO == 'onto':
            dep_filename = tok_sent_filename + '.charniak.onto.parse.dep'
        elif constants.FLAG_ONTO == 'onto+bolt':
            dep_filename = tok_sent_filename + '.charniak.onto+bolt.parse.dep'
        else:
            dep_filename = tok_sent_filename + '.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not found' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = codecs.open(dep_filename, 'r', encoding='utf-8').read()
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename + '.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename + '.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename + '.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    else:
        raise Exception('Unknown dependency parser type %s' %
                        (constants.FLAG_DEPPARSER))

    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        if constants.FLAG_ONTO != 'onto+bolt':
            prop_filename = tok_sent_filename + '.prop'
        else:
            prop_filename = tok_sent_filename + '.onto+bolt.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances,
                          prop_filename,
                          dep_filename,
                          FIX_PROP_HEAD=True)
            else:
                _add_prop(instances, prop_filename, dep_filename)

        else:
            raise IOError('Semantic role labeling file %s not found!' %
                          (prop_filename))

    if constants.FLAG_RNE:
        print >> log, "Using rich name entity instead..."
        rne_filename = tok_sent_filename + '.rne'
        if os.path.exists(rne_filename):
            _substitute_rne(instances, rne_filename)
        else:
            raise IOError('Rich named entity file %s not found!' %
                          (rne_filename))

    return instances
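
A minimal usage sketch for this version (hypothetical: the module name and the input path below are assumptions for illustration; the function also depends on module-level names such as readAMR, constants, and log from the surrounding project):

# Hypothetical usage sketch (Python 2); `preprocessing` and the file
# path are invented names, not confirmed by the source.
from preprocessing import preprocess

# The first run starts Stanford CoreNLP; later runs reuse the cached
# *.prp / *.tok files written next to the input file.
instances = preprocess('data/train.amr', START_SNLP=True, INPUT_AMR='amr')
print 'Preprocessed %d instances' % len(instances)
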
Example #2
def preprocess(input_file,START_SNLP=True,INPUT_AMR=True):
    '''Preprocess the input (AMR annotations or raw sentences): tokenize,
    POS-tag and NER-tag with Stanford CoreNLP, attach dependency parses,
    and optionally add SRL information. Returns the preprocessed instances.'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None
    
    if INPUT_AMR: # the input file is an AMR annotation file
        
        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            comments,amr_strings = readAMR(aligned_amr_file)
        else:
            comments,amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments] # sentences are stored under the 'snt' comment key
        tmp_sent_filename = amr_file+'.sent'
        if not os.path.exists(tmp_sent_filename): # write sentences into file
            _write_sentences(tmp_sent_filename,sentences)


        print >> log, "Start Stanford CoreNLP..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition with Stanford CoreNLP
        if START_SNLP: proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename): # write tokenized amr file
            _write_tok_amr(tok_amr_filename,amr_file,instances)
            
        SpanGraph.graphID = 0
        for i in range(len(instances)):

            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment,s2c_alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments'])
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr,alignment,s2c_alignment,instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)

    else:
        # the input file contains raw sentences
        tmp_sent_filename = input_file 

        print >> log, "Start Stanford CoreNLP ..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition with Stanford CoreNLP
        if START_SNLP: proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)
        
    # preprocess 2: dependency parsing 
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename+'.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = open(dep_filename,'w')
            output_dep.write(dep_result)
            output_dep.close()
            
        _add_dependency(instances,dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename+'.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)

            dep_result = open(dep_filename,'r').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))

        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        dep_filename = tok_sent_filename+'.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not found' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = open(dep_filename,'r').read()
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
            
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename+'.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename+'.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename+'.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
    else:
        pass
    
    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances,prop_filename,dep_filename,FIX_PROP_HEAD=True)
            else:
                _add_prop(instances,prop_filename,dep_filename)
            
        else:
            raise IOError('Semantic role labeling file %s not found!' % (prop_filename))

        
    return instances
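
Both versions repeat the same read-cache-or-parse step once per dependency parser backend. A hedged refactoring sketch of that pattern follows; the helper name _cached_dep_parse is invented here for illustration, and unlike the original clear/turbo/mate branches it also writes the cache file after parsing:

import codecs
import os

def _cached_dep_parse(tok_sent_filename, suffix, parser_factory):
    '''Return the dependency parse for a tokenized sentence file,
    reading a cached .dep file when it exists and writing one after
    parsing otherwise. Hypothetical helper, not part of the source.'''
    dep_filename = tok_sent_filename + suffix
    if os.path.exists(dep_filename):
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = codecs.open(dep_filename, 'r', encoding='utf-8').read()
    else:
        dep_result = parser_factory().parse(tok_sent_filename)
        output_dep = codecs.open(dep_filename, 'w', encoding='utf-8')
        output_dep.write(dep_result)
        output_dep.close()
    return dep_result

# e.g. the "turbo" branch would then collapse to:
# dep_result = _cached_dep_parse(tok_sent_filename, '.turbo.dep', TurboDepParser)
# _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)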