예제 #1
0
def main():
    """Generate word-alignment input files from a geoparse file.

    Command-line arguments:
      --input   geoparse input file, one query per line (required)
      --osent   path of the sentence output; ``<osent>.gin`` and
                ``<osent>.word`` are written next to it (required)
      --ologic  path of the logical-form output; ``<ologic>.gin`` and
                ``<ologic>.parse`` are written next to it (required)
      --output  optional path of a verbose sentence / rule listing
      --manual  optional file of whitespace-separated ``word target`` pairs
                that are stored into the module-level ``manual_align``

    Every output handle is closed in a ``finally`` clause, so the files are
    released and flushed even when processing a line raises.
    """
    parser = argparse.ArgumentParser(description="Run Geoparse Alignment Input Generator")
    parser.add_argument('--input',type=str,required=True,help="Input file of geoparse")
    parser.add_argument('--osent',type=str,required=True,help="Directory where sentence is outputed")
    parser.add_argument('--ologic',type=str,required=True,help="Directory where logical-form is outputed")
    parser.add_argument('--output',type=str,help="Directory where verbosed output is generated")
    parser.add_argument('--manual',type=str)
    args = parser.parse_args()

    #### Manual alignments: exactly two whitespace-separated tokens per line
    if args.manual:
        with open(args.manual) as fp:
            for line in fp:
                a, b = line.strip().split()
                manual_align[a] = b

    linecount = 0
    opened = []

    def open_tracked(path, mode):
        # Remember every handle so the finally clause can close all of them.
        handle = open(path, mode)
        opened.append(handle)
        return handle

    try:
        inp = open_tracked(args.input, "r")
        out_sent = open_tracked(args.osent, "w")
        out_sent_g = open_tracked(args.osent + ".gin", "w")
        out_log = open_tracked(args.ologic, "w")
        out_log_g = open_tracked(args.ologic + ".gin", "w")
        out_log_p = open_tracked(args.ologic + ".parse", "w")
        out_w = open_tracked(args.osent + ".word", "w")
        out = open_tracked(args.output, "w") if args.output else None

        #### For every well formed query in file extract the rule!
        for line in inp:
            line = line.strip()

            (sentence_node, query_node) = extract(line,0,"")[0][0].childs

            #### Sentence and node
            sentence = [node.label for node in sentence_node.childs]

            # Drop the trailing sentence terminator token, if there is one.
            if sentence[-1] in ("'.'", "?"):
                sentence = sentence[:-1]

            for word in sentence:
                words.add(word)

            #### logical rule extraction
            # Unseen keys are numbered 1, 2, 3, ... in order of first lookup.
            var_map = defaultdict(lambda: len(var_map)+1)
            query_node = construct_query_node(query_node,[])
            query_node = change_var_to_x(query_node,var_map)
            rules = transform_into_rule([],query_node,start=True)

            #### Printing
            sentence_text = " ".join(sentence) + "\n"
            out_sent.write(sentence_text)
            out_sent_g.write(sentence_text)

            logical_rule = [str_logical_rule(rule[1],rule[4]) for rule in rules]
            logical_rule_giza = [str_giza_in_rule(rule) for rule in rules]
            if len(logical_rule) != len(logical_rule_giza):
                sys.stderr.write("Rule size doesn't match %s %s\n" % (logical_rule_giza, logical_rule))

            out_log.write(" ".join(logical_rule) + "\n")
            out_log_g.write(" ".join(logical_rule_giza)+ "\n")
            # query_representation receives the inverted var_map (value -> key).
            out_log_p.write(query_representation(query_node,{value:key for key, value in var_map.items()},input_generator=True) +"\n")

            if out is not None:
                out.write(sentence_text)
                for rule in rules:
                    out.write(str_logical_rule(rule[1],rule[4]) + " ||| " + str_giza_in_rule(rule)+ "\n")
                out.write("------------------------------------\n")
            linecount += 1

        #### ADDITIONAL information for alignment
        #### Every word is aligned to itself (the whole listing is emitted ten times)
        for _ in range(0,10):
            for word in sorted(words):
                out_sent_g.write(word + "\n")
                out_log_g.write(word + "\n")
                out_w.write(word +"\n")

            for word1, word2 in manual_align.items():
                out_sent_g.write(word1 + "\n")
                out_log_g.write(word2 + "\n")
                out_w.write(word1 + "\n")

        #### Handle something like 'south dakota' so add alignment south -> south_dakota and dakota -> south_dakota
        for literals in many_literals:
            literals = literals.split(' ')
            joined = '_'.join(literals)
            for word in literals:
                out_sent_g.write(word + "\n")
                out_log_g.write(joined + "\n")
    finally:
        # Previously the .parse and .word handles were never closed at all.
        for handle in opened:
            handle.close()

    sys.stderr.write("Successfully extracting : %s pair(s).\n" % linecount)
예제 #2
0
def main():
    """Extract rules from aligned (geoquery, sentence, FOL, alignment) files.

    The four files named by ``args.input``, ``args.sent``, ``args.fol`` and
    ``args.align`` are read in lockstep, one line of each per training pair.
    For every pair the query tree is built, annotated with alignment and
    variable-bound information, and the rules composed from it are written
    to stdout as ``<index> ||| <rule>``.  With ``args.verbose`` the sentence
    and the annotated tree are printed as well.  The number of processed
    pairs is reported on stderr.

    The input files are opened in a ``with`` statement so they are closed
    even when processing a pair raises.
    """
    args = parse_argument()

    #### counter
    count = 0

    #### map for validation
    cycle_map = defaultdict(set)

    #### Stop words are loaded once here instead of re-reading the file per pair
    stopwords = set()
    if args.three_sync:
        with open(args.three_sync) as sw_list:
            for sw in sw_list:
                stopwords.add(sw.strip())

    #### file pointer
    with open(args.input,'r') as inp_file, \
            open(args.sent,'r') as sent_file, \
            open(args.fol,'r') as fol_file, \
            open(args.align,'r') as align_file:
        #### For input-sentence-fol-alignment
        for (index,(inp_line,sent_line, fol_line, align_line)) in enumerate(zip(inp_file,sent_file,fol_file,align_file)):
            inp_line = inp_line.strip()
            query = extract(inp_line)[0][0]
            (sentence_node, query_node) = query.childs # get the tree representation by extracting the geoquery

            #### strip, split!
            sent = sent_line.strip().split()
            fol = fol_line.strip().split()
            alignments = align_line.strip().split()

            #### creating mapping from F -> [w1,w2,...] where w1 w2 are words aligned to F(OL)
            s2l = defaultdict(set)
            for align in alignments:
                (sent_a, fol_a) = map(int, align.split('-'))
                s2l[fol[fol_a]].add(sent_a)

            #### Doing some node annotation, and bound node to which word and variable it is aligned to.
            # Unseen keys are numbered 1, 2, 3, ... in order of first lookup.
            var_map = defaultdict(lambda:len(var_map)+1)

            #### Query Node + preprocessing
            # The same list object is handed to both preprocessing helpers.
            node_ids = []
            query_node = construct_query_node(query_node,node_ids)
            query_node = relax_not_node(query_node,node_ids)
            # alter A->x1, B->x2, and so on
            query_node = change_var_to_x(query_node,var_map)
            # change '' to CONJUNCTION and \+ to NEGATION
            query_node = change_conj(query_node)

            #### Related to the Word alignment.
            aligned_word = set()
            # give information about which words that are aligned to node
            query_node = calculate_e(query_node,s2l,aligned_word)
            #### Related to the label.
            query_node = change_not(query_node)
            query_node = assign_head(query_node)
            # Mark NT with distinct symbols
            query_node = mark_nt(query_node)

            #### Related to the bound
            # calculating inside variable
            query_node = calculate_v(query_node)
            # calculating outside variable
            query_node = calculate_outside_v(query_node)
            # calculating bound variable
            query_node = calculate_bound(query_node)
            # PRUNE all variable node
            query_node = prune_node(query_node)

            # aligning unaligned word in the source side, it is aligned to the topmost node
            query_node,_ = align_unaligned_source(query_node,0,len(sent)-1,aligned_word)

            # frontier node
            query_node = mark_frontier_node(query_node,set())

            # change 'w1 w2' entity into w1_w2
            query_node = transform_multi_words_entity(query_node)

            if args.three_sync:
                # A fresh copy per pair mirrors the original per-iteration set,
                # in case consider_stopwords mutates its argument (not visible here).
                consider_stopwords(query_node,sent,set(stopwords))
            lexical_acq(query_node,sent,[],args.merge_unary)

            count += 1
            if args.verbose:
                sys.stdout.write("%s ||| %s\n" % (index, sent_line.strip()))
                print_node(query_node,stream=sys.stdout)
                print_node_list(query_node)

            rules = []
            compose_rule(rules, query_node, args.max_size)
            rules = [rename_non_terminal(rule,False) for rule in rules]
            rules = check_rules(rules,cycle_map)

            for rule in rules:
                if rule is not None:
                    sys.stdout.write("%s ||| %s\n" % (index, str(rule)))
            if args.verbose:
                sys.stdout.write('----------------------------------------------------------------------------\n')

    #### Printing stats
    sys.stderr.write("Finish extracting rule from %d pairs.\n" % (count))