def main():
    """Generate sentence / logical-form alignment inputs from a geoparse file.

    Reads geoparse trees from --input and writes, per well-formed query:
      * the plain sentence       -> --osent and --osent.gin
      * the logical-form rules   -> --ologic, --ologic.gin, --ologic.parse
      * the vocabulary           -> --osent.word
    plus an optional verbose side-by-side dump to --output.

    NOTE(review): relies on module-level globals (``manual_align``, ``words``,
    ``many_literals``) and helpers (``extract``, ``construct_query_node``,
    ``change_var_to_x``, ``transform_into_rule``, ``str_logical_rule``,
    ``str_giza_in_rule``, ``query_representation``) defined elsewhere in
    this file.
    """
    parser = argparse.ArgumentParser(description="Run Geoparse Alignment Input Generator")
    parser.add_argument('--input', type=str, required=True, help="Input file of geoparse")
    parser.add_argument('--osent', type=str, required=True, help="Directory where sentence is outputed")
    parser.add_argument('--ologic', type=str, required=True, help="Directory where logical-form is outputed")
    parser.add_argument('--output', type=str, help="Directory where verbosed output is generated")
    parser.add_argument('--manual', type=str)
    args = parser.parse_args()

    # Optional manual alignment file: one "source_word target_word" pair per line.
    if args.manual:
        with open(args.manual) as fp:
            for line in fp:
                a, b = line.strip().split()
                manual_align[a] = b

    linecount = 0
    inp = open(args.input, "r")
    out_sent = open(args.osent, "w")
    out_sent_g = open(args.osent + ".gin", "w")
    out_log = open(args.ologic, "w")
    out_log_g = open(args.ologic + ".gin", "w")
    out_log_p = open(args.ologic + ".parse", "w")
    out_w = open(args.osent + ".word", "w")
    out = open(args.output, "w") if args.output else None

    #### For every well formed query in file extract the rule!
    for line in inp:
        line = line.strip()
        (sentence_node, query_node) = extract(line, 0, "")[0][0].childs
        #### Sentence and node
        sentence = [node.label for node in sentence_node.childs]
        # Drop a trailing punctuation token, if any.
        if sentence[-1] == "'.'" or sentence[-1] == "?":
            sentence = sentence[:-1]
        for word in sentence:
            words.add(word)
        #### logical rule extraction
        var_map = defaultdict(lambda: len(var_map) + 1)
        query_node = construct_query_node(query_node, [])
        query_node = change_var_to_x(query_node, var_map)
        rules = transform_into_rule([], query_node, start=True)
        #### Printing
        out_sent.write(" ".join(sentence) + "\n")
        out_sent_g.write(" ".join(sentence) + "\n")
        (logical_rule, logical_rule_giza) = (
            [str_logical_rule(rule[1], rule[4]) for rule in rules],
            [str_giza_in_rule(rule) for rule in rules],
        )
        if len(logical_rule) != len(logical_rule_giza):
            # BUGFIX(portability): py2-only ``print >> sys.stderr`` replaced
            # with an equivalent write so the module is valid py3 as well.
            sys.stderr.write("Rule size doesn't match %s %s\n"
                             % (logical_rule_giza, logical_rule))
        out_log.write(" ".join(logical_rule) + "\n")
        out_log_g.write(" ".join(logical_rule_giza) + "\n")
        out_log_p.write(query_representation(query_node,
                                             {value: key for key, value in var_map.items()},
                                             input_generator=True) + "\n")
        if args.output:
            out.write(" ".join(sentence) + "\n")
            for rule in rules:
                out.write(str_logical_rule(rule[1], rule[4]) + " ||| " + str_giza_in_rule(rule) + "\n")
            out.write("------------------------------------\n")
        linecount += 1

    inp.close()
    out_sent.close()
    out_log.close()
    if args.output:
        out.close()

    #### ADDITIONAL information for alignment
    #### Every word is aligned to itself (repeated 10x to weight the prior)
    for _ in range(0, 10):
        for word in sorted(words):
            out_sent_g.write(word + "\n")
            out_log_g.write(word + "\n")
            out_w.write(word + "\n")
    for word1, word2 in manual_align.items():
        out_sent_g.write(word1 + "\n")
        out_log_g.write(word2 + "\n")
        out_w.write(word1 + "\n")
    #### Handle something like 'south dakota' so add alignment
    #### south -> south_dakota and dakota -> south_dakota
    for literals in many_literals:
        literals = literals.split(' ')
        for word in literals:
            out_sent_g.write(word + "\n")
            out_log_g.write('_'.join(literals) + "\n")
    out_sent_g.close()
    out_log_g.close()
    # BUGFIX: these two handles were opened but never closed in the original,
    # risking buffered output being lost.
    out_log_p.close()
    out_w.close()
    sys.stderr.write("Successfully extracting : %d pair(s).\n" % linecount)
def main():
    """Extract synchronous grammar rules from parallel
    (geoparse input, sentence, FOL, word-alignment) files.

    For every aligned 4-tuple of lines, builds an annotated query tree,
    acquires lexical rules, composes larger rules up to ``args.max_size``,
    and prints each surviving rule as ``<index> ||| <rule>`` to stdout.

    NOTE(review): this file appears to define two functions named ``main``;
    if both live in the same module, this later definition shadows the
    earlier one — confirm whether that is intentional.

    NOTE(review): relies on helpers defined elsewhere in this file
    (``parse_argument``, ``extract``, ``construct_query_node``,
    ``compose_rule``, ``check_rules``, ...).
    """
    args = parse_argument()
    #### file pointer
    inp_file = open(args.input, 'r')
    sent_file = open(args.sent, 'r')
    fol_file = open(args.fol, 'r')
    align_file = open(args.align, 'r')
    #### counter
    count = 0
    #### map for validation
    cycle_map = defaultdict(lambda: set())
    #### For input-sentence-fol-alignment
    for (index, (inp_line, sent_line, fol_line, align_line)) in \
            enumerate(zip(inp_file, sent_file, fol_file, align_file)):
        inp_line = inp_line.strip()
        query = extract(inp_line)[0][0]
        # get the tree representation by extracting the geoquery
        (sentence_node, query_node) = query.childs
        #### strip, split!
        (sent, fol, align_line) = map(lambda x: x.strip().split(),
                                      (sent_line, fol_line, align_line))
        #### creating mapping from F -> [w1,w2,...] where w1 w2 are words
        #### aligned to F(OL)
        s2l = defaultdict(lambda: set())
        for align in align_line:
            (sent_a, fol_a) = map(int, align.split('-'))
            # CLEANUP: the original seeded the key with an explicit
            # ``not in`` check — redundant with defaultdict(set).
            s2l[fol[fol_a]].add(sent_a)
        #### Node annotation: bind nodes to their aligned words/variables.
        var_map = defaultdict(lambda: len(var_map) + 1)
        #### Query Node + preprocessing
        node_ids = []  # renamed from ``id`` — it shadowed the builtin
        query_node = construct_query_node(query_node, node_ids)
        query_node = relax_not_node(query_node, node_ids)
        # alter A->x1, B->x2, and so on
        query_node = change_var_to_x(query_node, var_map)
        # change '' to CONJUNCTION and \+ to NEGATION
        query_node = change_conj(query_node)
        #### Related to the Word alignment.
        aligned_word = set()
        # give information about which words are aligned to each node
        query_node = calculate_e(query_node, s2l, aligned_word)
        #### Related to the label.
        query_node = change_not(query_node)
        query_node = assign_head(query_node)
        # Mark NT with distinct symbols
        query_node = mark_nt(query_node)
        #### Related to the bound
        query_node = calculate_v(query_node)          # inside variables
        query_node = calculate_outside_v(query_node)  # outside variables
        query_node = calculate_bound(query_node)      # bound variables
        # PRUNE all variable nodes
        query_node = prune_node(query_node)
        # unaligned source words are attached to the topmost node
        query_node, _ = align_unaligned_source(query_node, 0, len(sent) - 1, aligned_word)
        # frontier node
        query_node = mark_frontier_node(query_node, set())
        # change 'w1 w2' entity into w1_w2
        query_node = transform_multi_words_entity(query_node)
        if args.three_sync:
            stopwords = set()
            with open(args.three_sync) as sw_list:
                for sw in sw_list:
                    stopwords.add(sw.strip())
            consider_stopwords(query_node, sent, stopwords)
        lexical_acq(query_node, sent, [], args.merge_unary)
        count += 1
        if args.verbose:
            # BUGFIX(portability): py2-only ``print`` statements replaced
            # with equivalent stdout writes.
            sys.stdout.write("%s ||| %s\n" % (index, sent_line.strip()))
            print_node(query_node, stream=sys.stdout)
            print_node_list(query_node)
        rules = []
        compose_rule(rules, query_node, args.max_size)
        # CLEANUP: explicit list comprehension instead of a (py3-lazy) map.
        rules = [rename_non_terminal(x, False) for x in rules]
        rules = check_rules(rules, cycle_map)
        for r in rules:
            if r is not None:  # was ``r != None``
                sys.stdout.write("%s ||| %s\n" % (index, str(r)))
        if args.verbose:
            sys.stdout.write('----------------------------------------------------------------------------\n')
    #### Closing all files
    # BUGFIX: the original used map() purely for its side effect; under
    # py3 map is lazy, so the files were never actually closed.
    for f in (inp_file, sent_file, fol_file, align_file):
        f.close()
    #### Printing stats
    sys.stderr.write("Finish extracting rule from %d pairs.\n" % count)