def quantifier_x_VP(self, sent_ccgtree):
    """ add ``quant (N, VP)'' to logical_sents

    Matches the structure::

        most             A          regular VP
        NP/N             N
        --------------              ----------------
        NP_1                        S\\NP = VP

    sent_ccgtree: a CCG tree whose left-most leaf is expected to be one
    of QUANTIFIERS; on failure the sentence is printed and the process
    exits (same behavior as the original assertion path).
    """
    node_most = sent_ccgtree.getLeftMostLeaf(sent_ccgtree.root)
    # Validate explicitly instead of `assert`, which is silently stripped
    # when Python runs with -O; behavior on failure is unchanged.
    if node_most.wholeStr not in QUANTIFIERS:
        print("something wrong adding sent to logicalSys:")
        sent_ccgtree.printSent()
        exit()
    quant = node_most.wholeStr

    N, VP = None, None
    if node_most.sisters:
        N = node_most.sisters[0]          # restrictor noun
    if node_most.parent.sisters:
        VP = node_most.parent.sisters[0]  # candidate verb phrase S\NP

    if VP and VP.cat.typeWOfeats == r"S\NP":
        if N:
            log_sent = LogicalSentence(quant, N, VP)
            self.add_logical_sent_helper(log_sent)
        else:
            eprint('\ndid not add to logical sys 2.1:')
            sent_ccgtree.printSent()
    else:
        eprint('\ndid not add to logical sys 2.2:')
        sent_ccgtree.printSent()
def main():
    """Entry point: ``-s`` builds and saves the cache, ``-t`` runs the tests.

    Flags are checked in order, so ``-s`` wins when both are present.
    """
    for flag, action in (('-s', save_cache), ('-t', test)):
        if flag in sys.argv:
            action()
            return
    eprint('required arg: -s/-t')
def subst(line, quantifier, fh_log, s_pattern, sent_id, verbose=False):
    """ substitute: word = most/least

    Rewrites an ``at <quantifier> ...`` phrase in *line* ('most' -> "no",
    'least' -> "some") using the pre-compiled pattern ``pat[quantifier]``,
    and logs the replacement (CSV row via *s_pattern*) to *fh_log*.
    Returns the (possibly rewritten) line; the line is returned unchanged
    when the pattern does not match.
    """
    # find the index of `at' in line.split()
    idx_at = -1
    line_list = line.split()
    # Scan adjacent word pairs. BUG FIX: the original broke out at
    # idx == len-2 *before* testing that pair, so a trailing
    # "at <quantifier>" was never indexed.
    for idx in range(len(line_list) - 1):
        if line_list[idx].lower() == 'at' and \
                line_list[idx + 1].lower() == quantifier:  # 'most':
            idx_at = idx
            break

    # replace `at most 10' with `no'
    m = pat[quantifier].search(line)
    if m:
        original = m.group(0)
    else:
        return line
    eprint('original:', original)
    if quantifier == 'most':
        after = "no"
    elif quantifier == 'least':
        after = "some"
    line = pat[quantifier].sub(after, line)

    # log: sentId, before, after, idx of `at', sentence length
    fh_log.write(
        s_pattern.format(str(sent_id), original, after, str(idx_at),
                         len(line_list)))
    return line
def main():
    """CLI entry: expects <filename> <parser> <filename_log> arguments and
    forwards them to convert2transccg; prints usage otherwise."""
    if len(sys.argv) < 4:
        eprint(message)
        return
    fn, which_parser, fn_log = sys.argv[1:4]
    convert2transccg(fn, which_parser, fn_log)
def quantifier_x_be_y(self, sent_ccgtree): """ add ``quant (N, NP_2)'' to logical_sents """ # TODO how to know if this "BE" is the main verb # "BE" can be at multiple places # match structure: # most A are B # NP/N N (S\NP)/NP NP_2 = X # -------------- ---------------- # NP_1 S\NP = VP # node_most = sent_ccgtree.getLeftMostLeaf(sent_ccgtree.root) try: assert node_most.wholeStr in QUANTIFIERS except AssertionError: print("something wrong adding sent to logicalSys:") sent_ccgtree.printSent() exit() quant = node_most.wholeStr N, X, VP = None, None, None if node_most.sisters: N = node_most.sisters[0] if node_most.parent.sisters: VP = node_most.parent.sisters[0] if VP and (VP.cat.typeWOfeats == r"S\NP") and \ VP.children and (VP.children[0].wholeStr == "BE"): # case a: X = plural nouns # case b: X = a person? we only need ``person''? # case c: Every person is great. be: (S\NP)/(S\NP), great: S\NP # case d: Every one is good at dancing. same as above # case e: Every one is about to dance. same as above # case f: Several committee members are from Scandinavia. PP # case b if len(VP.children[1].children) == 2 and \ VP.children[1].children[0].wholeStr in ["A", "AN"]: X = VP.children[1].children[1] # case a,c,d,e,f elif VP.children[1].cat.typeWOfeats in {"NP", r"S\NP", "PP"}: X = VP.children[1] else: X = VP.children[1] if N and X: log_sent = LogicalSentence(quant, N, X) self.add_logical_sent_helper(log_sent) else: eprint('\ndid not add to logical sys 1:') sent_ccgtree.printSent() else: # "BE" is not the main verb! self.quantifier_x_VP(sent_ccgtree)
def main(): USAGE = """ generate inferences for SICK usage: python generate.py chunk_id chunk_id={trial, 1-9} """ if len(sys.argv) != 2: print(USAGE) exit() elif sys.argv[1] not in ['trial'] + list(range(1, 10)): print(USAGE) exit() start_time = time.time() generate(sys.argv[1]) eprint("\n\n--- %s seconds ---" % (time.time() - start_time))
def preprocess(fn):
    """ produce a clean file named: test.tok.clean
    and log file: test.tok.preprocess.log

    Each non-empty line of *fn* is stripped, passed through
    preprocess_line, and written to ``<fn>.clean``; substitutions are
    logged as CSV rows (sentId,before,after,idx,len_sent) in
    ``<fn>.preprocess.log``.
    """
    sent_id = -1
    s_pattern = "{},{},{},{},{}\n"
    # p2a = P2A_transformer(spacy.load('en'))
    # corenlp = StanfordCoreNLP('http://localhost', port=9000, lang='en')
    eprint('\npreprocessing...')
    # Context managers so the log/clean handles are closed even if
    # preprocess_line raises (the original leaked them on error).
    with open(fn + '.preprocess.log', 'w') as fh_log, \
            open(fn + '.clean', 'w') as fh_clean, \
            open(fn) as f:
        fh_log.write("sentId,before,after,idx,len_sent\n")
        for line in f:
            line = line.strip()
            if line == "":
                continue  # skip blanks without consuming a sentence id
            sent_id += 1
            # print('\npreprocessing:', sent_id)
            # line = line.lower()
            eprint('\nbefore :', line)
            # with passive to active transformation:
            # line = preprocess_line(line, fh_log, s_pattern, sent_id, p2a, corenlp)
            # no passive to active transformation
            line = preprocess_line(line, fh_log, s_pattern, sent_id)
            eprint('after :', line)
            # write to clean file
            fh_clean.write(line)
            fh_clean.write('\n')
    eprint('...done!\n')
def save_cache():
    """ for each noun in train[:500] + trials
    save the 5 most frequent hypers and hypos """
    sick_ids = sick_ans.ids_trial_E[:10]  # ids_trial_E_C
    # experimental id lists; only the last assignment below takes effect
    sick_ids = [
        211, 1412, 1495, 2557, 2829, 2898, 3250, 4015, 4066, 4135, 4342,
        4360, 4661, 4916, 5030, 5113, 5498, 5806, 6756
    ]
    # 4916=guitar, musical instrument
    # 4006 throw, hurl
    # 6756 large big
    sick_ids = [5498, 5806, 4916, 4066, 6756]  # got all these
    trees = CCGtrees("sick_uniq.raw.tok.preprocess.log")
    trees.readEasyccgStr("sick_uniq.raw.easyccg.parsed.txt")
    for id_to_solve in sick_ids:
        eprint("-" * 50)
        print("sick", id_to_solve)
        # NOTE(review): P is built from H_idx and H from P_idx here, the
        # opposite of generate() -- confirm the swap is intentional
        P = trees.build_one_tree(H_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=False)
        H = trees.build_one_tree(P_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=False)
        eprint("P:", end="")
        P.printSent_raw_no_pol(stream=sys.stderr)
        eprint("H:", end="")
        H.printSent_raw_no_pol(stream=sys.stderr)
        k = Knowledge()
        k.update_word_lists(P)  # nouns, subSecAdj, etc.
        k.update_word_lists(H)
        # k.update_modifier()
        assign_all_relations_wordnet(k)
def generate(chunk):
    """Generate candidate inferences and contradictions for one SICK chunk.

    chunk: 'trial' or the strings '1'..'9'; selects a 500-id slice of the
    SICK training ids.  For each id it builds CCG trees for premise P and
    hypothesis H, builds the knowledge base, polarizes the premises, runs
    the replacement search, and prints tab-separated
    (id, P, inference, label) rows to stdout.
    """
    n_rep = 3  # max replacement depth passed to solve_ids
    trees = CCGtrees("sick_uniq.raw.tok.preprocess.log")
    trees.readEasyccgStr("sick_uniq.raw.easyccg.parsed.txt")

    # NOTE(review): an unrecognized chunk value leaves sick_ids unbound
    # and the loop below raises NameError -- callers must validate first
    if chunk == "trial": sick_ids = sick_ans.ids_trial_E_C
    # sick_ids = [1237]
    if chunk == "1": sick_ids = sick_ans.ids_train[:500]
    elif chunk == "2": sick_ids = sick_ans.ids_train[500:1000]
    elif chunk == "3": sick_ids = sick_ans.ids_train[1000:1500]
    elif chunk == "4": sick_ids = sick_ans.ids_train[1500:2000]
    elif chunk == "5": sick_ids = sick_ans.ids_train[2000:2500]
    elif chunk == "6": sick_ids = sick_ans.ids_train[2500:3000]
    elif chunk == "7": sick_ids = sick_ans.ids_train[3000:3500]
    elif chunk == "8": sick_ids = sick_ans.ids_train[3500:4000]
    elif chunk == "9": sick_ids = sick_ans.ids_train[4000:4500]
    # elif chunk == 10: sick_ids = sick_ans.ids_train[4500:]

    for id_to_solve in sick_ids:
        # build premise and hypothesis trees (lemmatized for matching)
        P = trees.build_one_tree(P_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=True)
        H = trees.build_one_tree(H_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=True)

        # -----------------------------
        # passive to active
        # my_act = p2a.pass2act(P.printSent_raw_no_pol(stream=sys.stdout, verbose=False))
        # my_act = my_act.rstrip('. ')
        # eprint('original:', P.printSent_raw_no_pol(stream=sys.stdout, verbose=False))
        # eprint('active:', my_act)

        # -----------------------------
        # initialize s
        s = SentenceBase(gen_inf=True)

        # -----------------------------
        # build knowledge
        k = Knowledge()
        k.build_manual_for_sick()  # TODO
        k.build_quantifier(
            all_quant=False)  # all = every = each < some = a = an, etc.
        k.build_morph_tense()  # man = men, etc.

        # fix trees and update knowledge k, sentBase s
        # P
        P.fixQuantifier()
        P.fixNot()
        k.update_sent_pattern(P)  # patterns like: every X is NP
        k.update_word_lists(P)  # nouns, subSecAdj, etc.
        s.add_P_str(P)
        # H
        H.fixQuantifier()
        H.fixNot()
        k.update_word_lists(H)  # need to find nouns, subSecAdjs, etc. in H
        s.add_H_str_there_be(H)  # transform ``there be'' in H

        k.update_modifier()  # adj + n < n, n + RC/PP < n, v + PP < v
        s.k = k

        # -----------------------------
        # polarize
        eprint("\n*** polarizing ***\n")
        for p in s.Ps_ccgtree:  # Ps
            try:
                p.mark()
                p.polarize()
                p.getImpSign()
            except (ErrorCCGtree, ErrorCompareSemCat) as e:  # , AssertionError
                # polarization failed for this premise; keep going
                eprint("cannot polarize:", p.wholeStr)
                eprint("error:", e)
            except AssertionError as e:
                # unexpected internal inconsistency: print and abort
                eprint("assertion error!")
                eprint(e)
                p.printSent()
                exit()
            eprint("P: ", end="")
            p.printSent_raw_no_pol(stream=sys.stderr)
        eprint("H: ", end="")
        s.H_ccgtree.printSent_raw_no_pol(stream=sys.stderr)

        # -----------------------------
        # replacement search; skip this id on failure
        eprint("\n*** replacement ***\n")
        try:
            ans = s.solve_ids(depth_max=n_rep, fracas_id=None)
        except (ErrorCompareSemCat, ErrorCCGtree):
            continue

        eprint("\n--- tried ** {} ** inferences:".format(len(s.inferences)))
        # for inf in sorted(s.inferences): print(inf)
        for inf in s.inferences_tree:
            print("{}\t{}\t{}\t{}".format(
                id_to_solve, P.printSent_no_pol(stream=sys.stdout),
                inf.printSent_no_pol(stream=sys.stdout), "ENTAILMENT"))
        eprint("\n--- tried ** {} ** contradictions:".format(
            len(s.contras_str)))
        for contra in sorted(s.contras_str):
            print("{}\t{}\t{}\t{}".format(
                id_to_solve, P.printSent_no_pol(stream=sys.stdout),
                contra.lower(), "CONTRADICTION"))
def convert2transccg(filename, parser, filename_log):
    """
    input:
    - easyccg output (tmp.easyccg.parsed.txt) or
    - candc output (tmp.candc.parsed.xml)
    input is read into my CCGtree format.
    then traverse the tree and print xml to stdout
    return # of sents not polarized
    """
    trees = CCGtrees(filename_log)

    # read parser output and the matching raw tokenized sentences
    if parser == 'easyccg':
        trees.readEasyccgStr(filename)  #('tmp.easyccg.parsed.txt')
        raw_sentences = open(
            filename.replace(".easyccg.parsed.txt", "") +
            ".tok.clean").readlines()
    elif parser == 'candc':
        trees.readCandCxml(filename)  #('tmp.candc.parsed.xml')
        raw_sentences = open(
            filename.replace(".candc.parsed.xml", "") +
            ".tok.clean").readlines()
    elif parser == 'depccg':  # same as easyccg
        trees.readEasyccgStr(filename)
        raw_sentences = open(
            filename.replace(".depccg.parsed.txt", "") +
            ".tok.clean").readlines()
    else:
        eprint('parser can only be: easyccg, candc, depccg')
        exit()

    # ----------------------------------
    # mark and polarize
    N_polar = 0     # successfully polarized
    N_unpolar = 0   # parsed but could not polarize
    N_unparsed = 0  # parser failed on the sentence
    fh_polarized_trees = open(filename + ".polarized", "w")
    # sent_parsed = True
    # idx_cant_polarize = {}
    for idx in range(len(raw_sentences)):
        # build the tree here
        t = trees.build_one_tree(idx, parser, use_lemma=False)
        # eprint(trees.easyccg_str.get(idx, None))
        # print(t)
        # return
        if t in ["failed_to_parse", "parse_exception"]:
            # easyccg failed to parse the sent
            eprint('easyccg failed to parse the sent')
            eprint(raw_sentences[idx])
            # write the raw sentence with a neutral '=' marker per token
            sent = raw_sentences[idx].replace(" ", "= ").replace(
                "\n", "=\n")  # = for every token
            fh_polarized_trees.write(sent)
            N_unparsed += 1
        else:  # t is a tree
            # fix tree
            t.fixQuantifier()
            try:
                t.fixNot()
            except AttributeError:
                pass
            if parser in ['candc']:
                t.fixRC()  # only fix RC for candc
            try:
                t.mark()
                t.polarize()
                t.getImpSign()
                N_polar += 1
            except (ErrorCompareSemCat, ErrorCCGtree, AssertionError,
                    AttributeError, ErrorCat) as e:
                eprint(e)
                eprint('-- cannot polarize sent: ', end='')
                N_unpolar += 1
                # t.printSent(stream=sys.stderr)
            # the (possibly partially polarized) tree is written either way
            fh_polarized_trees.write(t.printSent_raw(stream=sys.stderr))
            fh_polarized_trees.write("\n")
        eprint()
    fh_polarized_trees.close()
    eprint("\n\n===========\npolarized {} trees\n"
           "unable to parse {} trees\n"
           "unable to polarize {} trees".format(N_polar, N_unparsed,
                                                N_unpolar))

    # ----------------------------------
    # emit transccg XML to stdout
    print(
        """<?xml version='1.0' encoding='UTF-8'?>\n<root>\n<document>\n<sentences>"""
    )
    for idx, t in trees.trees.items():
        if t in ["failed_to_parse", "parse_exception"]:
            continue  # unparsed sentences are skipped in the XML
        print("<sentence>")
        # ----------------------------
        # print tokens
        print("<tokens>")
        counter = 0
        for token in t.leafNodes:
            # depth,cat,chunk,entity,lemma,pos,span,start,word
            token_id = "t" + str(idx) + '_' + str(counter)
            ETtype = token.cat.semCat.__str__()
            polarity = getPolarityAsArrow(token)
            print(
                str_token.format(token.start, token.span, token.pos,
                                 token.chunk, token.entity,
                                 token.cat.originalType, token_id,
                                 token.word, token.lemma, ETtype, polarity))
            counter += 1
        print("</tokens>")
        # ----------------------------
        # print nodes
        # <ccg root="s0_sp0" id="s0_ccg0">
        print('<ccg root="s{}_sp0" id="s{}_ccg0">'.format(str(idx), str(idx)))
        # tree
        # in-order traversal of tree to get span_id of non term node
        traverse2get_span_id(t.root, -1, idx)
        # in order traversal of tree
        traverse(t.root, 0, idx)
        print("</ccg>")
        print("</sentence>")
    print("""</sentences>\n</document>\n</root>""")
def compute_DET_rule(self): """ apply DET_rule on all logical_sents return inferences to be added to the fringe! DET x y All x z ------------------------- DET: several cases based on pos of y and z DET x (y ^ z) all possibilities of y ^ z are stored in a list, returned from conjoin_two_terms(y, z) """ # step 1: find 'all/each/every x z' to_loop = [ self.logical_sents['ALL']['log_sents'], self.logical_sents['EACH']['log_sents'], self.logical_sents['EVERY']['log_sents'] ] for lst in to_loop: for log_sent in lst: # log_sent: All x z x = log_sent.term1 # NonTermNode z = log_sent.term2 # x and its equivalents, a set of str x_set = set( [i.ccgtree.root for i in self.KB.frags[x.wholeStr].equal] + [x]) x_set_str = set([i.wholeStr for i in x_set]) # print('\n\n') # print(x_set_str) # step 2: find DET x y for quant in QUANTIFIERS: # see if x and its equivalents appear as term1 in logSent for this quant if not any([ i in self.logical_sents[quant]['X'] for i in x_set_str ]): continue # then skip the following for log_sent2 in self.logical_sents[quant]['log_sents']: # log_sent2: DET x y # if log_sent2.term1 is same as x, or the equavalents of x if log_sent2.term1.wholeStr in x_set_str: # found y!! DET rule here!! # order is taken care of in the function y = log_sent2.term2 y_and_z_list = self.conjoin_two_terms( y, z) # a list of NonTermNode for y_and_z in y_and_z_list: print("\n\nquant", quant) print("x", x.wholeStr) print("y_and_z", y_and_z.wholeStr, y_and_z.cat.typeWOfeats) # apart from adding ``quant x y_and_z'', # we also need to add any quantifier greater or equal to quant # e.g. every x y_and_z = all x y_and_z = each x y_and_z if quant in QUANT_GEQ: new_quants_str = QUANT_GEQ[quant] else: new_quants_str = [quant] ### !!! KEY STEP !!! 
### generate inf for new_quant_str in new_quants_str: for new_x in x_set: inf_log_sent = LogicalSentence( quantifier=new_quant_str, term1=new_x, term2=y_and_z) inf_nat_sent = inf_log_sent.to_nat_sent( ) eprint("# DET_rule add to fringe: ", end="") inf_nat_sent.printSent(sys.stderr) yield inf_nat_sent
def find_head_noun(node_det, ccg_tree):
    """ find the head noun of a determiner in a tree

    - configuration 1: the brown bulldog who every brown mammal moved-towards

                 bulldog        RC
                 ---------------
           brown N              N\\N
           ------------------------
           N/N   N
      the  --------------
      NP/N N = sister

    - configuration 2: the brown dog
    - configuration 3: the dog
    - configuration 4: a good poodle who was happy waltzed

    Strategy: walk all leaves under the determiner's sister (the
    NP-internal material) and return the *highest* leaf of category N,
    i.e. the N leaf with the smallest depth.  Exits the process when no
    N leaf is found (same behavior as before).
    """
    # BUG FIX: the original compared every N leaf's depth against a
    # depth_min that was initialized to -1 and never updated, so the
    # condition was always true and the result depended on traversal
    # order.  We now keep the shallowest N leaf, matching the stated
    # "highest leafnode" intent.
    head_noun = None
    nodes_under_NP = [node_det.sisters[0]]
    while nodes_under_NP:
        poped_node = nodes_under_NP.pop()
        if poped_node.children:  # non term node: descend
            nodes_under_NP.extend(poped_node.children)
        else:  # leaf node: candidate head noun
            if poped_node.cat.typeWOfeats == "N" and \
                    (head_noun is None or poped_node.depth < head_noun.depth):
                head_noun = poped_node
    if head_noun is None:
        eprint("\n!! something wrong finding head noun")
        eprint(ccg_tree.tree_str())
        ccg_tree.printTree(stream=sys.stderr)
        eprint("\n")
        exit()
    return head_noun