def test():
    """Smoke-test ``LogicalSystem`` on the FraCaS trees at indices 40-42.

    Loads the section-1 FraCaS parses, seeds a ``Knowledge`` object with the
    quantifier and morphology/tense relations, feeds the selected trees to a
    ``LogicalSystem`` (updating the knowledge from each tree as well), then
    prints the knowledge, computes the DET rule and prints the system.
    """
    trees = CCGtrees("fracas_1_80.raw.tok.preprocess.log")
    trees.readEasyccgStr("fracas_1_80.raw.easyccg.parsed.txt")

    from infer import Knowledge
    knowledge = Knowledge()
    knowledge.build_quantifier()   # all = every = each < some = a = an, etc.
    knowledge.build_morph_tense()  # man = men, etc.

    logic = LogicalSystem(knowledge)
    logic.manual()

    # Only the trees at these indices are exercised; an index is skipped
    # when it lies beyond the number of trees that were read.
    # Other index triples tried before: [64, 65, 66] and [44, 45, 46].
    n_trees = len(trees.trees)
    for idx in (40, 41, 42):
        if idx >= n_trees:
            continue
        tree = trees.trees[idx]
        tree.fixQuantifier()
        tree.fixNot()
        logic.add_logical_sent(tree)
        knowledge.update_sent_pattern(tree)  # patterns like: every X is NP
        knowledge.update_word_lists(tree)    # nouns, subSecAdj, etc.

    knowledge.print_knowledge()
    logic.compute_DET_rule()
    print(logic)
def solveSick(self):
    """Solve the SICK problems selected by ``self.sick_id`` and print the run time.

    ``self.sick_id[0]`` picks the problem set:

    * ``'trial'``   -> ``sick_ans.ids_trial``
    * ``'wrongs'``  -> ``sick_ans.ids_wrongs_U_test``
    * ``'trial_c'`` -> ``sick_ans.ids_trial_C``
    * ``'trial_e'`` -> ``sick_ans.ids_trial_E``
    * ``'train'``   -> the first 1000 entries of ``sick_ans.ids_train``
    * ``'test'``    -> a copy of ``sick_ans.ids_test``
    * ``'onediff'`` -> entries of ``sick_ans.ids_train`` that also occur in
      ``sick_ans.ids_one_diff``
    * ``'all'``     -> every key of ``self.sick2uniq``, sorted
    * anything else -> each element of ``self.sick_id`` is converted with
      ``int`` and kept when it is a key of ``self.sick2uniq``

    The selected ids and the depccg-parsed trees are then passed to
    ``self.solveSick_helper``.

    Raises:
        ValueError: in the "anything else" case, when an element of
            ``self.sick_id`` cannot be converted with ``int``.
    """
    start_time = time.time()

    trees = CCGtrees(fn_log="sick_uniq.raw.tok.preprocess.log")
    # Parses from depccg are used; the easyccg parses of the same sentences
    # are in "sick_uniq.raw.easyccg.parsed.txt".
    trees.readEasyccgStr("sick_uniq.raw.depccg.parsed.txt")

    selector = self.sick_id[0]

    if selector == 'trial':
        # The former assignment of ``sick_ans.ids_trial_E_C`` here was
        # overwritten on the very next statement, so only the effective
        # value is kept.
        sick_ids = sick_ans.ids_trial
    elif selector == 'wrongs':
        sick_ids = sick_ans.ids_wrongs_U_test
    elif selector == 'trial_c':
        sick_ids = sick_ans.ids_trial_C
    elif selector == 'trial_e':
        sick_ids = sick_ans.ids_trial_E
    elif selector == 'train':
        # test on train data
        sick_ids = sick_ans.ids_train[:1000]
    elif selector == 'test':
        # test on test
        sick_ids = sick_ans.ids_test[:]
    elif selector == 'onediff':
        # test on problems with a one-word difference; the set makes the
        # membership test O(1) while the order of ids_train is kept
        ids_one_diff = set(sick_ans.ids_one_diff)
        sick_ids = [i for i in sick_ans.ids_train if i in ids_one_diff]
    elif selector != 'all':
        # one or more explicitly listed problems; unknown ids are dropped
        requested = (int(i) for i in self.sick_id)
        sick_ids = [i for i in requested if i in self.sick2uniq]
    else:
        # all problems
        sick_ids = sorted(self.sick2uniq)

    self.solveSick_helper(sick_ids, trees)

    print("\n\n--- %s seconds ---" % (time.time() - start_time))
def solve(self):
    """Run ``self.solve_helper`` on problems 0-9 of the MED adjectives data.

    Reads the easyccg parses of ``med_adjectives``, hands the ids ``0..9``
    together with the trees to ``self.solve_helper`` and prints the elapsed
    wall-clock time.
    """
    began = time.time()

    parsed = CCGtrees(fn_log="med_adjectives.tok.preprocess.log")
    # easyccg output is used here (an alternative parse file considered
    # before was "sick_uniq.raw.depccg.parsed.txt")
    parsed.readEasyccgStr("med_adjectives.easyccg.parsed.txt")

    problem_ids = list(range(10))
    self.solve_helper(problem_ids, parsed)

    elapsed = time.time() - began
    print("\n\n--- %s seconds ---" % elapsed)
def solveFracas(fracas_id, n_rep, print_k, sections):
    """Solve one FraCaS problem, or all defined ones, and print the run time.

    Args:
        fracas_id: id of the problem to solve, or ``'all'`` to solve every
            problem in the index that is not listed as undefined.
        n_rep: forwarded unchanged to ``solveFracas_one``.
        print_k: forwarded unchanged to ``solveFracas_one``.
        sections: ``"1"`` for the ``fracas_1_80`` files or ``"56"`` for the
            ``fracas_sec_5_6`` files; any other value prints a message and
            exits.

    In ``'all'`` mode the predictions, the gold answers and the result of
    ``accuracy(ids, y_pred, y_true)`` are printed.
    NOTE(review): predictions follow the sorted keys of ``IDX_P`` while gold
    answers follow the sorted keys of ``ANSWERS``; the two lists line up only
    if both mappings have the same keys — confirm in ``fracas_index``.
    """
    began = time.time()

    # Pick the parse files and the matching index tables for the section.
    if sections == "1":
        trees = CCGtrees("fracas_1_80.raw.tok.preprocess.log")
        trees.readEasyccgStr("fracas_1_80.raw.easyccg.parsed.txt")
        from fracas_index import IDX_P, IDX_H, UNDEFINED, ANSWERS
    elif sections == "56":
        trees = CCGtrees("fracas_sec_5_6.raw.tok.preprocess.log")
        trees.readEasyccgStr("fracas_sec_5_6.raw.easyccg.parsed.txt")
        from fracas_index import (
            IDX_P_SEC_5_6 as IDX_P,
            IDX_H_SEC_5_6 as IDX_H,
            UNDEFINED_SEC_5_6 as UNDEFINED,
            ANSWERS_SEC_5_6 as ANSWERS,
        )
    else:
        print("wrong section! -sec can only be: 1, 56")
        exit()

    if fracas_id == 'all':
        # all problems, skipping the undefined ones
        y_pred = [
            solveFracas_one(pid, trees, n_rep, print_k,
                            IDX_P, IDX_H, UNDEFINED, ANSWERS)
            for pid in sorted(IDX_P)
            if pid not in UNDEFINED
        ]

        ids = [pid for pid in sorted(ANSWERS) if pid not in UNDEFINED]
        y_true = [ANSWERS[pid] for pid in ids]

        print('\ny_pred:', y_pred)
        print('y_true:', y_true)
        print(accuracy(ids, y_pred, y_true))
    else:
        # a single problem
        solveFracas_one(fracas_id, trees, n_rep, print_k,
                        IDX_P, IDX_H, UNDEFINED, ANSWERS)

    elapsed = time.time() - began
    print("\n\n--- %s seconds ---" % elapsed)
def save_cache():
    """Run the WordNet relation lookup for the words of a few SICK problems.

    For each hard-coded SICK id, the premise (P) and hypothesis (H) trees are
    built from the easyccg parses without lemmatization, both sentences are
    echoed to ``sys.stderr``, the words of both trees are registered in a
    fresh ``Knowledge`` object, and ``assign_all_relations_wordnet`` is
    called on that object.

    NOTE(review): the original docstring describes the purpose as "for each
    noun in train[:500] + trials save the 5 most frequent hypers and hypos";
    presumably the caching happens inside ``assign_all_relations_wordnet`` —
    confirm there.
    """
    # Problems to process. This is the subset the original author marked
    # "got all these" out of a longer candidate list (211, 1412, 1495, 2557,
    # 2829, 2898, 3250, 4015, 4066, 4135, 4342, 4360, 4661, 4916, 5030, 5113,
    # 5498, 5806, 6756); earlier assignments of that list and of
    # ``sick_ans.ids_trial_E[:10]`` were dead stores and have been dropped.
    # 4916 = guitar / musical instrument, 6756 = large / big.
    sick_ids = [5498, 5806, 4916, 4066, 6756]

    trees = CCGtrees("sick_uniq.raw.tok.preprocess.log")
    trees.readEasyccgStr("sick_uniq.raw.easyccg.parsed.txt")

    for id_to_solve in sick_ids:
        eprint("-" * 50)
        print("sick", id_to_solve)

        # Fix: the premise must come from P_idx and the hypothesis from
        # H_idx (as in ``generate``); they used to be swapped, which
        # mislabelled the "P:" / "H:" output below.
        P = trees.build_one_tree(P_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=False)
        H = trees.build_one_tree(H_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=False)

        eprint("P:", end="")
        P.printSent_raw_no_pol(stream=sys.stderr)
        eprint("H:", end="")
        H.printSent_raw_no_pol(stream=sys.stderr)

        k = Knowledge()
        k.update_word_lists(P)  # nouns, subSecAdj, etc.
        k.update_word_lists(H)
        assign_all_relations_wordnet(k)
def generate(chunk):
    """Generate entailed / contradictory sentences for one chunk of SICK.

    For every selected SICK problem the premise (P) and hypothesis (H) trees
    are built from the easyccg parses, knowledge is collected from both, the
    premise tree(s) held by the ``SentenceBase`` are polarized, and
    ``SentenceBase.solve_ids`` is run with a depth of ``n_rep`` (3). Each
    entry of ``s.inferences_tree`` and ``s.contras_str`` is then printed as a
    tab-separated line ``id, premise, generated sentence, label`` with the
    label ``ENTAILMENT`` or ``CONTRADICTION``. Progress and diagnostics go
    through ``eprint``.

    Args:
        chunk: ``"trial"`` selects ``sick_ans.ids_trial_E_C``; ``"1"`` to
            ``"9"`` select consecutive 500-problem slices of
            ``sick_ans.ids_train``.

    NOTE(review): any other ``chunk`` value leaves ``sick_ids`` unassigned,
    so the ``for`` loop below fails with ``UnboundLocalError``.
    """
    n_rep = 3  # depth_max passed to solve_ids
    trees = CCGtrees("sick_uniq.raw.tok.preprocess.log")
    trees.readEasyccgStr("sick_uniq.raw.easyccg.parsed.txt")

    if chunk == "trial":
        sick_ids = sick_ans.ids_trial_E_C
    # sick_ids = [1237]

    # Chunks "1".."9" are successive slices of 500 training problems.
    if chunk == "1":
        sick_ids = sick_ans.ids_train[:500]
    elif chunk == "2":
        sick_ids = sick_ans.ids_train[500:1000]
    elif chunk == "3":
        sick_ids = sick_ans.ids_train[1000:1500]
    elif chunk == "4":
        sick_ids = sick_ans.ids_train[1500:2000]
    elif chunk == "5":
        sick_ids = sick_ans.ids_train[2000:2500]
    elif chunk == "6":
        sick_ids = sick_ans.ids_train[2500:3000]
    elif chunk == "7":
        sick_ids = sick_ans.ids_train[3000:3500]
    elif chunk == "8":
        sick_ids = sick_ans.ids_train[3500:4000]
    elif chunk == "9":
        sick_ids = sick_ans.ids_train[4000:4500]
    # elif chunk == 10: sick_ids = sick_ans.ids_train[4500:]

    for id_to_solve in sick_ids:
        P = trees.build_one_tree(P_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=True)
        H = trees.build_one_tree(H_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=True)

        # -----------------------------
        # passive to active (disabled)
        # my_act = p2a.pass2act(P.printSent_raw_no_pol(stream=sys.stdout, verbose=False))
        # my_act = my_act.rstrip('. ')
        # eprint('original:', P.printSent_raw_no_pol(stream=sys.stdout, verbose=False))
        # eprint('active:', my_act)

        # -----------------------------
        # initialize s
        s = SentenceBase(gen_inf=True)

        # -----------------------------
        # build knowledge: a fresh Knowledge object per problem
        k = Knowledge()
        k.build_manual_for_sick()  # TODO
        k.build_quantifier(
            all_quant=False)  # all = every = each < some = a = an, etc.
        k.build_morph_tense()  # man = men, etc.

        # fix trees and update knowledge k, sentBase s
        # P
        P.fixQuantifier()
        P.fixNot()
        k.update_sent_pattern(P)  # patterns like: every X is NP
        k.update_word_lists(P)  # nouns, subSecAdj, etc.
        s.add_P_str(P)

        # H
        H.fixQuantifier()
        H.fixNot()
        k.update_word_lists(H)  # need to find nouns, subSecAdjs, etc. in H
        s.add_H_str_there_be(H)  # transform ``there be'' in H

        k.update_modifier()  # adj + n < n, n + RC/PP < n, v + PP < v

        # the sentence base only receives the knowledge once it is complete
        s.k = k

        # -----------------------------
        # polarize
        eprint("\n*** polarizing ***\n")
        for p in s.Ps_ccgtree:  # Ps
            try:
                p.mark()
                p.polarize()
                p.getImpSign()
            except (ErrorCCGtree, ErrorCompareSemCat) as e:  # , AssertionError
                # polarization failure is reported but not fatal
                eprint("cannot polarize:", p.wholeStr)
                eprint("error:", e)
            except AssertionError as e:
                # an assertion failure aborts the whole run
                eprint("assertion error!")
                eprint(e)
                p.printSent()
                exit()
            eprint("P: ", end="")
            p.printSent_raw_no_pol(stream=sys.stderr)
        eprint("H: ", end="")
        s.H_ccgtree.printSent_raw_no_pol(stream=sys.stderr)

        eprint("\n*** replacement ***\n")
        try:
            # the returned answer is not used; only s.inferences_tree and
            # s.contras_str are read afterwards
            ans = s.solve_ids(depth_max=n_rep, fracas_id=None)
        except (ErrorCompareSemCat, ErrorCCGtree):
            # skip this problem entirely; nothing is printed for it
            continue

        eprint("\n--- tried ** {} ** inferences:".format(len(s.inferences)))
        # for inf in sorted(s.inferences): print(inf)
        for inf in s.inferences_tree:
            print("{}\t{}\t{}\t{}".format(
                id_to_solve, P.printSent_no_pol(stream=sys.stdout),
                inf.printSent_no_pol(stream=sys.stdout), "ENTAILMENT"))

        eprint("\n--- tried ** {} ** contradictions:".format(len(
            s.contras_str)))
        for contra in sorted(s.contras_str):
            print("{}\t{}\t{}\t{}".format(
                id_to_solve, P.printSent_no_pol(stream=sys.stdout),
                contra.lower(), "CONTRADICTION"))
def convert2transccg(filename, parser, filename_log):
    """Polarize parsed CCG trees and print them to stdout as transccg-style XML.

    The parser output is read into ``CCGtrees``; one tree is built per line
    of the tokenized input file (``filename`` with the parser-specific
    suffix replaced by ``.tok.clean``). Each tree is fixed, marked and
    polarized, and one line per sentence is written to
    ``filename + ".polarized"``. Counts of polarized, unparsed and
    unpolarized sentences are reported through ``eprint``. Finally every
    parsed tree is printed as a ``<sentence>`` element (tokens followed by
    the CCG nodes).

    Args:
        filename: parser output, e.g. ``tmp.easyccg.parsed.txt``,
            ``tmp.depccg.parsed.txt`` or ``tmp.candc.parsed.xml``.
        parser: ``'easyccg'``, ``'candc'`` or ``'depccg'`` (depccg output is
            read with the easyccg reader). Any other value prints a message
            and exits.
        filename_log: log file name passed to ``CCGtrees``.

    NOTE(review): the original docstring said "return # of sents not
    polarized", but no ``return`` statement is visible in this function.
    """
    trees = CCGtrees(filename_log)

    # suffix of the parser output file, stripped to find the ".tok.clean" file
    parsed_suffix = {
        'easyccg': ".easyccg.parsed.txt",
        'candc': ".candc.parsed.xml",
        'depccg': ".depccg.parsed.txt",
    }
    if parser not in parsed_suffix:
        eprint('parser can only be: easyccg, candc, depccg')
        # sys.exit instead of the interactive-shell helper exit(); the exit
        # status is unchanged
        sys.exit()

    if parser == 'candc':
        trees.readCandCxml(filename)  # ('tmp.candc.parsed.xml')
    else:
        # depccg output has the same format as easyccg output
        trees.readEasyccgStr(filename)  # ('tmp.easyccg.parsed.txt')

    fn_raw = filename.replace(parsed_suffix[parser], "") + ".tok.clean"
    with open(fn_raw) as fh_raw:  # closed right after reading
        raw_sentences = fh_raw.readlines()

    # values build_one_tree returns instead of a tree when parsing failed
    unparsed_markers = ["failed_to_parse", "parse_exception"]

    # ----------------------------------
    # mark and polarize
    N_polar = 0
    N_unpolar = 0
    N_unparsed = 0

    # ``with`` guarantees the output file is closed even if a tree raises
    # an exception that is not handled below
    with open(filename + ".polarized", "w") as fh_polarized_trees:
        for idx, raw_sentence in enumerate(raw_sentences):
            # build the tree here
            t = trees.build_one_tree(idx, parser, use_lemma=False)

            if t in unparsed_markers:
                # the parser failed on this sentence: write the raw
                # sentence with "=" appended to every token
                eprint('easyccg failed to parse the sent')
                eprint(raw_sentence)
                sent = raw_sentence.replace(" ", "= ").replace("\n", "=\n")
                fh_polarized_trees.write(sent)
                N_unparsed += 1
                continue

            # t is a tree: fix it first
            t.fixQuantifier()
            try:
                t.fixNot()
            except AttributeError:
                # deliberate best effort: a tree fixNot cannot handle is
                # left as it is
                pass
            if parser in ['candc']:
                t.fixRC()  # only fix RC for candc

            try:
                t.mark()
                t.polarize()
                t.getImpSign()
                N_polar += 1
            except (ErrorCompareSemCat, ErrorCCGtree, AssertionError,
                    AttributeError, ErrorCat) as e:
                eprint(e)
                # no newline: the sentence printed to stderr by
                # printSent_raw below completes this message
                eprint('-- cannot polarize sent: ', end='')
                N_unpolar += 1

            # written whether or not polarization succeeded
            fh_polarized_trees.write(t.printSent_raw(stream=sys.stderr))
            fh_polarized_trees.write("\n")
            eprint()

    eprint("\n\n===========\npolarized {} trees\n"
           "unable to parse {} trees\n"
           "unable to polarize {} trees".format(N_polar, N_unparsed,
                                                N_unpolar))

    # ----------------------------------
    # XML output
    print(
        """<?xml version='1.0' encoding='UTF-8'?>\n<root>\n<document>\n<sentences>"""
    )

    for idx, t in trees.trees.items():
        if t in unparsed_markers:
            continue  # nothing to print for unparsed sentences

        print("<sentence>")

        # ----------------------------
        # print tokens
        print("<tokens>")
        for counter, token in enumerate(t.leafNodes):
            # depth,cat,chunk,entity,lemma,pos,span,start,word
            token_id = "t{}_{}".format(idx, counter)
            ETtype = str(token.cat.semCat)
            polarity = getPolarityAsArrow(token)
            print(
                str_token.format(token.start, token.span, token.pos,
                                 token.chunk, token.entity,
                                 token.cat.originalType, token_id, token.word,
                                 token.lemma, ETtype, polarity))
        print("</tokens>")

        # ----------------------------
        # print nodes, e.g. <ccg root="s0_sp0" id="s0_ccg0">
        print('<ccg root="s{}_sp0" id="s{}_ccg0">'.format(str(idx), str(idx)))

        # first traversal assigns span ids to the non-terminal nodes,
        # second traversal prints the tree
        traverse2get_span_id(t.root, -1, idx)
        traverse(t.root, 0, idx)

        print("</ccg>")
        print("</sentence>")

    print("""</sentences>\n</document>\n</root>""")