Пример #1
0
def test():
    """Smoke-run the logical system on three hard-coded FraCaS trees.

    Reads the section-1 FraCaS log and easyccg parse files, seeds a
    ``Knowledge`` object with quantifier and morphology/tense facts, then
    feeds trees 40, 41 and 42 (those that exist) into a ``LogicalSystem``.
    The collected knowledge and the logical system are printed at the end.
    """
    ccg_trees = CCGtrees("fracas_1_80.raw.tok.preprocess.log")
    ccg_trees.readEasyccgStr("fracas_1_80.raw.easyccg.parsed.txt")
    from infer import Knowledge
    knowledge = Knowledge()
    knowledge.build_quantifier()  # all = every = each < some = a = an, etc.
    knowledge.build_morph_tense()  # man = men, etc.

    logical_system = LogicalSystem(knowledge)
    logical_system.manual()

    # Other index triples used during development: (64, 65, 66), (44, 45, 46).
    wanted_indices = (40, 41, 42)
    n_trees = len(ccg_trees.trees)
    for idx in wanted_indices:
        if idx >= n_trees:
            # index outside the range of loaded trees: nothing to process
            continue
        current = ccg_trees.trees[idx]
        current.fixQuantifier()
        current.fixNot()
        logical_system.add_logical_sent(current)

        knowledge.update_sent_pattern(current)  # patterns like: every X is NP
        knowledge.update_word_lists(current)  # nouns, subSecAdj, etc.

    knowledge.print_knowledge()

    logical_system.compute_DET_rule()
    print(logical_system)
Пример #2
0
    def solveSick(self):
        """Pick a set of SICK problem ids and pass them to the solver.

        The set is chosen from ``self.sick_id[0]``:

        - ``'trial'``   -> ``sick_ans.ids_trial``
        - ``'wrongs'``  -> ``sick_ans.ids_wrongs_U_test``
        - ``'trial_c'`` -> ``sick_ans.ids_trial_C``
        - ``'trial_e'`` -> ``sick_ans.ids_trial_E``
        - ``'train'``   -> first 1000 entries of ``sick_ans.ids_train``
        - ``'test'``    -> a copy of ``sick_ans.ids_test``
        - ``'onediff'`` -> entries of ``sick_ans.ids_train`` that also occur
          in ``sick_ans.ids_one_diff``
        - ``'all'``     -> every key of ``self.sick2uniq``, sorted
        - anything else -> every element of ``self.sick_id`` converted to
          ``int``, keeping only those present in ``self.sick2uniq``

        The chosen ids and the loaded trees go to ``self.solveSick_helper``;
        the elapsed wall-clock time is printed afterwards.
        """
        start_time = time.time()

        trees = CCGtrees(fn_log="sick_uniq.raw.tok.preprocess.log")

        # Parsed trees are read from the depccg output; the easyccg
        # alternative is "sick_uniq.raw.easyccg.parsed.txt".
        trees.readEasyccgStr("sick_uniq.raw.depccg.parsed.txt")

        # Read the selector once instead of re-indexing in every branch.
        mode = self.sick_id[0]

        if mode == 'trial':
            # An earlier assignment to sick_ans.ids_trial_E_C was immediately
            # overwritten by this one, so it has been removed.
            sick_ids = sick_ans.ids_trial
        elif mode == 'wrongs':
            sick_ids = sick_ans.ids_wrongs_U_test
        elif mode == 'trial_c':
            sick_ids = sick_ans.ids_trial_C
        elif mode == 'trial_e':
            sick_ids = sick_ans.ids_trial_E
        elif mode == 'train':
            # test on train data
            sick_ids = sick_ans.ids_train[:1000]
        elif mode == 'test':
            # test on test
            sick_ids = sick_ans.ids_test[:]
        elif mode == 'onediff':
            # test on one word diff
            ids_one_diff = set(sick_ans.ids_one_diff)
            sick_ids = [i for i in sick_ans.ids_train if i in ids_one_diff]
        elif mode != 'all':
            # test one or more explicitly listed problems
            sick_ids = [
                int(i) for i in self.sick_id if int(i) in self.sick2uniq
            ]
        else:
            # all problems
            sick_ids = sorted(self.sick2uniq)

        self.solveSick_helper(sick_ids, trees)

        print("\n\n--- %s seconds ---" % (time.time() - start_time))
Пример #3
0
    def solve(self):
        """Run the solver on problem ids 0-9 of the med_adjectives data.

        Loads the preprocess log and the easyccg parses, passes the ids and
        the trees to ``self.solve_helper`` and prints the elapsed time.
        """
        t0 = time.time()

        ccg_trees = CCGtrees(fn_log="med_adjectives.tok.preprocess.log")

        # parses come from easyccg here
        ccg_trees.readEasyccgStr("med_adjectives.easyccg.parsed.txt")

        problem_ids = [*range(10)]
        self.solve_helper(problem_ids, ccg_trees)

        elapsed = time.time() - t0
        print("\n\n--- %s seconds ---" % elapsed)
Пример #4
0
def solveFracas(fracas_id, n_rep, print_k, sections):
    """Solve one FraCaS problem, or every defined one, and print timing.

    ``sections`` selects the data set: ``"1"`` loads the ``fracas_1_80``
    files, ``"56"`` loads the ``fracas_sec_5_6`` files; any other value
    prints a message and exits. The matching index tables are imported from
    ``fracas_index``.

    If ``fracas_id`` is ``'all'``, every id in ``IDX_P`` that is not listed
    in ``UNDEFINED`` is solved, and predictions, gold answers and the result
    of ``accuracy`` are printed. Otherwise only ``fracas_id`` is solved.
    ``n_rep`` and ``print_k`` are forwarded unchanged to ``solveFracas_one``.
    """
    t0 = time.time()

    if sections == "1":
        trees = CCGtrees("fracas_1_80.raw.tok.preprocess.log")
        trees.readEasyccgStr("fracas_1_80.raw.easyccg.parsed.txt")
        from fracas_index import IDX_P, IDX_H, UNDEFINED, ANSWERS
    elif sections == "56":
        trees = CCGtrees("fracas_sec_5_6.raw.tok.preprocess.log")
        trees.readEasyccgStr("fracas_sec_5_6.raw.easyccg.parsed.txt")
        from fracas_index import IDX_P_SEC_5_6 as IDX_P
        from fracas_index import IDX_H_SEC_5_6 as IDX_H
        from fracas_index import UNDEFINED_SEC_5_6 as UNDEFINED
        from fracas_index import ANSWERS_SEC_5_6 as ANSWERS
    else:
        print("wrong section! -sec can only be: 1, 56")
        exit()

    if fracas_id == 'all':
        # Solve every problem that has a defined answer, in sorted id order.
        y_pred = [
            solveFracas_one(problem_id, trees, n_rep, print_k, IDX_P, IDX_H,
                            UNDEFINED, ANSWERS)
            for problem_id in sorted(IDX_P)
            if problem_id not in UNDEFINED
        ]

        # Gold labels and their ids are taken from ANSWERS, again skipping
        # the undefined problems.
        ids = [
            problem_id for problem_id in sorted(ANSWERS)
            if problem_id not in UNDEFINED
        ]
        y_true = [ANSWERS[problem_id] for problem_id in ids]

        print('\ny_pred:', y_pred)
        print('y_true:', y_true)
        print(accuracy(ids, y_pred, y_true))
    else:
        # a single problem was requested
        solveFracas_one(fracas_id, trees, n_rep, print_k, IDX_P, IDX_H,
                        UNDEFINED, ANSWERS)

    elapsed = time.time() - t0
    print("\n\n--- %s seconds ---" % elapsed)
Пример #5
0
def save_cache():
    """
    Run WordNet relation assignment for a fixed handful of SICK problems.

    For every hard-coded SICK id the premise (P) and hypothesis (H) trees are
    built without lemmas and echoed to stderr. Their word lists are loaded
    into a fresh ``Knowledge`` object, which is then handed to
    ``assign_all_relations_wordnet``.

    Author's stated intent: for each noun in train[:500] + trials, save the
    5 most frequent hypers and hypos.
    NOTE(review): any actual caching presumably happens inside
    ``assign_all_relations_wordnet``; it is not visible in this function.
    """
    # Earlier candidate selections that were immediately overwritten and are
    # therefore no longer assigned: sick_ans.ids_trial_E[:10], and the list
    #   211, 1412, 1495, 2557, 2829, 2898, 3250, 4015, 4066, 4135, 4342,
    #   4360, 4661, 4916, 5030, 5113, 5498, 5806, 6756
    # 4916=guitar, musical instrument
    # 4006 throw, hurl
    # 6756 large big
    sick_ids = [5498, 5806, 4916, 4066, 6756]  # got all these

    trees = CCGtrees("sick_uniq.raw.tok.preprocess.log")
    trees.readEasyccgStr("sick_uniq.raw.easyccg.parsed.txt")

    for id_to_solve in sick_ids:
        eprint("-" * 50)
        print("sick", id_to_solve)
        # P is looked up with P_idx and H with H_idx; the two lookups used to
        # be swapped, which mislabeled the sentences printed below.
        P = trees.build_one_tree(P_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=False)
        H = trees.build_one_tree(H_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=False)
        eprint("P:", end="")
        P.printSent_raw_no_pol(stream=sys.stderr)
        eprint("H:", end="")
        H.printSent_raw_no_pol(stream=sys.stderr)

        k = Knowledge()
        k.update_word_lists(P)  # nouns, subSecAdj, etc.
        k.update_word_lists(H)
        # k.update_modifier()

        assign_all_relations_wordnet(k)
Пример #6
0
def generate(chunk):
    """Generate inferences and contradictions for one chunk of SICK problems.

    Args:
        chunk: ``"trial"`` selects ``sick_ans.ids_trial_E_C``; one of the
            strings ``"1"`` .. ``"9"`` selects the corresponding block of 500
            ids from ``sick_ans.ids_train`` (``"1"`` -> ``[0:500]``,
            ``"2"`` -> ``[500:1000]``, ...).

    Raises:
        ValueError: if ``chunk`` is none of the values above. Previously this
            case surfaced later as an ``UnboundLocalError``.

    For each selected problem the premise (P) and hypothesis (H) trees are
    built with lemmas, a ``Knowledge`` object and a ``SentenceBase`` are set
    up, the premises are polarized, and ``solve_ids`` is run with a maximum
    depth of 3. One tab-separated line is printed to stdout per result:
    ``<sick id>\\t<P>\\t<generated sentence>\\t<ENTAILMENT|CONTRADICTION>``.
    Progress and diagnostics go to stderr through ``eprint``.
    """
    n_rep = 3
    chunk_size = 500

    # Resolve the id selection first so that an invalid chunk fails fast,
    # before the tree files are read.
    if chunk == "trial":
        sick_ids = sick_ans.ids_trial_E_C
    elif chunk in ("1", "2", "3", "4", "5", "6", "7", "8", "9"):
        start = (int(chunk) - 1) * chunk_size
        sick_ids = sick_ans.ids_train[start:start + chunk_size]
    else:
        raise ValueError(
            "unknown chunk {!r}: expected 'trial' or '1'..'9'".format(chunk))

    trees = CCGtrees("sick_uniq.raw.tok.preprocess.log")
    trees.readEasyccgStr("sick_uniq.raw.easyccg.parsed.txt")

    for id_to_solve in sick_ids:
        P = trees.build_one_tree(P_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=True)
        H = trees.build_one_tree(H_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=True)

        # (A passive-to-active rewrite of P via p2a.pass2act was experimented
        # with at this point; it is not active.)

        # -----------------------------
        # initialize s
        s = SentenceBase(gen_inf=True)

        # -----------------------------
        # build knowledge
        k = Knowledge()
        k.build_manual_for_sick()  # TODO
        k.build_quantifier(
            all_quant=False)  # all = every = each < some = a = an, etc.
        k.build_morph_tense()  # man = men, etc.

        # fix trees and update knowledge k, sentBase s
        # P
        P.fixQuantifier()
        P.fixNot()
        k.update_sent_pattern(P)  # patterns like: every X is NP
        k.update_word_lists(P)  # nouns, subSecAdj, etc.
        s.add_P_str(P)

        # H
        H.fixQuantifier()
        H.fixNot()
        k.update_word_lists(H)  # need to find nouns, subSecAdjs, etc. in H
        s.add_H_str_there_be(H)  # transform ``there be'' in H

        k.update_modifier()  # adj + n < n, n + RC/PP < n, v + PP < v

        s.k = k

        # -----------------------------
        # polarize
        eprint("\n*** polarizing ***\n")
        for p in s.Ps_ccgtree:  # Ps
            try:
                p.mark()
                p.polarize()
                p.getImpSign()
            except (ErrorCCGtree, ErrorCompareSemCat) as e:
                # a tree that cannot be polarized is reported, then the run
                # carries on with it as-is
                eprint("cannot polarize:", p.wholeStr)
                eprint("error:", e)
            except AssertionError as e:
                # assertion failures are treated as fatal
                eprint("assertion error!")
                eprint(e)
                p.printSent()
                exit()
            eprint("P: ", end="")
            p.printSent_raw_no_pol(stream=sys.stderr)
        eprint("H: ", end="")
        s.H_ccgtree.printSent_raw_no_pol(stream=sys.stderr)

        eprint("\n*** replacement ***\n")
        try:
            # The return value is not needed; results are read from s below.
            s.solve_ids(depth_max=n_rep, fracas_id=None)
        except (ErrorCompareSemCat, ErrorCCGtree):
            # skip problems the solver cannot handle
            continue

        eprint("\n--- tried ** {} ** inferences:".format(len(s.inferences)))
        # for inf in sorted(s.inferences): print(inf)
        for inf in s.inferences_tree:
            # NOTE(review): printSent_no_pol takes a stream, so it may write
            # as well as return; it is deliberately called once per row.
            print("{}\t{}\t{}\t{}".format(
                id_to_solve, P.printSent_no_pol(stream=sys.stdout),
                inf.printSent_no_pol(stream=sys.stdout), "ENTAILMENT"))

        eprint("\n--- tried ** {} ** contradictions:".format(len(
            s.contras_str)))
        for contra in sorted(s.contras_str):
            print("{}\t{}\t{}\t{}".format(
                id_to_solve, P.printSent_no_pol(stream=sys.stdout),
                contra.lower(), "CONTRADICTION"))
Пример #7
0
def convert2transccg(filename, parser, filename_log):
    """
    Polarize parsed CCG trees and print them to stdout as XML.

    input:
    - easyccg output (tmp.easyccg.parsed.txt) or
    - candc output (tmp.candc.parsed.xml) or
    - depccg output (*.depccg.parsed.txt), read the same way as easyccg

    input is read into my CCGtree format.

    The tokenized sentences are read from the sibling "<base>.tok.clean"
    file, where <base> is ``filename`` with the parser-specific suffix
    removed. Each sentence's tree is built, fixed, marked and polarized, and
    one line per sentence is written to "<filename>.polarized". Sentences
    that could not be parsed are written with "=" appended to every token.

    then traverse the tree and print xml to stdout

    Args:
        filename: path of the parser output file.
        parser: 'easyccg', 'candc' or 'depccg'; anything else prints a
            message to stderr and exits.
        filename_log: log file name passed to ``CCGtrees``.

    Returns:
        None. The numbers of polarized / unparsed / unpolarized trees are
        only reported on stderr.
    """
    trees = CCGtrees(filename_log)

    if parser in ('easyccg', 'depccg'):  # depccg: same as easyccg
        trees.readEasyccgStr(filename)  #('tmp.easyccg.parsed.txt')
        parsed_suffix = ".{}.parsed.txt".format(parser)
    elif parser == 'candc':
        trees.readCandCxml(filename)  #('tmp.candc.parsed.xml')
        parsed_suffix = ".candc.parsed.xml"
    else:
        eprint('parser can only be: easyccg, candc, depccg')
        exit()

    # Context manager so the handle is closed once the lines are read.
    with open(filename.replace(parsed_suffix, "") + ".tok.clean") as fh_raw:
        raw_sentences = fh_raw.readlines()

    # build_one_tree returns one of these strings instead of a tree when the
    # sentence could not be parsed.
    unparsed_markers = ("failed_to_parse", "parse_exception")

    # ----------------------------------
    # mark and polarize
    N_polar = 0
    N_unpolar = 0
    N_unparsed = 0

    # `with` guarantees the output file is flushed and closed even if an
    # unexpected exception escapes the loop.
    with open(filename + ".polarized", "w") as fh_polarized_trees:
        for idx, raw_sentence in enumerate(raw_sentences):
            # build the tree here
            t = trees.build_one_tree(idx, parser, use_lemma=False)

            if t in unparsed_markers:  # the parser failed on this sentence
                eprint('easyccg failed to parse the sent')
                eprint(raw_sentence)
                sent = raw_sentence.replace(" ", "= ").replace(
                    "\n", "=\n")  # = for every token
                fh_polarized_trees.write(sent)
                N_unparsed += 1

            else:  # t is a tree
                # fix tree
                t.fixQuantifier()
                try:
                    t.fixNot()
                except AttributeError:
                    # best effort: a tree that fixNot cannot handle is
                    # processed without that fix
                    pass
                if parser in ['candc']: t.fixRC()  # only fix RC for candc

                try:
                    t.mark()
                    t.polarize()
                    t.getImpSign()
                    N_polar += 1
                except (ErrorCompareSemCat, ErrorCCGtree, AssertionError,
                        AttributeError, ErrorCat) as e:
                    eprint(e)
                    eprint('-- cannot polarize sent: ', end='')
                    N_unpolar += 1
                # The sentence is written whether or not polarization
                # succeeded.
                fh_polarized_trees.write(t.printSent_raw(stream=sys.stderr))
                fh_polarized_trees.write("\n")
            eprint()

    eprint("\n\n===========\npolarized {} trees\n"
           "unable to parse {} trees\n"
           "unable to polarize {} trees".format(N_polar, N_unparsed,
                                                N_unpolar))

    # ----------------------------------

    print(
        """<?xml version='1.0' encoding='UTF-8'?>\n<root>\n<document>\n<sentences>"""
    )
    for idx, t in trees.trees.items():
        if t in unparsed_markers:
            continue

        print("<sentence>")

        # ----------------------------
        # print tokens
        print("<tokens>")
        for counter, token in enumerate(t.leafNodes):
            # depth,cat,chunk,entity,lemma,pos,span,start,word
            token_id = "t" + str(idx) + '_' + str(counter)
            ETtype = str(token.cat.semCat)
            polarity = getPolarityAsArrow(token)
            print(
                str_token.format(token.start, token.span, token.pos,
                                 token.chunk, token.entity,
                                 token.cat.originalType, token_id, token.word,
                                 token.lemma, ETtype, polarity))
        print("</tokens>")

        # ----------------------------
        # print nodes
        # <ccg root="s0_sp0" id="s0_ccg0">
        print('<ccg root="s{}_sp0" id="s{}_ccg0">'.format(str(idx), str(idx)))

        # tree
        # in-order traversal of tree to get span_id of non term node
        traverse2get_span_id(t.root, -1, idx)

        # in order traversal of tree
        traverse(t.root, 0, idx)

        print("</ccg>")
        print("</sentence>")

    print("""</sentences>\n</document>\n</root>""")