Example #1
    def quantifier_x_VP(self, sent_ccgtree):
        """  add ``quant (N, VP)'' to logical_sents """
        # match structure:
        # most    A
        # NP/N    N               regular VP
        # --------------     ----------------
        #      NP_1                S\NP  =  VP
        #
        node_most = sent_ccgtree.getLeftMostLeaf(sent_ccgtree.root)
        if node_most.wholeStr not in QUANTIFIERS:
            print("something wrong adding sent to logicalSys:")
            sent_ccgtree.printSent()
            exit()

        quant = node_most.wholeStr
        N, VP = None, None
        if node_most.sisters: N = node_most.sisters[0]
        if node_most.parent.sisters: VP = node_most.parent.sisters[0]
        if VP and VP.cat.typeWOfeats == r"S\NP":
            if N:
                log_sent = LogicalSentence(quant, N, VP)
                self.add_logical_sent_helper(log_sent)
            else:
                eprint('\ndid not add to logical sys 2.1:')
                sent_ccgtree.printSent()
        else:
            eprint('\ndid not add to logical sys 2.2:')
            sent_ccgtree.printSent()
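
For example (an illustrative instance, not repo output): on the tree for ``Most dogs bark'', node_most is MOST, its sister N is DOGS, and the parent's sister VP (category S\NP) is BARK, so the method adds LogicalSentence(MOST, DOGS, BARK).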
Example #2
def main():
    if '-s' in sys.argv:
        save_cache()
    elif '-t' in sys.argv:
        test()
    else:
        eprint('required arg: -s/-t')
Example #3
def subst(line, quantifier, fh_log, s_pattern, sent_id, verbose=False):
    """ substitute: word = most/least """
    # find the index of `at' in line.split()
    idx_at = -1
    line_list = line.split()
    num_tokens = len(line_list)
    for idx, word in enumerate(line_list):
        if idx == num_tokens - 2: break  # `at' needs two more tokens after it
        if word.lower() == 'at' and \
                line_list[idx + 1].lower() == quantifier:  # 'most' or 'least'
            idx_at = idx
            break

    # replace `at most 10' with `no'
    m = pat[quantifier].search(line)
    if m: original = m.group(0)
    else: return line
    eprint('original:', original)
    if quantifier == 'most': after = "no"
    elif quantifier == 'least': after = "some"
    else: return line  # only most/least are handled
    line = pat[quantifier].sub(after, line)

    # log
    fh_log.write(
        s_pattern.format(str(sent_id), original, after, str(idx_at),
                         len(line_list)))

    return line
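
A minimal usage sketch of subst, assuming pat maps each quantifier to a compiled regex over ``at most/least n'' and eprint is the usual stderr helper (both are defined elsewhere in the repo; the versions below are illustrative stand-ins):

import io
import re
import sys

def eprint(*args, **kwargs):  # stand-in for the repo's stderr helper
    print(*args, file=sys.stderr, **kwargs)

pat = {  # stand-in patterns; the real module's regexes may differ
    'most': re.compile(r'[Aa]t most \d+'),
    'least': re.compile(r'[Aa]t least \d+'),
}

fh_log = io.StringIO()
s_pattern = "{},{},{},{},{}\n"
new_line = subst("there are at most 10 dogs here", 'most',
                 fh_log, s_pattern, sent_id=0)
print(new_line)           # there are no dogs here
print(fh_log.getvalue())  # 0,at most 10,no,2,7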
Example #4
def main():
    if len(sys.argv) < 4:
        eprint(message)
    else:
        filename = sys.argv[1]
        parser = sys.argv[2]
        filename_log = sys.argv[3]
        convert2transccg(filename, parser, filename_log)
Example #5
    def quantifier_x_be_y(self, sent_ccgtree):
        """  add ``quant (N, NP_2)'' to logical_sents """
        # TODO how to know if this "BE" is the main verb
        # "BE" can be at multiple places
        # match structure:
        # most    A           are        B
        # NP/N    N          (S\NP)/NP   NP_2 = X
        # --------------     ----------------
        #      NP_1                S\NP  =  VP
        #
        node_most = sent_ccgtree.getLeftMostLeaf(sent_ccgtree.root)
        if node_most.wholeStr not in QUANTIFIERS:
            print("something wrong adding sent to logicalSys:")
            sent_ccgtree.printSent()
            exit()

        quant = node_most.wholeStr
        N, X, VP = None, None, None
        if node_most.sisters: N = node_most.sisters[0]
        if node_most.parent.sisters: VP = node_most.parent.sisters[0]
        if VP and (VP.cat.typeWOfeats == r"S\NP") and \
                VP.children and (VP.children[0].wholeStr == "BE"):
            # case a: X = plural nouns
            # case b: X = a person? we only need ``person''?
            # case c: Every person is great. be: (S\NP)/(S\NP), great: S\NP
            # case d: Every one is good at dancing. same as above
            # case e: Every one is about to dance. same as above
            # case f: Several committee members are from Scandinavia. PP

            # case b
            if len(VP.children[1].children) == 2 and \
                    VP.children[1].children[0].wholeStr in ["A", "AN"]:
                X = VP.children[1].children[1]

            # case a,c,d,e,f: typically NP, S\NP or PP; any other category
            # gets the same treatment
            else:
                X = VP.children[1]

            if N and X:
                log_sent = LogicalSentence(quant, N, X)
                self.add_logical_sent_helper(log_sent)
            else:
                eprint('\ndid not add to logical sys 1:')
                sent_ccgtree.printSent()

        else:  # "BE" is not the main verb!
            self.quantifier_x_VP(sent_ccgtree)
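
Illustrative instances (not repo output): for ``Every dog is an animal'', case b fires and X is ANIMAL, yielding LogicalSentence(EVERY, DOG, ANIMAL); for ``Every person is great'', case c fires and X is the predicate GREAT with category S\NP.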
Example #6
def main():
    USAGE = """
    generate inferences for SICK
    usage:
        python generate.py chunk_id
        chunk_id={trial, 1-9}
    """
    if len(sys.argv) != 2 or \
            sys.argv[1] not in ['trial'] + [str(i) for i in range(1, 10)]:
        print(USAGE)
        exit()
    start_time = time.time()
    generate(sys.argv[1])
    eprint("\n\n--- %s seconds ---" % (time.time() - start_time))
Example #7
def preprocess(fn):
    """ produce a clean file named: test.tok.clean
    and log file: test.tok.preprocess.log """
    sent_id = -1
    fh_log = open(fn + '.preprocess.log', 'w')
    fh_log.write("sentId,before,after,idx,len_sent\n")
    fh_clean = open(fn + '.clean', 'w')
    s_pattern = "{},{},{},{},{}\n"
    # p2a = P2A_transformer(spacy.load('en'))
    # corenlp = StanfordCoreNLP('http://localhost', port=9000, lang='en')

    eprint('\npreprocessing...')
    with open(fn) as f:
        for line in f:
            line = line.strip()
            if line == "": continue

            sent_id += 1
            # print('\npreprocessing:', sent_id)

            # line = line.lower()

            eprint('\nbefore :', line)

            # with passive to active transformation
            # line = preprocess_line(line, fh_log, s_pattern, sent_id, p2a, corenlp)

            # no passive to active transformation
            line = preprocess_line(line, fh_log, s_pattern, sent_id)
            eprint('after :', line)

            # write to clean file
            fh_clean.write(line)
            fh_clean.write('\n')

    fh_log.close()
    fh_clean.close()
    eprint('...done!\n')
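
An illustrative first few lines of the resulting .preprocess.log (hypothetical values; each row is written via s_pattern by helpers such as subst):

sentId,before,after,idx,len_sent
0,at most 10,no,2,7
1,at least 3,some,4,9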
Example #8
def save_cache():
    """
    for each noun in train[:500] + trials
    save the 5 most frequent hypers and hypos
    """
    # earlier trial runs, superseded by the final list below:
    # sick_ids = sick_ans.ids_trial_E[:10]  # ids_trial_E_C
    # sick_ids = [211, 1412, 1495, 2557, 2829, 2898, 3250, 4015, 4066, 4135,
    #             4342, 4360, 4661, 4916, 5030, 5113, 5498, 5806, 6756]
    # 4916=guitar, musical instrument
    # 4006 throw, hurl
    # 6756 large big
    sick_ids = [5498, 5806, 4916, 4066, 6756]  # got all these

    trees = CCGtrees("sick_uniq.raw.tok.preprocess.log")
    trees.readEasyccgStr("sick_uniq.raw.easyccg.parsed.txt")

    for id_to_solve in sick_ids:
        eprint("-" * 50)
        print("sick", id_to_solve)
        P = trees.build_one_tree(P_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=False)
        H = trees.build_one_tree(H_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=False)
        eprint("P:", end="")
        P.printSent_raw_no_pol(stream=sys.stderr)
        eprint("H:", end="")
        H.printSent_raw_no_pol(stream=sys.stderr)

        k = Knowledge()
        k.update_word_lists(P)  # nouns, subSecAdj, etc.
        k.update_word_lists(H)
        # k.update_modifier()

        assign_all_relations_wordnet(k)
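
As a sanity check, the inline notes above suggest what the WordNet pass should recover for these ids, e.g. guitar < musical instrument for 4916 and large = big for 6756 (expectations taken from the comments, not verified output).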
Example #9
def generate(chunk):
    n_rep = 3

    trees = CCGtrees("sick_uniq.raw.tok.preprocess.log")
    trees.readEasyccgStr("sick_uniq.raw.easyccg.parsed.txt")

    if chunk == "trial": sick_ids = sick_ans.ids_trial_E_C
    # sick_ids = [1237]
    if chunk == "1": sick_ids = sick_ans.ids_train[:500]
    elif chunk == "2": sick_ids = sick_ans.ids_train[500:1000]
    elif chunk == "3": sick_ids = sick_ans.ids_train[1000:1500]
    elif chunk == "4": sick_ids = sick_ans.ids_train[1500:2000]
    elif chunk == "5": sick_ids = sick_ans.ids_train[2000:2500]
    elif chunk == "6": sick_ids = sick_ans.ids_train[2500:3000]
    elif chunk == "7": sick_ids = sick_ans.ids_train[3000:3500]
    elif chunk == "8": sick_ids = sick_ans.ids_train[3500:4000]
    elif chunk == "9": sick_ids = sick_ans.ids_train[4000:4500]
    # elif chunk == 10: sick_ids = sick_ans.ids_train[4500:]

    for id_to_solve in sick_ids:
        P = trees.build_one_tree(P_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=True)
        H = trees.build_one_tree(H_idx(sick_ans.sick2uniq, id_to_solve),
                                 "easyccg",
                                 use_lemma=True)

        # -----------------------------
        # passive to active
        # my_act = p2a.pass2act(P.printSent_raw_no_pol(stream=sys.stdout, verbose=False))
        # my_act = my_act.rstrip('. ')
        # eprint('original:', P.printSent_raw_no_pol(stream=sys.stdout, verbose=False))
        # eprint('active:', my_act)

        # -----------------------------
        # initialize s
        s = SentenceBase(gen_inf=True)

        # -----------------------------
        # build knowledge
        k = Knowledge()
        k.build_manual_for_sick()  # TODO
        k.build_quantifier(
            all_quant=False)  # all = every = each < some = a = an, etc.
        k.build_morph_tense()  # man = men, etc.

        # fix trees and update knowledge k, sentBase s
        # P
        P.fixQuantifier()
        P.fixNot()
        k.update_sent_pattern(P)  # patterns like: every X is NP
        k.update_word_lists(P)  # nouns, subSecAdj, etc.
        s.add_P_str(P)

        # H
        H.fixQuantifier()
        H.fixNot()
        k.update_word_lists(H)  # need to find nouns, subSecAdjs, etc. in H
        s.add_H_str_there_be(H)  # transform ``there be'' in H

        k.update_modifier()  # adj + n < n, n + RC/PP < n, v + PP < v

        s.k = k

        # -----------------------------
        # polarize
        eprint("\n*** polarizing ***\n")
        for p in s.Ps_ccgtree:  # Ps
            try:
                p.mark()
                p.polarize()
                p.getImpSign()
            except (ErrorCCGtree, ErrorCompareSemCat) as e:  # , AssertionError
                eprint("cannot polarize:", p.wholeStr)
                eprint("error:", e)
            except AssertionError as e:
                eprint("assertion error!")
                eprint(e)
                p.printSent()
                exit()
            eprint("P: ", end="")
            p.printSent_raw_no_pol(stream=sys.stderr)
        eprint("H: ", end="")
        s.H_ccgtree.printSent_raw_no_pol(stream=sys.stderr)

        eprint("\n*** replacement ***\n")
        try:
            ans = s.solve_ids(depth_max=n_rep, fracas_id=None)
        except (ErrorCompareSemCat, ErrorCCGtree):
            continue

        eprint("\n--- tried ** {} ** inferences:".format(len(s.inferences)))
        # for inf in sorted(s.inferences): print(inf)
        for inf in s.inferences_tree:
            print("{}\t{}\t{}\t{}".format(
                id_to_solve, P.printSent_no_pol(stream=sys.stdout),
                inf.printSent_no_pol(stream=sys.stdout), "ENTAILMENT"))

        eprint("\n--- tried ** {} ** contradictions:".format(len(
            s.contras_str)))
        for contra in sorted(s.contras_str):
            print("{}\t{}\t{}\t{}".format(
                id_to_solve, P.printSent_no_pol(stream=sys.stdout),
                contra.lower(), "CONTRADICTION"))
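
Illustrative stdout rows (tab-separated: sick id, premise, generated sentence, label; the sentences here are hypothetical):

211	a man is playing a guitar	a man is playing a musical instrument	ENTAILMENT
211	a man is playing a guitar	there is no man playing a guitar	CONTRADICTION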
Example #10
def convert2transccg(filename, parser, filename_log):
    """
    input: 
    - easyccg output (tmp.easyccg.parsed.txt) or 
    - candc output (tmp.candc.parsed.xml)

    input is read into my CCGtree format. 

    then traverse the tree and print xml to stdout

    return # of sents not polarized
    """
    trees = CCGtrees(filename_log)

    if parser == 'easyccg':
        trees.readEasyccgStr(filename)  #('tmp.easyccg.parsed.txt')
        fn_raw = filename.replace(".easyccg.parsed.txt", "") + ".tok.clean"
    elif parser == 'candc':
        trees.readCandCxml(filename)  #('tmp.candc.parsed.xml')
        fn_raw = filename.replace(".candc.parsed.xml", "") + ".tok.clean"
    elif parser == 'depccg':  # same output format as easyccg
        trees.readEasyccgStr(filename)
        fn_raw = filename.replace(".depccg.parsed.txt", "") + ".tok.clean"
    else:
        eprint('parser can only be: easyccg, candc, depccg')
        exit()

    with open(fn_raw) as f:
        raw_sentences = f.readlines()

    # ----------------------------------
    # mark and polarize
    N_polar = 0
    N_unpolar = 0
    N_unparsed = 0
    fh_polarized_trees = open(filename + ".polarized", "w")

    # sent_parsed = True

    # idx_cant_polarize = {}
    for idx in range(len(raw_sentences)):
        # build the tree here
        t = trees.build_one_tree(idx, parser, use_lemma=False)
        # eprint(trees.easyccg_str.get(idx, None))
        # print(t)
        # return

        if t in ["failed_to_parse",
                 "parse_exception"]:  # easyccg failed to parse the sent
            eprint('easyccg failed to parse the sent')
            eprint(raw_sentences[idx])
            sent = raw_sentences[idx].replace(" ", "= ").replace(
                "\n", "=\n")  # = for every token
            fh_polarized_trees.write(sent)
            N_unparsed += 1

        else:  # t is a tree
            # fix tree
            t.fixQuantifier()
            try:
                t.fixNot()
            except AttributeError:
                pass
            if parser in ['candc']: t.fixRC()  # only fix RC for candc

            try:
                t.mark()
                t.polarize()
                t.getImpSign()
                N_polar += 1
            except (ErrorCompareSemCat, ErrorCCGtree, AssertionError,
                    AttributeError, ErrorCat) as e:
                eprint(e)
                eprint('-- cannot polarize sent: ', end='')
                N_unpolar += 1
            # t.printSent(stream=sys.stderr)
            fh_polarized_trees.write(t.printSent_raw(stream=sys.stderr))
            fh_polarized_trees.write("\n")
        eprint()
    fh_polarized_trees.close()
    eprint("\n\n===========\npolarized {} trees\n"
           "unable to parse {} trees\n"
           "unable to polarize {} trees".format(N_polar, N_unparsed,
                                                N_unpolar))

    # ----------------------------------

    print(
        """<?xml version='1.0' encoding='UTF-8'?>\n<root>\n<document>\n<sentences>"""
    )
    for idx, t in trees.trees.items():
        if t in ["failed_to_parse", "parse_exception"]:
            continue

        print("<sentence>")

        # ----------------------------
        # print tokens
        print("<tokens>")
        counter = 0
        for token in t.leafNodes:
            # depth,cat,chunk,entity,lemma,pos,span,start,word
            token_id = "t" + str(idx) + '_' + str(counter)
            ETtype = token.cat.semCat.__str__()
            polarity = getPolarityAsArrow(token)
            print(
                str_token.format(token.start, token.span, token.pos,
                                 token.chunk, token.entity,
                                 token.cat.originalType, token_id, token.word,
                                 token.lemma, ETtype, polarity))
            counter += 1
        print("</tokens>")

        # ----------------------------
        # print nodes
        # <ccg root="s0_sp0" id="s0_ccg0">
        print('<ccg root="s{}_sp0" id="s{}_ccg0">'.format(str(idx), str(idx)))

        # tree
        # in-order traversal of tree to get span_id of non term node
        traverse2get_span_id(t.root, -1, idx)

        # in order traversal of tree
        traverse(t.root, 0, idx)

        print("</ccg>")
        print("</sentence>")

    print("""</sentences>\n</document>\n</root>""")
Example #11
    def compute_DET_rule(self):
        """ apply DET_rule on all logical_sents
        return inferences to be added to the fringe!

        DET  x  y        All  x  z
        -------------------------  DET: several cases based on pos of y and z
              DET x (y ^ z)

        all possibilities of y ^ z are stored in a list, returned from conjoin_two_terms(y, z)
        """

        # step 1: find 'all/each/every x z'
        to_loop = [
            self.logical_sents['ALL']['log_sents'],
            self.logical_sents['EACH']['log_sents'],
            self.logical_sents['EVERY']['log_sents']
        ]
        for lst in to_loop:
            for log_sent in lst:  # log_sent: All x z
                x = log_sent.term1  # NonTermNode
                z = log_sent.term2

                # x and its equivalents, a set of str
                x_set = set(
                    [i.ccgtree.root
                     for i in self.KB.frags[x.wholeStr].equal] + [x])
                x_set_str = set([i.wholeStr for i in x_set])
                # print('\n\n')
                # print(x_set_str)

                # step 2: find DET x y
                for quant in QUANTIFIERS:
                    # see if x and its equivalents appear as term1 in logSent for this quant
                    if not any([
                            i in self.logical_sents[quant]['X']
                            for i in x_set_str
                    ]):
                        continue  # then skip the following
                    for log_sent2 in self.logical_sents[quant]['log_sents']:
                        # log_sent2: DET x y
                        # if log_sent2.term1 is the same as x, or one of the equivalents of x
                        if log_sent2.term1.wholeStr in x_set_str:
                            # found y!! DET rule here!!
                            # order is taken care of in the function
                            y = log_sent2.term2
                            y_and_z_list = self.conjoin_two_terms(
                                y, z)  # a list of NonTermNode
                            for y_and_z in y_and_z_list:
                                print("\n\nquant", quant)
                                print("x", x.wholeStr)
                                print("y_and_z", y_and_z.wholeStr,
                                      y_and_z.cat.typeWOfeats)
                                # apart from adding ``quant x y_and_z'',
                                # we also need to add any quantifier greater or equal to quant
                                # e.g. every x y_and_z = all x y_and_z = each x y_and_z
                                if quant in QUANT_GEQ:
                                    new_quants_str = QUANT_GEQ[quant]
                                else:
                                    new_quants_str = [quant]

                                ### !!! KEY STEP !!! ### generate inf
                                for new_quant_str in new_quants_str:
                                    for new_x in x_set:
                                        inf_log_sent = LogicalSentence(
                                            quantifier=new_quant_str,
                                            term1=new_x,
                                            term2=y_and_z)
                                        inf_nat_sent = inf_log_sent.to_nat_sent()
                                        eprint("# DET_rule add to fringe: ",
                                               end="")
                                        inf_nat_sent.printSent(sys.stderr)
                                        yield inf_nat_sent
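
An illustrative instantiation of the rule (hypothetical sentences):

    Most  dogs  bark                       DET x y
    All   dogs  are animals                All x z
    ----------------------------------------------  DET
    Most  dogs  are animals that bark      DET x (y ^ z)

Because QUANT_GEQ maps a quantifier to every quantifier greater than or equal to it (e.g. every = all = each), each conclusion is yielded once per such quantifier and once per equivalent of x.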
Example #12
def find_head_noun(node_det, ccg_tree):
    """ find the head noun of a determiner in a tree """
    # - configuration 1: the brown bulldog who every brown mammal moved-towards
    #            bulldog   RC
    #           ---------------
    #               N       N\N
    #       brown -------------
    #         N/N       N
    #  the   --------------
    #   NP/N       N = sister
    # - configuration 2: the brown dog
    # - configuration 3: the dog
    # - configuration 4: a good poodle who was happy waltzed

    # ----------------------------------------------
    # solution 1: find the first leafnode that is of category N, starting
    # from the sister of the determiner
    # -- this does NOT work for configuration 4, b/c (good poodle) and (RC)
    #    both have two children

    # sister = node_det.sisters[0]
    # head_noun = sister
    # while not head_noun_test(head_noun):
    #     old_head_noun = head_noun
    #     if len(head_noun.children) == 0:
    #         eprint("something wrong finding head noun, no children for {}".format(head_noun))
    #         eprint(ccg_tree.tree_str())
    #         exit()
    #     for child in head_noun.children:
    #         if head_noun_test(child):
    #             head_noun = child  # found head_noun
    #             break
    #         if len(child.children) != 0:
    #             head_noun = child  # next, explore the child which has children
    #     if old_head_noun == head_noun:
    #         eprint("\n!! something wrong finding head noun")
    #         eprint("old head noun:", old_head_noun)
    #         eprint("new head noun:", head_noun)
    #         eprint(ccg_tree.tree_str())
    #         ccg_tree.printTree(stream=sys.stderr)
    #         eprint("\n")
    #         exit()
    # if head_noun is None:
    #     eprint("head noun for det **{}** is None".format(node_det))
    #     eprint(ccg_tree.tree_str())
    #     exit()

    # ------------------------------
    # solution 2: find the *highest* leaf node (N) under the NP,
    # i.e. the N leaf with the smallest depth
    depth_best = None
    head_noun = None
    nodes_under_NP = [node_det.sisters[0]]
    while nodes_under_NP:
        popped_node = nodes_under_NP.pop()
        if popped_node.children:  # non-terminal node
            nodes_under_NP.extend(popped_node.children)
        else:  # leaf node
            if popped_node.cat.typeWOfeats == "N" and \
                    (depth_best is None or popped_node.depth < depth_best):
                head_noun = popped_node
                depth_best = popped_node.depth

    if head_noun is None:
        eprint("\n!! something wrong finding head noun")
        eprint(ccg_tree.tree_str())
        ccg_tree.printTree(stream=sys.stderr)
        eprint("\n")
        exit()

    return head_noun
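
A self-contained sketch of the solution-2 search on a toy tree (TCat and TNode are stand-ins for the repo's category and node classes; depth is assumed to grow downward from the root):

class TCat:
    def __init__(self, t):
        self.typeWOfeats = t

class TNode:
    def __init__(self, cat, depth, children=None, word=None):
        self.cat = TCat(cat)
        self.depth = depth
        self.children = children or []
        self.word = word

# ``the brown bulldog who ...'' (configuration 1): bulldog (N, depth 3)
# sits higher than mammal (N, depth 4) inside the relative clause
mammal  = TNode("N", 4, word="mammal")
rc      = TNode(r"N\N", 3, children=[mammal])
bulldog = TNode("N", 3, word="bulldog")
inner   = TNode("N", 2, children=[bulldog, rc])
sister  = TNode("N", 1, children=[TNode("N/N", 2, word="brown"), inner])

head_noun, depth_best = None, None
stack = [sister]
while stack:
    node = stack.pop()
    if node.children:                       # non-terminal: keep descending
        stack.extend(node.children)
    elif node.cat.typeWOfeats == "N" and \
            (depth_best is None or node.depth < depth_best):
        head_noun, depth_best = node, node.depth  # highest N leaf so far
print(head_noun.word)  # -> bulldog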