Example #1
def get_leaf_transitions():
    file_name = 'penn_leaf_transition_counts.data'

    try:
        f = open(os.path.join('cache', file_name), 'rb')
        data = pickle.load(f)
        f.close()
        return data
    except (IOError, EOFError):
        from tag_utils import is_valid_tag
        cmd_utils.log("Building leaf counts from Penn Treebank corpus", 1)
        f = open(os.path.join('cache', file_name), 'wb')

        for sentence in nltk.corpus.treebank.parsed_sents():
            leaves = list(
                sentence.subtrees(
                    lambda x: len(x) > 0 and isinstance(x[0], basestring)))
            # Preterminal subtrees: n.node is the tag, n[0] is the word itself
            leaves = [
                n.node.split("-")[0] for n in leaves
                if is_valid_tag(n.node)
            ]
            leaves = ['START'] + leaves
            store_transitions(leaves)

        cmd_utils.log("Finished building tag counts", 1)
        pickle.dump(store_transitions._counts, f)
        f.close()
        return store_transitions._counts
Example #2
def parse(text):
    log("Checking for coherence in '{0}'".format(text), 2)

    family_hits = []
    family_stem_words = stemmed_words(family_words, 'family_words')
    for sentence in sentence_tokenizer.parse(text):
        tree = parsers.parse(sentence)[0]
        family_hits += [
            (a_tree.node, a_tree[0].lower(), stemmer.stem(a_tree[0].lower())
             in family_stem_words)
            for a_tree in tree.subtrees(lambda x: x.node in noun_tags)
        ]
    log("Family hits: {0}".format(family_hits), 4)
    family_hit_values = (len([hit for hit in family_hits
                              if hit[2]]), len(family_hits))
    log("%d/%d" % family_hit_values, 3)

    work_hits = []
    work_stem_words = stemmed_words(work_words, 'work_words')
    for sentence in sentence_tokenizer.parse(text):
        tree = parsers.parse(sentence)[0]
        work_hits += [
            (a_tree.node, a_tree[0].lower(), stemmer.stem(a_tree[0].lower())
             in work_stem_words)
            for a_tree in tree.subtrees(lambda x: x.node in noun_tags)
        ]
    log("Work hits: {0}".format(work_hits), 4)
    work_hit_values = (len([hit for hit in work_hits
                            if hit[2]]), len(work_hits))
    log("%d/%d" % work_hit_values, 3)

    return family_hit_values[0], work_hit_values[0], work_hit_values[1]
Example #3
def boost_for_sentence_tree(tree):
    weight = 1

    first_np = list(tree.subtrees(lambda x: x.node == "NP"))[0]
    has_pro = len(list(first_np.subtrees(lambda x: x.node in pers_pro_tags))) > 0
    if has_pro:
        log("BOOST: Starts with Pers Pronouns", 2)
        weight *= start_pers_pro_weight

    # @NOTE TOGGLE POINT
    # if tree[0].node == "S":
    #     weight *= 10

    return weight
Example #4
def find_commanding_verb_tree(tree, steps=0):
    log("looking for verb at root: %s" % (tree.node,), 3)
    if tree.node in verb_tags:
        return (tree, steps)
    else:
        parent_node = tree.parent()
        if not parent_node:
            return None
        else:
            for sibling in parent_node:
                if sibling.node in verb_tags:
                    return (sibling, steps + 1)
                elif sibling.node == "VP":
                    return (list(sibling.subtrees(lambda x: x.node in verb_tags))[0], steps + 2)
            return find_commanding_verb_tree(parent_node, steps + 1)
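Because the search climbs upward through tree.parent(), it only works on an NLTK ParentedTree. Below is a minimal usage sketch under the same assumptions as the rest of these examples (Python 2, the older NLTK API where labels are read via .node, and the module's own verb_tags and log in scope); the sentence string is made up for illustration.

from nltk.tree import ParentedTree

# Hypothetical input; in the project the tree would come from parsers.parse()
sentence_tree = ParentedTree.parse(
    "(S (NP (NNP John)) (VP (VBZ runs) (ADVP (RB fast))))")

# Start from the subject's preterminal node and climb to its commanding verb
noun_node = list(sentence_tree.subtrees(lambda x: x.node == "NNP"))[0]
found = find_commanding_verb_tree(noun_node)
if found:
    verb_tree, steps = found
    print "Commanding verb: %s (%d steps away)" % (verb_tree[0], steps)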
Example #5
def boost_for_sentence_tree(tree):
    weight = 1

    first_np = list(tree.subtrees(lambda x: x.node == "NP"))[0]
    has_pro = len(list(
        first_np.subtrees(lambda x: x.node in pers_pro_tags))) > 0
    if has_pro:
        log("BOOST: Starts with Pers Pronouns", 2)
        weight *= start_pers_pro_weight

    # @NOTE TOGGLE POINT
    # if tree[0].node == "S":
    #     weight *= 10

    return weight
Example #6
def get_treebank_rules(cutoff=0, include_counts=False):
    all_rules = cache_utils.cache_get('treebank_rules', 'rules')
    if not all_rules:
        log('Generating lexical rules from Penn Treebank', 4)
        from nltk.corpus import treebank
        all_rules = dict()
        for tree in treebank.parsed_sents():
            for rule, count in lexical_rules(tree).items():
                all_rules[rule] = all_rules.get(rule, 0) + count

        cache_utils.cache_set('treebank_rules', 'rules', all_rules)

    if include_counts:
        return {k: v for (k, v) in all_rules.items() if v > cutoff}
    else:
        rules_set = set([rule for rule, count in all_rules.items() if count > cutoff])
        return rules_set
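A short usage sketch of the two return modes (the first call is slow because it walks every parsed sentence in the Treebank sample; later calls are served from the cache). The exact shape of each rule depends on the module's lexical_rules helper, which is not shown here.

# Rules seen more than five times, as a set (the default return type)
common_rules = get_treebank_rules(cutoff=5)

# The same rules, but keeping their raw counts for inspection
rule_counts = get_treebank_rules(cutoff=5, include_counts=True)
for rule, count in sorted(rule_counts.items(), key=lambda kv: -kv[1])[:10]:
    print count, rule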
Example #7
def find_commanding_verb_tree(tree, steps=0):
    log("looking for verb at root: %s" % (tree.node, ), 3)
    if tree.node in verb_tags:
        return (tree, steps)
    else:
        parent_node = tree.parent()
        if not parent_node:
            return None
        else:
            for sibling in parent_node:
                if sibling.node in verb_tags:
                    return (sibling, steps + 1)
                elif sibling.node == "VP":
                    return (list(
                        sibling.subtrees(lambda x: x.node in verb_tags))[0],
                            steps + 2)
            return find_commanding_verb_tree(parent_node, steps + 1)
Example #8
def get_treebank_rules(cutoff=0, include_counts=False):
    all_rules = cache_utils.cache_get('treebank_rules', 'rules')
    if not all_rules:
        log('Generating lexical rules from Penn Treebank', 4)
        from nltk.corpus import treebank
        all_rules = dict()
        for tree in treebank.parsed_sents():
            for rule, count in lexical_rules(tree).items():
                all_rules[rule] = all_rules.get(rule, 0) + count

        cache_utils.cache_set('treebank_rules', 'rules', all_rules)

    if include_counts:
        return {k: v for (k, v) in all_rules.items() if v > cutoff}
    else:
        rules_set = set(
            [rule for rule, count in all_rules.items() if count > cutoff])
        return rules_set
Example #9
def parse(text):
    treebank_rules = get_treebank_rules(cutoff=0)

    sentence_probs = []
    for line in text.split("\n"):
        sentences = sentence_tokenizer.parse(line)

        for sentence in sentences:

            # Add a period to the end of the sentence, which sometimes
            # forces a better parse
            # if sentence[-1] not in ('.', '!', '?'):
            #     sentence += '.'

            parse_trees = parsers.parse(sentence)
            for tree in parse_trees:
                if cmd_utils.cmd_log_level() > 2:
                    print tree.pprint()

                evidenced_lexical_rules = set(lexical_rules(tree).keys())
                differences = evidenced_lexical_rules.difference(
                    treebank_rules)

                bad_generations = len(differences)
                log(
                    "Found {0} bad generations ({1})".format(
                        bad_generations, differences), 3)

                #bad_parse_prob = 1 if prob == 0 else 0
                #log("Scored {0} for prob {1}".format(bad_parse_prob, prob), 3)

                bad_tag_problems = num_tag_problems(tree)
                log("Found {0} X or FRAG tags".format(bad_tag_problems), 3)

                bad_sbar_problems = num_sbar_problems(tree)
                log("Found {0} bad SBAR issues".format(bad_sbar_problems), 3)

                total_problems = bad_sbar_problems + bad_tag_problems + bad_generations
                log("In '{0}'".format(sentence), 2)
                log(
                    "Found {0} sentence formation problems".format(
                        total_problems), 1)
                sentence_probs.append(total_problems)
    return sentence_probs
Example #10
def cache_get(cache_name, cache_key):
    if cache_name not in mem_caches:

        file_name = cache_name + '.data'
        file_path = os.path.join('cache', file_name)
        file_mode = "rb" if os.path.isfile(file_path) else "wb"
        f = open(file_path, file_mode)
        try:
            data = pickle.load(f)
        except (IOError, EOFError):
            data = dict()
        mem_caches[cache_name] = data
        f.close()

    try:
        rs = mem_caches[cache_name][cache_key]
        log('Cache Hit: %s[%s]' % (cache_name, cache_key), 5)
        return rs
    except KeyError:
        return None
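The companion cache_set is not part of this listing. Given how cache_get reads its data (one pickled dict per cache name under cache/<name>.data, mirrored in the module-level mem_caches dict), a compatible writer might look roughly like the sketch below; this is an assumption about the project's code, not a copy of it.

import os
import pickle

def cache_set(cache_name, cache_key, value):
    # Keep the in-memory copy in sync so later cache_get() calls hit it
    mem_caches.setdefault(cache_name, dict())[cache_key] = value

    # Persist the whole cache dict back to cache/<cache_name>.data
    file_path = os.path.join('cache', cache_name + '.data')
    f = open(file_path, 'wb')
    pickle.dump(mem_caches[cache_name], f)
    f.close()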
Example #11
def cache_get(cache_name, cache_key):
    if cache_name not in mem_caches:

        file_name = cache_name + '.data'
        file_path = os.path.join('cache', file_name)
        file_mode = "rb" if os.path.isfile(file_path) else "wb"
        f = open(file_path, file_mode)
        try:
            data = pickle.load(f)
        except (IOError, EOFError):
            data = dict()
        mem_caches[cache_name] = data
        f.close()

    try:
        rs = mem_caches[cache_name][cache_key]
        log('Cache Hit: %s[%s]' % (cache_name, cache_key), 5)
        return rs
    except KeyError:
        return None
Example #12
def parse(text):
    treebank_rules = get_treebank_rules(cutoff=0)

    sentence_probs = []
    for line in text.split("\n"):
        sentences = sentence_tokenizer.parse(line)

        for sentence in sentences:

            # Add a period to the end of the sentence, which sometimes
            # forces a better parse
            # if sentence[-1] not in ('.', '!', '?'):
            #     sentence += '.'

            parse_trees = parsers.parse(sentence)
            for tree in parse_trees:
                if cmd_utils.cmd_log_level() > 2:
                    print tree.pprint()

                evidenced_lexical_rules = set(lexical_rules(tree).keys())
                differences = evidenced_lexical_rules.difference(treebank_rules)

                bad_generations = len(differences)
                log("Found {0} bad generations ({1})".format(bad_generations, differences), 3)

                #bad_parse_prob = 1 if prob == 0 else 0
                #log("Scored {0} for prob {1}".format(bad_parse_prob, prob), 3)

                bad_tag_problems = num_tag_problems(tree)
                log("Found {0} X or FRAG tags".format(bad_tag_problems), 3)


                bad_sbar_problems = num_sbar_problems(tree)
                log("Found {0} bad SBAR issues".format(bad_sbar_problems), 3)

                total_problems = bad_sbar_problems + bad_tag_problems + bad_generations
                log("In '{0}'".format(sentence), 2)
                log("Found {0} sentence formation problems".format(total_problems), 1)
                sentence_probs.append(total_problems)
    return sentence_probs
Example #13
def get_leaf_transitions():
    file_name = 'penn_leaf_transition_counts.data'

    try:
        f = open(os.path.join('cache', file_name), 'rb')
        data = pickle.load(f)
        f.close()
        return data
    except (IOError, EOFError):
        from tag_utils import is_valid_tag
        cmd_utils.log("Building leaf counts from Penn Treebank corpus", 1)
        f = open(os.path.join('cache', file_name), 'wb')

        for sentence in nltk.corpus.treebank.parsed_sents():
            leaves = list(sentence.subtrees(lambda x: len(x) > 0 and isinstance(x[0], basestring)))
            # Preterminal subtrees: n.node is the tag, n[0] is the word itself
            leaves = [n.node.split("-")[0] for n in leaves if is_valid_tag(n.node)]
            leaves = ['START'] + leaves
            store_transitions(leaves)

        cmd_utils.log("Finished building tag counts", 1)
        pickle.dump(store_transitions._counts, f)
        f.close()
        return store_transitions._counts
Example #14
def parse(text):
    log("Checking for coherence in '{0}'".format(text), 2)

    family_hits = []
    family_stem_words = stemmed_words(family_words, 'family_words')
    for sentence in sentence_tokenizer.parse(text):
        tree = parsers.parse(sentence)[0]
        family_hits += [(a_tree.node, a_tree[0].lower(), stemmer.stem(a_tree[0].lower()) in family_stem_words) for a_tree in tree.subtrees(lambda x: x.node in noun_tags)]
    log("Family hits: {0}".format(family_hits), 4)
    family_hit_values = (len([hit for hit in family_hits if hit[2]]), len(family_hits))
    log("%d/%d" % family_hit_values, 3)

    work_hits = []
    work_stem_words = stemmed_words(work_words, 'work_words')
    for sentence in sentence_tokenizer.parse(text):
        tree = parsers.parse(sentence)[0]
        work_hits += [(a_tree.node, a_tree[0].lower(), stemmer.stem(a_tree[0].lower()) in work_stem_words) for a_tree in tree.subtrees(lambda x: x.node in noun_tags)]
    log("Work hits: {0}".format(work_hits), 4)
    work_hit_values = (len([hit for hit in work_hits if hit[2]]), len(work_hits))
    log("%d/%d" % work_hit_values, 3)

    return family_hit_values[0], work_hit_values[0], work_hit_values[1]
Example #15
def get_transition_counts():
    file_name = 'penn_transition_counts.data'

    try:
        f = open(os.path.join('cache', file_name), 'rb')
        data = pickle.load(f)
        f.close()
        return data
    except (IOError, EOFError):
        cmd_utils.log("Building counts from Penn Treebank corpus", 1)
        f = open(os.path.join('cache', file_name), 'wb')

        for sentence in nltk.corpus.treebank.parsed_sents():
            all_transitions = tree_utils.transitions_in_tree(sentence)
            for transitions in all_transitions:
                transitions = ['START'] + transitions
                if len(transitions) > 1:
                    store_transitions(transitions)

        cmd_utils.log("Finished building tag counts", 1)
        pickle.dump(store_transitions._counts, f)
        f.close()
        return store_transitions._counts
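store_transitions itself is not included on this page; the call sites only require that it accept a list of tags and accumulate counts on a _counts attribute that is later pickled. A plausible stand-in, purely illustrative, is sketched below.

def store_transitions(transitions, gram_size=3):
    """Accumulate counts of tag n-grams (sizes 2..gram_size) on the function."""
    if not hasattr(store_transitions, '_counts'):
        store_transitions._counts = dict()
    counts = store_transitions._counts
    for n in range(2, gram_size + 1):
        for i in range(len(transitions) - n + 1):
            gram = tuple(transitions[i:i + n])
            counts[gram] = counts.get(gram, 0) + 1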
Example #16
def get_transition_counts():
    file_name = 'penn_transition_counts.data'

    try:
        f = open(os.path.join('cache', file_name), 'rb')
        data = pickle.load(f)
        f.close()
        return data
    except (IOError, EOFError):
        cmd_utils.log("Building counts from Penn Treebank corpus", 1)
        f = open(os.path.join('cache', file_name), 'wb')

        for sentence in nltk.corpus.treebank.parsed_sents():
            all_transitions = tree_utils.transitions_in_tree(sentence)
            for transitions in all_transitions:
                transitions = ['START'] + transitions
                if len(transitions) > 1:
                    store_transitions(transitions)

        cmd_utils.log("Finished building tag counts", 1)
        pickle.dump(store_transitions._counts, f)
        f.close()
        return store_transitions._counts
Example #17
def grade_1b(text):
    import agreement_utils
    rs = agreement_utils.parse(text)
    num_agreements, num_non_agreements, num_unsure = rs
    num_agreements_tested = sum(rs)
    if num_agreements_tested == 0:
        log("No possible agreements found in text", 2)
        return 0
    else:
        log("Sub Scores: %s" % (rs, ), 2)
        prob = float(num_agreements) / sum(rs)
        log("%d/%d -> %f" % (num_agreements, sum(rs), prob), 2)
        return floor(prob * 5)
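For example, 7 agreements out of 10 tested pairs gives prob = 7 / 10 = 0.7, so the returned grade is floor(0.7 * 5) = 3.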
Example #18
def grade_1b(text):
    import agreement_utils
    rs = agreement_utils.parse(text)
    num_agreements, num_non_agreements, num_unsure = rs
    num_agreements_tested = sum(rs)
    if num_agreements_tested == 0:
        log("No possible agreements found in text", 2)
        return 0
    else:
        log("Sub Scores: %s" % (rs,), 2)
        prob = float(num_agreements) / sum(rs)
        log("%d/%d -> %f" % (num_agreements, sum(rs), prob), 2)
        return floor(prob * 5)
Example #19
    sentences = sentence_tokenizer.parse(text)
    num_sentences = len(sentences)
    if num_sentences >= 6:
        return 5
    else:
        return max(num_sentences - 1, 1)

if __name__ == '__main__':
    import cmd_utils

    tests = cmd_utils.cmd_test()
    tests = [tests] if tests else ('1a', '1b', '1d', '2a', '2b', '3a')
    essay_index = int(cmd_utils.cmd_arg('--essay', 0)) - 1

    for test in tests:
        if essay_index >= 0:
            essay_text = "\n".join(essay_utils.essays[essay_index])
            received_grade = grade_text(essay_text, test)
            log("Expect %s score: %d" % (test, correct_essay_grade(essay_index, test)), 0)
            log("Received %s score: %d" % (test, received_grade), 0)
        else:
            print "Values for %s" % (test,)
            print "-------------"
            for i in range(0, len(essay_utils.essays)):
                essay_text = "\n".join(essay_utils.essays[i])
                received_grade = grade_text(essay_text, test)
                expected_grade = correct_essay_grade(i, test)
                diff = received_grade - expected_grade
                print " | ".join([str(s) for s in [(i + 1), expected_grade, received_grade, diff, abs(diff)]])
            print "\n\n"
Example #20
def issues_in_sentence(sentence, use_cache=True):
    """'Brute force' check for a bunch of possible word ordering issues.
    Specifically, looking for the following:
        - VP coming before NP in standard sentence
        - NP coming before VP in inverted sentence
        - JJ coming after Noun in NP
        - VB before PP in VP
        - VB before NP in VP
        - VP before S in standard sentence (with embedded sentences)
        - NN before CD in NP
        - NNP before CD in NP
    """
    if use_cache:
        result = cache_get('word_order_issues', sentence)
        if result is not None:
            return result

    tree = parsers.parse(sentence)[0]
    tree_utils.simplify_tree(tree, trim_adjecent_prop_nouns=True,
                             normalize_sent_roots=True,
                             normalize_plural=True,
                             normalize_case=True)

    log("Looking for order issues in: %s" % (sentence,), 1)
    if cmd_log_level() >= 4:
        print "Simplified Parse Tree"
        print tree

    problems = []
    problems += ["VP->NP in S"] * num_forbidden_orders(tree, ("S",), ('VP', 'NP'))
    problems += ["NP->VP in SINV"] * num_forbidden_orders(tree, ('SINV',), ('NP', 'VP'))
    problems += ["NN->JJ in NP"] * num_forbidden_orders(tree, ('NP',), ('NN', 'JP'))

    problems += ["PP->VB in VP"] * num_forbidden_orders(tree, ('VP',), ('PP', 'VB'))
    problems += ["NP->VP in VP"] * num_forbidden_orders(tree, ('VP',), ('NP', 'VP'))

    problems += ["S->VP in S"] * num_forbidden_orders(tree, ('S',), ('S', 'VP'))

    problems += ["S->VB in VP"] * num_forbidden_orders(tree, ('VP',), ('S', 'VB'))
    # problems += ["VB->VP in VP"] * num_forbidden_orders(tree, ('VP',), ('VB', 'VP'))

    problems += ["NP->RBR in ADVP"] * num_forbidden_orders(tree, ('ADVP',), ('NP', 'RBR'))
    problems += ["NN->DT in NP"] * num_forbidden_orders(tree, ('NP',), ('NN', 'DT'))
    problems += ["NNP->DT in NP"] * num_forbidden_orders(tree, ('NP',), ('NNP', 'DT'))
    problems += ["NN->CD in NP"] * num_forbidden_orders(tree, ('NP',), ('NN', 'CD'))
    problems += ["NNP->CD in NP"] * num_forbidden_orders(tree, ('NP',), ('NNP', 'CD'))

    problems += ['PP->NP in S'] * num_forbidden_orders(tree, ('S',), ('PP', 'NP'))

    # Toggle?
    problems += ['NP->VP in NP'] * num_forbidden_orders(tree, ('NP',), ('NP', 'VP'))

    # Seems like it should be VB->ADVP->PP
    problems += ['VB->PP->ADVP in VP'] * num_forbidden_orders(tree, ('VP',), ('VB', 'PP', 'ADVP'))
    problems += ['VB->PP->SBAR in VP'] * num_forbidden_orders(tree, ('VP',), ('VB', 'PP', 'SBAR'))

    problems += ['NP->S in NP'] * num_forbidden_orders(tree, ('NP',), ('NP', 'S'))

    # Seems like the ADJP should be in a NP or somewhere else, not a sibling
    # of a noun phrase
    problems += ['NP->ADJP in S'] * num_forbidden_orders(tree, ('S',), ('NP', 'ADJP'))

    # Last, if there is an S w/ only one child, we call it a word order problem...
    problems += ['Single Child S'] * len(list(tree.subtrees(lambda x: x.node in tree_utils.semi_tree_roots and len(x) == 1)))

    if tree[0].node not in tree_utils.semi_tree_roots and not hasattr(tree[0], '_has_error'):
        tree[0]._has_error = True
        problems += ['No S Root']

    log("Found %d order issues" % (len(problems),), 1)
    log("Issues: %s", (problems,), 2)

    if use_cache:
        cache_set('word_order_issues', sentence, problems)

    return problems
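The num_forbidden_orders helper is not shown in this listing. From the call sites it takes a tuple of parent labels and an ordered tuple of child labels and returns how many matching parents exhibit that ordering; a simplified, purely illustrative sketch (same old-NLTK .node API) could be:

def num_forbidden_orders(tree, parent_tags, ordered_child_tags):
    """Count parents whose children contain the given labels as an
    in-order subsequence (illustrative stand-in only)."""
    hits = 0
    for parent in tree.subtrees(lambda x: x.node in parent_tags):
        child_labels = [
            child.node for child in parent
            if not isinstance(child, basestring)
        ]
        position = 0
        for wanted in ordered_child_tags:
            try:
                position = child_labels.index(wanted, position) + 1
            except ValueError:
                break
        else:
            hits += 1
    return hits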
Example #21
        return max(num_sentences - 1, 1)


if __name__ == '__main__':
    import cmd_utils

    tests = cmd_utils.cmd_test()
    tests = [tests] if tests else ('1a', '1b', '1d', '2a', '2b', '3a')
    essay_index = int(cmd_utils.cmd_arg('--essay', 0)) - 1

    for test in tests:
        if essay_index >= 0:
            essay_text = "\n".join(essay_utils.essays[essay_index])
            received_grade = grade_text(essay_text, test)
            log(
                "Expect %s score: %d" %
                (test, correct_essay_grade(essay_index, test)), 0)
            log("Received %s score: %d" % (test, received_grade), 0)
        else:
            print "Values for %s" % (test, )
            print "-------------"
            for i in range(0, len(essay_utils.essays)):
                essay_text = "\n".join(essay_utils.essays[i])
                received_grade = grade_text(essay_text, test)
                expected_grade = correct_essay_grade(i, test)
                diff = received_grade - expected_grade
                print " | ".join([
                    str(s)
                    for s in [(i + 1), expected_grade, received_grade, diff,
                              abs(diff)]
                ])
Example #22
def is_possible_sentence(tree):
    """Perform some basic filtering to remove unlikely constructs, like
    starting a sentence with because"""
    leaf_trees = tree.subtrees(lambda x: x.height() == 2)
    leaf_nodes = [n.node for n in leaf_trees]

    if leaf_nodes[0] in invalid_boundary_tags:
        log("Rejecting sentence because it starts with an invalid boundary tag: %s" % (leaf_nodes[0],), 3)
        return False
    elif leaf_nodes[-1] in invalid_boundary_tags:
        log("Rejecting sentence because it ends with an invalid boundary tag: %s" % (leaf_nodes[-1],), 3)
        return False
    elif leaf_nodes[0] == "PP":
        log("Rejecting sentence because it starts with PP", 3)
        return False
    else:
        flatten_tags = []
        useful_roots = list(tree.subtrees(lambda x: (x.node in semi_tree_roots) and len(x) > 1))

        if len(useful_roots) == 0 or len(useful_roots[0]) < 2:
            log("Rejecting sentence becuase can't find a useful root", 3)
            return False

        sub_tree = useful_roots[0]

        for sub_sub_tree in sub_tree:
            flatten_tags.append(tag_utils.simplify_tag(sub_sub_tree.node))

        sen_is_inverted = tree[0].node == "SINV"

        if sen_is_inverted:
            early_set = ("VP", "VB")
            late_set = ("NP",)
        else:
            early_set = ("NP", "NN")
            late_set = ("VP", "ADJP")

        try:
            earliest_index = min([flatten_tags.index(tag) for tag in early_set if tag in flatten_tags])
            latest_index = max([flatten_tags.index(tag) for tag in late_set if tag in flatten_tags])
            if earliest_index > latest_index:
                if sen_is_inverted:
                    log("Rejecting possible sentence because earliest NP like tag occurs before earliest VP like tag (%d vs %d)" % (earliest_index, latest_index), 3)
                else:
                    log("Rejecting possible sentence because earliest VP like tag occurs before earliest NP like tag (%d vs %d) and sentence parse SINV" % (earliest_index, latest_index), 3)
                return False
            else:
                return True
        except ValueError:
            log("Rejecting possible sentence because the head structure doesn't look like a valid parse", 3)
            return False
Example #23
def parse_sentences(line, use_cache=True, include_prob=False):

    log("Working on: %s" % (line,), 2)

    if use_cache:
        correct_parse = cache_get("sentence_tokenizer", line)
        if correct_parse:
            log("Cache Hit: %s" % (correct_parse[0],), 4)
            log("-------------\n", 4)
            return correct_parse if include_prob else correct_parse[0]

    all_possible_sentences = _possible_sentences_in_line(line)
    all_possible_sentence_probs = []
    invalid_possible_sentences = []
    stored_probs = {}

    for possible_sentences in all_possible_sentences:

        log("Examining: %s" % (possible_sentences,), 1)
        prob_for_sentences = []
        sent_is_impossible = False

        for possible_sentence in possible_sentences:

            if use_cache:
                possible_sentence_prob = cache_get('possible_sentences', possible_sentence)
                if possible_sentence_prob is not None:
                    log("Cache Hit: %s (from %s)" % (possible_sentence, 'possible sentences'), 4)
                    prob_for_sentences.append(possible_sentence_prob)
                    continue

            if contains_any_invalid_setences(possible_sentences, invalid_possible_sentences) or sent_is_impossible:
                prob_for_sentences.append(0)
                continue
            elif possible_sentence in stored_probs:
                prob_for_sentences.append(stored_probs[possible_sentence])
                continue

            sentence_trees = parsers.parse(possible_sentence)
            if len(sentence_trees) == 0:
                log("Wasn't able to parse input %s" % (possible_sentence,), 0)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                continue
            else:
                sentence_tree = sentence_trees[0]

            if cmd_log_level() >= 4:
                print "--------"
                print "Pre Simplified Tree"
                print sentence_tree

            tree_utils.simplify_tree(sentence_tree,
                                     remove_starting_cc=possible_sentences.index(possible_sentence) == 0)

            if cmd_log_level() >= 4:
                print "--------"
                print "Post Simplified Tree"
                print sentence_tree

            sentence_transitions = tree_utils.transitions_in_tree(sentence_tree)

            if not is_possible_sentence(sentence_tree):
                log("%s" % (sentence_transitions,), 2)
                log("Invalid parse", 2)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                if use_cache:
                    cache_set('possible_sentences', possible_sentence, 0)
            else:
                log("%s" % (sentence_transitions,), 2)
                sentence_probs = []
                for transition in sentence_transitions:
                    try:
                        probs = hmm_utils.prob_of_all_transitions(transition, counts, gram_size=3)
                    except KeyError, e:
                        log("'Imposible' Tag order", 2, sep=' ** ')
                        log("%s" % (e,), 2, sep=' ** ')
                        probs = [0]
                    sentence_probs += probs
                    log("Transitions: %s" % (transition,), 3)
                    log("Probabilities: %s" % (probs,), 3)

                attempt_sentence_prob = prod(sentence_probs)

                sentence_prob_boost = boost_for_sentence_tree(sentence_tree)
                attempt_sentence_prob *= sentence_prob_boost

                prob_for_sentences.append(attempt_sentence_prob)
                stored_probs[possible_sentence] = attempt_sentence_prob
                if use_cache:
                    cache_set('possible_sentences', possible_sentence, attempt_sentence_prob)
        weighted_score = prod(prob_for_sentences) * (weight ** (len(possible_sentences) - 1))
        if weighted_score > 0:
            log("Valid Parse: %s" % (possible_sentences,), 2)
            log(weighted_score, 2)

        all_possible_sentence_probs.append(weighted_score)
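hmm_utils.prob_of_all_transitions is not shown either; from this call site it takes one tag sequence plus the Treebank transition counts and returns one probability per trigram step, raising KeyError for an unseen context (which the caller logs as an "impossible" tag order). A rough illustrative sketch under those assumptions, with counts mapping tag tuples to frequencies:

def prob_of_all_transitions(transition, counts, gram_size=3):
    """Estimate P(tag | previous gram_size-1 tags) for each step from counts.

    Illustrative only; assumes counts holds both full n-gram and
    context (n-1 gram) frequencies, keyed by tag tuples.
    """
    probs = []
    for i in range(len(transition) - gram_size + 1):
        gram = tuple(transition[i:i + gram_size])
        context = gram[:-1]
        # A missing key here is the KeyError the caller treats as an
        # "impossible" tag order
        probs.append(float(counts[gram]) / counts[context])
    return probs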
Example #24
def parse_sentences(line, use_cache=True, include_prob=False):

    log("Working on: %s" % (line, ), 2)

    if use_cache:
        correct_parse = cache_get("sentence_tokenizer", line)
        if correct_parse:
            log("Cache Hit: %s" % (correct_parse[0], ), 4)
            log("-------------\n", 4)
            return correct_parse if include_prob else correct_parse[0]

    all_possible_sentences = _possible_sentences_in_line(line)
    all_possible_sentence_probs = []
    invalid_possible_sentences = []
    stored_probs = {}

    for possible_sentences in all_possible_sentences:

        log("Examining: %s" % (possible_sentences, ), 1)
        prob_for_sentences = []
        sent_is_impossible = False

        for possible_sentence in possible_sentences:

            if use_cache:
                possible_sentence_prob = cache_get('possible_sentences',
                                                   possible_sentence)
                if possible_sentence_prob is not None:
                    log(
                        "Cache Hit: %s (from %s)" %
                        (possible_sentence, 'possible sentences'), 4)
                    prob_for_sentences.append(possible_sentence_prob)
                    continue

            if contains_any_invalid_setences(
                    possible_sentences,
                    invalid_possible_sentences) or sent_is_impossible:
                prob_for_sentences.append(0)
                continue
            elif possible_sentence in stored_probs:
                prob_for_sentences.append(stored_probs[possible_sentence])
                continue

            sentence_trees = parsers.parse(possible_sentence)
            if len(sentence_trees) == 0:
                log("Wasn't able to parse input %s" % (possible_sentence, ), 0)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                continue
            else:
                sentence_tree = sentence_trees[0]

            if cmd_log_level() >= 4:
                print "--------"
                print "Pre Simplified Tree"
                print sentence_tree

            tree_utils.simplify_tree(
                sentence_tree,
                remove_starting_cc=possible_sentences.index(
                    possible_sentence) == 0)

            if cmd_log_level() >= 4:
                print "--------"
                print "Post Simplified Tree"
                print sentence_tree

            sentence_transitions = tree_utils.transitions_in_tree(
                sentence_tree)

            if not is_possible_sentence(sentence_tree):
                log("%s" % (sentence_transitions, ), 2)
                log("Invalid parse", 2)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                if use_cache:
                    cache_set('possible_sentences', possible_sentence, 0)
            else:
                log("%s" % (sentence_transitions, ), 2)
                sentence_probs = []
                for transition in sentence_transitions:
                    try:
                        probs = hmm_utils.prob_of_all_transitions(transition,
                                                                  counts,
                                                                  gram_size=3)
                    except KeyError, e:
                        log("'Imposible' Tag order", 2, sep=' ** ')
                        log("%s" % (e, ), 2, sep=' ** ')
                        probs = [0]
                    sentence_probs += probs
                    log("Transitions: %s" % (transition, ), 3)
                    log("Probabilities: %s" % (probs, ), 3)

                attempt_sentence_prob = prod(sentence_probs)

                sentence_prob_boost = boost_for_sentence_tree(sentence_tree)
                attempt_sentence_prob *= sentence_prob_boost

                prob_for_sentences.append(attempt_sentence_prob)
                stored_probs[possible_sentence] = attempt_sentence_prob
                if use_cache:
                    cache_set('possible_sentences', possible_sentence,
                              attempt_sentence_prob)
        weighted_score = prod(prob_for_sentences) * (weight**(
            len(possible_sentences) - 1))
        if weighted_score > 0:
            log("Valid Parse: %s" % (possible_sentences, ), 2)
            log(weighted_score, 2)

        all_possible_sentence_probs.append(weighted_score)
Example #25
def parse(text, use_cache=True):
    num_agrees = 0
    num_not_agrees = 0
    num_unsure = 0

    lines = text.split("\n")
    for line in lines:
        sentences = sentence_tokenizer.parse(line, use_cache=use_cache)
        for sentence in sentences:

            line_agreements, line_non_agreements, line_unsure = 0, 0, 0

            # Possession seems to be tricky for the parser, so we fudge
            # a little here
            sentence = sentence.replace("'s", '')
            if sentence[-1] != ".":
                sentence += "."

            if use_cache:
                cache_rs = cache_utils.cache_get('sub_verb_agreement', sentence)
                if cache_rs:
                    line_agreements, line_non_agreements, line_unsure = cache_rs
                    num_agrees += line_agreements
                    num_not_agrees += line_non_agreements
                    num_unsure += line_unsure
                    continue

            log("Looking for Sub-Verb agreement in '%s'" % (sentence,), 1)

            tree = parsers.parse(sentence)[0]
            dependencies = parsers.dependences(sentence)
            sub_verb_deps = [dep for dep in dependencies if dep['dep_name'] == 'nsubj']

            if len(sub_verb_deps) == 0:
                log("Couldn't find Subject-Verb dependency info", 1)
                cache_utils.cache_set('sub_verb_agreement', sentence, (0, 0, 0))
                continue

            for sub_verb in sub_verb_deps:
                first_node = node_in_tree(tree, sub_verb['first_word'])
                sec_node = node_in_tree(tree, sub_verb['second_word'])
                if first_node and sec_node:

                    log("First Dep Node: %s" % (first_node,), 2)
                    log("Sec Dep Node: %s" % (sec_node,), 2)

                    try:
                        is_agreement = check_node_agreement(first_node, sec_node)
                        if is_agreement:
                            line_agreements += 1
                        else:
                            line_non_agreements += 1
                        log("Agreement in sentence? %s" % (is_agreement,), 1)
                    except Exception as e:
                        line_unsure += 1
                        log("Error looking for agreement? %s" % (e.message,), 2)

                        # No agreement in pair.  Not sure how to handle.
                        # More exhaustive search?
            if use_cache:
                cache_utils.cache_set('sub_verb_agreement', sentence, (line_agreements, line_non_agreements, line_unsure))
            num_agrees += line_agreements
            num_not_agrees += line_non_agreements
            num_unsure += line_unsure

    return num_agrees, num_not_agrees, num_unsure
Example #26
def parse(text):
    # Strip numbers out, since that seems to cause problems for my approach
    text = re.sub(r'\d+ ?', 'some ', text)

    sentences = sentence_tokenizer.parse(text)
    sentence_pronouns = []

    for sentence in sentences:
        log("Looking for pronouns in '{0}'".format(sentence), 2)

        pronoun_totals = [[], [], []]
        tree = parsers.parse(sentence)[0]
        pronoun_trees = tree.subtrees(lambda x: x.node in pronoun_tags)
        for pronoun_tree in pronoun_trees:
            # First total up all the first person pronouns
            for i in range(3):
                if pronoun_tree[0].lower() in pronouns[i]:
                    pronoun_totals[i].append(pronoun_tree[0])
        log("First Person '{0}'".format(pronoun_totals[0]), 3)
        log("Second Person '{0}'".format(pronoun_totals[1]), 3)
        log("Third Person '{0}'".format(pronoun_totals[2]), 3)
        sentence_pronouns.append(pronoun_totals)

    log("Pronouns found in text: %s" % (sentence_pronouns), 2)

    # If there are 3rd person pronouns in any sentence, we have to decide
    # if they are used correctly.  We do this in the following, very
    # expensive, but possibly correct manner.
    #
    # Start from the top down
    #   1. Look back 2 sentences and see if we can find a reference.
    #       IF NOT - it's an error and do no more
    #   2. If so, replace the referenced word with "RUNNING"
    #      and search again, to see if there is a previous word it could refer
    #      to.
    #       IF NOT, it's correct.  Replace the pronoun with the referenced word
    #       and continue
    #   3. Else, it's not felicitous.  Give bad credit
    for i in range(len(sentences)):
        if len(sentence_pronouns[i][2]) > 0:
            pronoun_results = []
            for third_pronoun in sentence_pronouns[i][2]:
                all_sentences = sentences[max(0, i - 2):i + 1]
                norm_sentences = ". ".join(
                    [a_sen.strip(".") for a_sen in all_sentences]) + "."
                log(
                    "Looking for pronoun coherence for '{0}'".format(
                        norm_sentences), 4)
                pronouns_refs = parsers.parse_coref(norm_sentences)

                log("Recieved co-references {0}".format(pronouns_refs), 5)

                found_bundle = False

                for j in range(len(pronouns_refs)):
                    if third_pronoun == pronouns_refs[j]['pronoun']:
                        found_bundle = pronouns_refs[j]
                        break

                if not found_bundle:
                    log("Found NO anticedent for {0}".format(third_pronoun), 3)
                    pronoun_results.append((third_pronoun, -1))
                else:
                    log("Found anticedent for {0}".format(third_pronoun), 3)
                    ref_index = int(found_bundle['ref_sentence']) - 1 + (i - 2)

                    sentences[ref_index] = sentences[ref_index].replace(
                        found_bundle['ref'], 'RUNNING')
                    log(
                        "Replacing '{0}' with 'RUNNING'".format(
                            found_bundle['ref']), 3)

                    altered_sentences = sentences[max(0, i - 2):i + 1]
                    norm_altered_sentences = ". ".join(
                        [a_sen.strip(".")
                         for a_sen in altered_sentences]) + "."
                    log(
                        "New test sentences are '{0}'".format(
                            norm_altered_sentences), 4)
                    altered_pronouns_refs = parsers.parse_coref(
                        norm_altered_sentences)

                    if third_pronoun not in [
                            a_ref['pronoun'] for a_ref in altered_pronouns_refs
                    ]:
                        log("Anticedent is unambigious!", 3)

                        pro_index = int(
                            found_bundle['pronoun_sentence']) - 1 + (i - 2)
                        sentences[pro_index] = sentences[pro_index].replace(
                            found_bundle['pronoun'], found_bundle['ref'])

                        pronoun_results.append(
                            (third_pronoun, found_bundle['ref']))
                    else:
                        log("Anticedent is ambigious", 3)
                        log("New Sentences: {0}".format(altered_pronouns_refs),
                            4)
                        pronoun_results.append((third_pronoun, .5))
            sentence_pronouns[i][2] = pronoun_results
    return sentence_pronouns
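The co-reference bundles consumed above come from parsers.parse_coref, which is not shown on this page. The keys read here imply that each bundle records the pronoun, its antecedent, and the 1-based index of the sentence each appears in; an assumed illustration of that shape:

# Assumed shape of one parse_coref() entry, based only on the keys this
# function reads; the real structure may carry additional fields.
example_bundle = {
    'pronoun': 'she',
    'pronoun_sentence': '2',   # 1-based index within the text passed in
    'ref': 'Alice',
    'ref_sentence': '1',
}
ref_index = int(example_bundle['ref_sentence']) - 1  # back to 0-based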
Example #27
def parse(text):
    # Strip numbers out, since that seems to cause problems for my approach
    text = re.sub(r'\d+ ?', 'some ', text)

    sentences = sentence_tokenizer.parse(text)
    sentence_pronouns = []

    for sentence in sentences:
        log("Looking for pronouns in '{0}'".format(sentence), 2)

        pronoun_totals = [[], [], []]
        tree = parsers.parse(sentence)[0]
        pronoun_trees = tree.subtrees(lambda x: x.node in pronoun_tags)
        for pronoun_tree in pronoun_trees:
            # First total up all the first person pronouns
            for i in range(3):
                if pronoun_tree[0].lower() in pronouns[i]:
                    pronoun_totals[i].append(pronoun_tree[0])
        log("First Person '{0}'".format(pronoun_totals[0]), 3)
        log("Second Person '{0}'".format(pronoun_totals[1]), 3)
        log("Third Person '{0}'".format(pronoun_totals[2]), 3)
        sentence_pronouns.append(pronoun_totals)

    log("Pronouns found in text: %s" % (sentence_pronouns), 2)

    # If there are 3rd person pronouns in any sentence, we have to decide
    # if they are used correctly.  We do this in the following, very
    # expensive, but possibly correct manner.
    #
    # Start from the top down
    #   1. Look back 2 sentences and see if we can find a reference.
    #       IF NOT - it's an error and do no more
    #   2. If so, replace the referenced word with "RUNNING"
    #      and search again, to see if there is a previous word it could refer
    #      to.
    #       IF NOT, it's correct.  Replace the pronoun with the referenced word
    #       and continue
    #   3. Else, it's not felicitous.  Give bad credit
    for i in range(len(sentences)):
        if len(sentence_pronouns[i][2]) > 0:
            pronoun_results = []
            for third_pronoun in sentence_pronouns[i][2]:
                all_sentences = sentences[max(0, i - 2):i + 1]
                norm_sentences = ". ".join([a_sen.strip(".") for a_sen in all_sentences]) + "."
                log("Looking for pronoun coherence for '{0}'".format(norm_sentences), 4)
                pronouns_refs = parsers.parse_coref(norm_sentences)

                log("Recieved co-references {0}".format(pronouns_refs), 5)

                found_bundle = False

                for j in range(len(pronouns_refs)):
                    if third_pronoun == pronouns_refs[j]['pronoun']:
                        found_bundle = pronouns_refs[j]
                        break

                if not found_bundle:
                    log("Found NO anticedent for {0}".format(third_pronoun), 3)
                    pronoun_results.append((third_pronoun, -1))
                else:
                    log("Found anticedent for {0}".format(third_pronoun), 3)
                    ref_index = int(found_bundle['ref_sentence']) - 1 + (i - 2)

                    sentences[ref_index] = sentences[ref_index].replace(found_bundle['ref'], 'RUNNING')
                    log("Replacing '{0}' with 'RUNNING'".format(found_bundle['ref']), 3)

                    altered_sentences = sentences[max(0, i - 2):i + 1]
                    norm_altered_sentences = ". ".join([a_sen.strip(".") for a_sen in altered_sentences]) + "."
                    log("New test sentences are '{0}'".format(norm_altered_sentences), 4)
                    altered_pronouns_refs = parsers.parse_coref(norm_altered_sentences)

                    if third_pronoun not in [a_ref['pronoun'] for a_ref in altered_pronouns_refs]:
                        log("Anticedent is unambigious!", 3)

                        pro_index = int(found_bundle['pronoun_sentence']) - 1 + (i - 2)
                        sentences[pro_index] = sentences[pro_index].replace(found_bundle['pronoun'], found_bundle['ref'])

                        pronoun_results.append((third_pronoun, found_bundle['ref']))
                    else:
                        log("Anticedent is ambigious", 3)
                        log("New Sentences: {0}".format(altered_pronouns_refs), 4)
                        pronoun_results.append((third_pronoun, .5))
            sentence_pronouns[i][2] = pronoun_results
    return sentence_pronouns
Example #28
def simplify_tree(
    tree,
    remove_starting_cc=False,
    trim_adjecent_prop_nouns=False,
    normalize_sent_roots=False,
    normalize_case=False,
    normalize_plural=False,
    collapse_redundant_sbar=True,
):
    """Do some transformations on a parse tree to normalize it.  Currently:
        - Remove CD when they're paired with a noun
        - Stripping off conjunction from the beginning of the sentence / root
        of the tree
        - Remove proper nouns that are next to each other
    """
    if normalize_plural:
        plural_transforms = dict(NNS="NN", NNPS="NNP")
        for a_tree in tree.subtrees(lambda x: x.node in plural_transforms):
            a_tree.node = plural_transforms[a_tree.node]

    if normalize_case:
        case_transforms = dict(VBD="VB", VBG="VB", VBN="VB", VBP="VB", VBZ="VB")
        for a_tree in tree.subtrees(lambda x: x.node in case_transforms):
            a_tree.node = case_transforms[a_tree.node]

    if normalize_sent_roots:
        for a_tree in tree.subtrees(lambda x: x.node in semi_tree_roots):
            a_tree.node = "S"

    if trim_adjecent_prop_nouns:
        np_trees = list(tree.subtrees(lambda x: x.node == "NP"))
        if len(np_trees) > 0:
            for np_tree in np_trees:
                num_leaves = len(np_tree)
                change = False
                if num_leaves >= 2:
                    for i in range(0, num_leaves - 1):
                        if not change:
                            if np_tree[i].node == "NNP" and np_tree[i + 1].node == "NNP":
                                np_tree.remove(np_tree[i + 1])
                                change = True

    if remove_starting_cc:
        useful_roots = list(tree.subtrees(lambda x: x.node in semi_tree_roots))
        if len(useful_roots) > 0:
            useful_root = useful_roots[0]
            if useful_root[0].node == "CC":
                useful_root.remove(useful_root[0])
                log("REMOVED CC from start of sentence", 2)

    cd_trees = tree.subtrees(lambda x: x.node == "CD")
    for cd_tree in cd_trees:
        parent_node = cd_tree.parent()
        if parent_node.node == "NP":
            parent_node_children = [parent_node[i].node for i in range(0, len(parent_node))]
            if "CD" in parent_node_children and ("NN" in parent_node_children or "NNS" in parent_node_children):
                parent_node.remove(cd_tree)
                log("REMOVED only child CD node", 2)

    if collapse_redundant_sbar:
        for sbar_tree in tree.subtrees(lambda x: x.node == "SBAR"):
            if len(sbar_tree) == 1 and sbar_tree[0].node in semi_tree_roots:
                sbar_child = sbar_tree[0]
                sbar_parent = sbar_tree.parent()
                index = sbar_parent.index(sbar_tree)
                sbar_parent.remove(sbar_tree)
                sbar_tree.remove(sbar_child)
                sbar_parent.insert(index, sbar_child)
                log("Collapsed SBAR", 2)
Example #29
def is_possible_sentence(tree):
    """Perform some basic filtering to remove unlikely constructs, like
    starting a sentence with because"""
    leaf_trees = tree.subtrees(lambda x: x.height() == 2)
    leaf_nodes = [n.node for n in leaf_trees]

    if leaf_nodes[0] in invalid_boundary_tags:
        log(
            "Rejecting sentence because it starts with an invalid boundary tag: %s"
            % (leaf_nodes[0], ), 3)
        return False
    elif leaf_nodes[-1] in invalid_boundary_tags:
        log(
            "Rejecting sentence because it ends with an invalid boundary tag: %s"
            % (leaf_nodes[-1], ), 3)
        return False
    elif leaf_nodes[0] == "PP":
        log("Rejecting sentence because it starts with PP", 3)
        return False
    else:
        flatten_tags = []
        useful_roots = list(
            tree.subtrees(lambda x:
                          (x.node in semi_tree_roots) and len(x) > 1))

        if len(useful_roots) == 0 or len(useful_roots[0]) < 2:
            log("Rejecting sentence becuase can't find a useful root", 3)
            return False

        sub_tree = useful_roots[0]

        for sub_sub_tree in sub_tree:
            flatten_tags.append(tag_utils.simplify_tag(sub_sub_tree.node))

        sen_is_inverted = tree[0].node == "SINV"

        if sen_is_inverted:
            early_set = ("VP", "VB")
            late_set = ("NP", )
        else:
            early_set = ("NP", "NN")
            late_set = ("VP", "ADJP")

        try:
            earliest_index = min([
                flatten_tags.index(tag) for tag in early_set
                if tag in flatten_tags
            ])
            latest_index = max([
                flatten_tags.index(tag) for tag in late_set
                if tag in flatten_tags
            ])
            if earliest_index > latest_index:
                if sen_is_inverted:
                    log(
                        "Rejecting possible sentence because earliest NP like tag occurs before earliest VP like tag (%d vs %d)"
                        % (earliest_index, latest_index), 3)
                else:
                    log(
                        "Rejecting possible sentence because earliest VP like tag occurs before earliest NP like tag (%d vs %d) and sentence parse SINV"
                        % (earliest_index, latest_index), 3)
                return False
            else:
                return True
        except ValueError:
            log(
                "Rejecting possible sentence because the head structure doesn't look like a valid parse",
                3)
            return False
Example #30
def check_node_agreement(tree_one, tree_two):

    # First determine which node is the noun node
    if tree_one.node in noun_tags and tree_two.node in noun_tags:
        best_pair = select_best_noun_verb(tree_one, tree_two)
        if best_pair:
            noun_tree, verb_tree = best_pair
        else:
            return False
    elif tree_one.node in noun_tags:
        noun_tree, verb_tree = tree_one, tree_two
    elif tree_two.node in noun_tags:
        verb_tree, noun_tree = tree_one, tree_two
    else:
        raise Exception("No noun tree in this agreement pair!")

    if noun_tree.node in singluar_noun_tags:
        noun_3rd_person = True
        noun_singular = True
    elif noun_tree.node in plural_noun_tags:
        noun_3rd_person = True
        noun_singular = False
    # In a pronoun situation, so we need to disambiguate
    elif noun_tree.node == "PRP":
        noun_3rd_person = not is_pronoun_first_person(noun_tree)
        noun_singular = is_pronoun_singluar(noun_tree)
    else:
        raise Exception("Received some unrecognized noun tag: %s" % (noun_tree.node,))

    if verb_tree.node not in verb_tags:
        closest_verb_tree = find_commanding_verb_tree(verb_tree)
        if closest_verb_tree:
            verb_tree = closest_verb_tree[0]

    if verb_tree.node not in verb_tags:
        raise Exception("No verb in this agreement pair!")
    if verb_tree.node in singular_verb_tags:
        verb_singular = True
    elif verb_tree.node in plural_verb_tags:
        verb_singular = False
    else:
        verb_singular = True

    log("Noun: Looks like '%s-%s' is %s (%s)" % (noun_tree[0], noun_tree.node, 'Singular' if noun_singular else 'Plural', "3rd" if noun_3rd_person else "1st"), 2)
    log("Verb: Looks like '%s-%s' is '%s" % (verb_tree[0], verb_tree.node, 'Singular' if verb_singular else 'Plural'), 2)

    noun_1st_person = not noun_3rd_person
    is_vbp = verb_tree.node == "VBP"
    is_vbz = verb_tree.node == "VBZ"

    if verb_tree.node in general_verb_tags:
        return True
    elif noun_singular and noun_1st_person and is_vbp:
        return True
    elif noun_singular and noun_3rd_person and is_vbz:
        return True
    elif not noun_singular and noun_3rd_person and is_vbp:
        return True
    else:
        log("DONT LIKE COMBO: %s" % ({"verb_tag": verb_tree.node, "noun_1st_person": noun_1st_person, "noun_singular": noun_singular},), 2)
        return False
Example #31
                sentence_prob_boost = boost_for_sentence_tree(sentence_tree)
                attempt_sentence_prob *= sentence_prob_boost

                prob_for_sentences.append(attempt_sentence_prob)
                stored_probs[possible_sentence] = attempt_sentence_prob
                if use_cache:
                    cache_set('possible_sentences', possible_sentence, attempt_sentence_prob)
        weighted_score = prod(prob_for_sentences) * (weight ** (len(possible_sentences) - 1))
        if weighted_score > 0:
            log("Valid Parse: %s" % (possible_sentences,), 2)
            log(weighted_score, 2)

        all_possible_sentence_probs.append(weighted_score)
    max_prob = max(all_possible_sentence_probs)
    parse_for_max_prob = all_possible_sentences[all_possible_sentence_probs.index(max_prob)]
    log("All Probs: %s" % (all_possible_sentence_probs,), 2)
    log("MAX Prob: %f" % (max_prob,), 2)
    log("Parse for max prob: %s" % (parse_for_max_prob,), 2)
    log("Best Guess Num Sentences: %d" % (len(parse_for_max_prob),), 1)
    log("-------------\n\n", 1)

    if use_cache:
        cache_set("sentence_tokenizer", line, (parse_for_max_prob, max_prob))
    return (parse_for_max_prob, max_prob) if include_prob else parse_for_max_prob


if __name__ == '__main__':
    ## Simple method for testing from STDIN
    if use_stdin:
        print parse_sentences(cmd_utils.get_stdin())
    else:
Example #32
                prob_for_sentences.append(attempt_sentence_prob)
                stored_probs[possible_sentence] = attempt_sentence_prob
                if use_cache:
                    cache_set('possible_sentences', possible_sentence,
                              attempt_sentence_prob)
        weighted_score = prod(prob_for_sentences) * (weight**(
            len(possible_sentences) - 1))
        if weighted_score > 0:
            log("Valid Parse: %s" % (possible_sentences, ), 2)
            log(weighted_score, 2)

        all_possible_sentence_probs.append(weighted_score)
    max_prob = max(all_possible_sentence_probs)
    parse_for_max_prob = all_possible_sentences[
        all_possible_sentence_probs.index(max_prob)]
    log("All Probs: %s" % (all_possible_sentence_probs, ), 2)
    log("MAX Prob: %f" % (max_prob, ), 2)
    log("Parse for max prob: %s" % (parse_for_max_prob, ), 2)
    log("Best Guess Num Sentences: %d" % (len(parse_for_max_prob), ), 1)
    log("-------------\n\n", 1)

    if use_cache:
        cache_set("sentence_tokenizer", line, (parse_for_max_prob, max_prob))
    return (parse_for_max_prob,
            max_prob) if include_prob else parse_for_max_prob


if __name__ == '__main__':
    ## Simple method for testing from STDIN
    if use_stdin:
        print parse_sentences(cmd_utils.get_stdin())
Example #33
def parse(text, use_cache=True):
    num_agrees = 0
    num_not_agrees = 0
    num_unsure = 0

    lines = text.split("\n")
    for line in lines:
        sentences = sentence_tokenizer.parse(line, use_cache=use_cache)
        for sentence in sentences:

            line_agreements, line_non_agreements, line_unsure = 0, 0, 0

            # Possession seems to be tricky for the parser, so we fudge
            # a little here
            sentence = sentence.replace("'s", '')
            if sentence[-1] != ".":
                sentence += "."

            if use_cache:
                cache_rs = cache_utils.cache_get('sub_verb_agreement',
                                                 sentence)
                if cache_rs:
                    line_agreements, line_non_agreements, line_unsure = cache_rs
                    num_agrees += line_agreements
                    num_not_agrees += line_non_agreements
                    num_unsure += line_unsure
                    continue

            log("Looking for Sub-Verb agreement in '%s'" % (sentence, ), 1)

            tree = parsers.parse(sentence)[0]
            dependencies = parsers.dependences(sentence)
            sub_verb_deps = [
                dep for dep in dependencies if dep['dep_name'] == 'nsubj'
            ]

            if len(sub_verb_deps) == 0:
                log("Couldn't find Subject-Verb dependency info", 1)
                cache_utils.cache_set('sub_verb_agreement', sentence,
                                      (0, 0, 0))
                continue

            for sub_verb in sub_verb_deps:
                first_node = node_in_tree(tree, sub_verb['first_word'])
                sec_node = node_in_tree(tree, sub_verb['second_word'])
                if first_node and sec_node:

                    log("First Dep Node: %s" % (first_node, ), 2)
                    log("Sec Dep Node: %s" % (sec_node, ), 2)

                    try:
                        is_agreement = check_node_agreement(
                            first_node, sec_node)
                        if is_agreement:
                            line_agreements += 1
                        else:
                            line_non_agreements += 1
                        log("Agreement in sentence? %s" % (is_agreement, ), 1)
                    except Exception as e:
                        line_unsure += 1
                        log("Error looking for agreement? %s" % (e, ), 2)

                        # No agreement in pair.  Not sure how to handle.
                        # More exhaustive search?
            if use_cache:
                cache_utils.cache_set(
                    'sub_verb_agreement', sentence,
                    (line_agreements, line_non_agreements, line_unsure))
            num_agrees += line_agreements
            num_not_agrees += line_non_agreements
            num_unsure += line_unsure

    return num_agrees, num_not_agrees, num_unsure
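
A hedged usage sketch for the snippet above: `parse` returns raw counts of agreeing, non-agreeing, and undecided subject-verb pairs, so a caller can turn them into a simple ratio. This assumes the snippet's module is importable and that its `parsers` and `sentence_tokenizer` backends are configured; the sample text is made up.

if __name__ == '__main__':
    # Counts of (agreeing, non-agreeing, undecided) subject-verb pairs.
    agrees, disagrees, unsure = parse("The dogs barks loudly. She runs.")
    checked = agrees + disagrees
    if checked:
        print "Agreement: %d/%d pairs (%d unsure)" % (agrees, checked, unsure)
    else:
        print "No subject-verb pairs could be checked"
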
Example #34
0
def check_node_agreement(tree_one, tree_two):

    # First determine which node is the noun node
    if tree_one.node in noun_tags and tree_two.node in noun_tags:
        best_pair = select_best_noun_verb(tree_one, tree_two)
        if best_pair:
            noun_tree, verb_tree = best_pair
        else:
            return False
    elif tree_one.node in noun_tags:
        noun_tree, verb_tree = tree_one, tree_two
    elif tree_two.node in noun_tags:
        verb_tree, noun_tree = tree_one, tree_two
    else:
        raise Exception("No noun tree in this agreement pair!")

    if noun_tree.node in singluar_noun_tags:
        noun_3rd_person = True
        noun_singular = True
    elif noun_tree.node in plural_noun_tags:
        noun_3rd_person = True
        noun_singular = False
    # We are in a pronoun situation and need to disambiguate person and number
    elif noun_tree.node == "PRP":
        noun_3rd_person = not is_pronoun_first_person(noun_tree)
        noun_singular = is_pronoun_singluar(noun_tree)
    else:
        raise Exception("Received some unrecognized noun tag: %s" %
                        (noun_tree.node, ))

    if verb_tree.node not in verb_tags:
        closest_verb_tree = find_commanding_verb_tree(verb_tree)
        if closest_verb_tree:
            verb_tree = closest_verb_tree[0]

    if verb_tree.node not in verb_tags:
        raise Exception("No verb in this agreement pair!")
    if verb_tree.node in singular_verb_tags:
        verb_singular = True
    elif verb_tree.node in plural_verb_tags:
        verb_singular = False
    else:
        # Remaining verb tags don't mark number; default to singular.
        verb_singular = True

    log(
        "Noun: Looks like '%s-%s' is %s (%s)" %
        (noun_tree[0], noun_tree.node, 'Singular' if noun_singular else
         'Plural', "3rd" if noun_3rd_person else "1st"), 2)
    log(
        "Verb: Looks like '%s-%s' is %s" %
        (verb_tree[0], verb_tree.node,
         'Singular' if verb_singular else 'Plural'), 2)

    noun_1st_person = not noun_3rd_person
    is_vbp = verb_tree.node == "VBP"
    is_vbz = verb_tree.node == "VBZ"

    if verb_tree.node in general_verb_tags:
        return True
    elif noun_singular and noun_1st_person and is_vbp:
        return True
    elif noun_singular and noun_3rd_person and is_vbz:
        return True
    elif not noun_singular and noun_3rd_person and is_vbp:
        return True
    else:
        log(
            "DONT LIKE COMBO: %s" % ({
                "verb_tag": verb_tree.node,
                "noun_1st_person": noun_1st_person,
                "noun_singular": noun_singular
            }, ), 2)
        return False
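
The chain of conditions at the end of `check_node_agreement` amounts to a small decision table over the subject's number and person and the verb's Penn Treebank tag (VBP = non-3rd-singular present, VBZ = 3rd-singular present). Below is a standalone restatement of that table for reference; the `general_verb_tags` default is an assumption, since the real set is defined elsewhere in the project.

def tags_agree(noun_singular, noun_3rd_person, verb_tag,
               general_verb_tags=('VB', 'VBD', 'VBG', 'VBN')):
    # Mirrors the if/elif chain above.  Tags in general_verb_tags
    # (assumed values) carry no agreement marking, so they always pass.
    if verb_tag in general_verb_tags:
        return True
    if verb_tag == 'VBP':
        # "I walk" (singular, 1st person) or "they walk" (plural, 3rd person).
        return ((noun_singular and not noun_3rd_person) or
                (not noun_singular and noun_3rd_person))
    if verb_tag == 'VBZ':
        # "She walks": only a 3rd-person singular subject fits VBZ.
        return noun_singular and noun_3rd_person
    return False
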
Example #35
0
def simplify_tree(tree,
                  remove_starting_cc=False,
                  trim_adjecent_prop_nouns=False,
                  normalize_sent_roots=False,
                  normalize_case=False,
                  normalize_plural=False,
                  collapse_redundant_sbar=True):
    """Do some transformations on a parse tree to normalize it.  Currently:
        - Remove CD when they're paired with a noun
        - Stripping off conjunction from the beginning of the sentence / root
        of the tree
        - Remove proper nouns that are next to each other
    """
    if normalize_plural:
        plural_transforms = dict(
            NNS='NN',
            NNPS='NNP',
        )
        for a_tree in tree.subtrees(lambda x: x.node in plural_transforms):
            a_tree.node = plural_transforms[a_tree.node]

    if normalize_case:
        case_transforms = dict(VBD='VB',
                               VBG='VB',
                               VBN='VB',
                               VBP='VB',
                               VBZ='VB')
        for a_tree in tree.subtrees(lambda x: x.node in case_transforms):
            a_tree.node = case_transforms[a_tree.node]

    if normalize_sent_roots:
        for a_tree in tree.subtrees(lambda x: x.node in semi_tree_roots):
            a_tree.node = "S"

    if trim_adjecent_prop_nouns:
        np_trees = list(tree.subtrees(lambda x: x.node == "NP"))
        if len(np_trees) > 0:
            for np_tree in np_trees:
                num_leaves = len(np_tree)
                change = False
                if num_leaves >= 2:
                    for i in range(0, num_leaves - 1):
                        if not change:
                            if np_tree[i].node == "NNP" and np_tree[
                                    i + 1].node == "NNP":
                                np_tree.remove(np_tree[i + 1])
                                change = True

    if remove_starting_cc:
        useful_roots = list(tree.subtrees(lambda x: x.node in semi_tree_roots))
        if len(useful_roots) > 0:
            useful_root = useful_roots[0]
            if useful_root[0].node == "CC":
                useful_root.remove(useful_root[0])
                log("REMOVED CC from start of sentence", 2)

    # Materialize the subtree generator before mutating the tree beneath it.
    cd_trees = list(tree.subtrees(lambda x: x.node == "CD"))
    for cd_tree in cd_trees:
        parent_node = cd_tree.parent()
        if parent_node.node == "NP":
            parent_node_children = [
                parent_node[i].node for i in range(0, len(parent_node))
            ]
            if "CD" in parent_node_children and ("NN" in parent_node_children
                                                 or "NNS"
                                                 in parent_node_children):
                parent_node.remove(cd_tree)
                log("REMOVED only child CD node", 2)

    if collapse_redundant_sbar:
        # Materialize the generator first; this loop restructures the tree.
        for sbar_tree in list(tree.subtrees(lambda x: x.node == 'SBAR')):
            if len(sbar_tree) == 1 and sbar_tree[0].node in semi_tree_roots:
                sbar_child = sbar_tree[0]
                sbar_parent = sbar_tree.parent()
                index = sbar_parent.index(sbar_tree)
                sbar_parent.remove(sbar_tree)
                sbar_tree.remove(sbar_child)
                sbar_parent.insert(index, sbar_child)
                log("Collapsed SBAR", 2)