Exemplo n.º 1
0
def lexical_rules(tree, cutoff=0):
    """Count parent -> children tag productions found in ``tree``.

    Every subtree whose tag passes ``is_valid_tag``, that has at least one
    child, and whose base tag (the part before the first ``-``) is not one of
    ``X``, ``FRAG`` or ``ROOT`` contributes one production. String children
    are skipped, as are children whose tag fails ``is_valid_tag``. A
    production is dropped when it has no remaining children or when any
    child's base tag is one of the excluded tags.

    ``tree`` must provide ``subtrees(filter)`` and its nodes a ``.node``
    attribute -- presumably an NLTK ``Tree`` from before the 3.0 API; confirm
    against the caller.

    ``cutoff`` is accepted but not used by this function.

    Returns a dict mapping ``(parent_tag, (child_tag, ...))`` to the number of
    times that production occurs.
    """
    excluded = ('X', 'FRAG', 'ROOT')

    def base_tag(label):
        # Keep only the part of the label before the first '-'.
        return label.split('-')[0]

    def is_candidate(node):
        return (is_valid_tag(node.node) and len(node) > 0
                and base_tag(node.node) not in excluded)

    counts = {}
    for parent in tree.subtrees(is_candidate):
        child_tags = []
        for child in parent:
            if isinstance(child, str):
                continue
            if is_valid_tag(child.node) and len(parent) > 0:
                child_tags.append(base_tag(child.node))

        if not child_tags or set(excluded).intersection(child_tags):
            continue

        key = (base_tag(parent.node), tuple(child_tags))
        counts[key] = counts.get(key, 0) + 1
    return counts
Exemplo n.º 2
0
def lexical_rules(tree, cutoff=0):
    """Tally how often each (parent tag, child tags) production occurs.

    Only subtrees that pass ``is_valid_tag``, have at least one child and
    whose base tag (text before the first ``-``) is not ``X``, ``FRAG`` or
    ``ROOT`` are considered. For each one, the base tags of its non-string
    children that pass ``is_valid_tag`` form the right-hand side; the
    production is counted only if that side is non-empty and contains none of
    the ignored tags.

    ``tree`` is expected to expose ``subtrees(filter)`` and ``.node`` on each
    node (looks like the pre-3.0 NLTK ``Tree`` API -- TODO confirm).

    The ``cutoff`` argument is currently unused.

    Returns a dict keyed by ``(parent_tag, tuple_of_child_tags)`` with
    occurrence counts as values.
    """
    ignored = ("X", "FRAG", "ROOT")

    def keep(candidate):
        if not is_valid_tag(candidate.node):
            return False
        if not len(candidate) > 0:
            return False
        return candidate.node.split("-")[0] not in ignored

    tally = {}
    for parent in tree.subtrees(keep):
        rhs = tuple(
            kid.node.split("-")[0]
            for kid in parent
            if not isinstance(kid, str) and is_valid_tag(kid.node) and len(parent) > 0
        )
        if rhs and not any(tag in ignored for tag in rhs):
            rule = (parent.node.split("-")[0], rhs)
            tally[rule] = tally.get(rule, 0) + 1
    return tally
Exemplo n.º 3
0
def get_leaf_transitions():
    """Return the leaf transition counts, using the pickle cache when present.

    The counts are read from ``cache/penn_leaf_transition_counts.data``. If
    that file cannot be opened (``IOError``) or is empty/truncated
    (``EOFError``), the Penn Treebank sample shipped with NLTK is walked
    instead, ``store_transitions._counts`` is pickled to the cache file, and
    that same object is returned.

    Raises ``IOError`` if the cache file cannot be written.
    """
    file_name = 'penn_leaf_transition_counts.data'
    cache_path = os.path.join('cache', file_name)

    try:
        # NOTE(security): pickle.load can execute arbitrary code; the cache
        # file must only ever be written by this program.
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    except (IOError, EOFError):
        from tag_utils import is_valid_tag
        cmd_utils.log("Building leaf counts from Penn Treebank corpus", 1)

        for sentence in nltk.corpus.treebank.parsed_sents():
            # Subtrees whose first child is a plain string, i.e. the nodes
            # sitting directly above a token.
            preterminals = sentence.subtrees(
                lambda x: len(x) > 0 and isinstance(x[0], basestring))
            # The tag lives on the subtree itself: n[0] is a string and has
            # no .node attribute, which made the previous n[0].node lookup
            # raise AttributeError.
            # NOTE(review): the previous filter was
            # `n.node not in is_valid_tag(n[0].node)`; keeping tags accepted
            # by is_valid_tag is the assumed intent -- confirm.
            leaves = ['START'] + [
                n.node.split("-")[0] for n in preterminals
                if is_valid_tag(n.node)
            ]
            # NOTE(review): `leaves` is never handed to store_transitions, so
            # this loop does not visibly change store_transitions._counts.
            # The expected call is not visible from this function -- confirm
            # how the sequence should be recorded.

        cmd_utils.log("Finished building tag counts", 1)
        counts = store_transitions._counts
        # Open the cache only once the data exists, so a failed build does
        # not leave an empty file behind, and close it even if dump raises.
        with open(cache_path, 'wb') as f:
            pickle.dump(counts, f)
        return counts
Exemplo n.º 4
0
def transitions_in_tree(tree):
    """Collect the simplified child-tag sequence of each subtree of ``tree``.

    For every subtree yielded by ``tree.subtrees()``, the tags of its
    non-string children are passed through ``simple_tag``; those accepted by
    ``is_valid_tag`` are gathered in order and handed to ``simplify_tags``.
    The result is kept only when it has more than one element.

    ``simple_tag``, ``is_valid_tag`` and ``simplify_tags`` are defined
    outside this function; their exact behavior is not visible here.

    Returns a list with one entry per retained subtree, each entry being
    whatever ``simplify_tags`` returned for it.
    """
    transitions = []
    for subtree in tree.subtrees():
        children = []
        for node in subtree:
            # String children are tokens, not tagged nodes. isinstance also
            # covers str subclasses, unlike an exact class comparison.
            if isinstance(node, str):
                continue

            simple_node = simple_tag(node.node)
            if is_valid_tag(simple_node):
                children.append(simple_node)

        simplified_transitions = simplify_tags(children)
        if len(simplified_transitions) > 1:
            transitions.append(simplified_transitions)
    return transitions
Exemplo n.º 5
0
def transitions_in_tree(tree):
    """Return the simplified child-tag sequences found in ``tree``.

    Each subtree from ``tree.subtrees()`` is reduced to the list of its
    children's tags: string children are skipped, every other child's
    ``.node`` is normalized with ``simple_tag`` and kept if ``is_valid_tag``
    accepts it. That list goes through ``simplify_tags``, and the outcome is
    recorded only if its length exceeds one.

    The helpers ``simple_tag``, ``is_valid_tag`` and ``simplify_tags`` live
    outside this function, so their semantics are not documented here.

    Returns a list of the retained ``simplify_tags`` results, in the order
    the subtrees were visited.
    """
    transitions = []
    for subtree in tree.subtrees():
        children = []
        # Iterate the children directly rather than by index.
        for child in subtree:
            # Skip token strings; isinstance (rather than comparing
            # __class__) also handles subclasses of str.
            if isinstance(child, str):
                continue

            simple_node = simple_tag(child.node)
            if is_valid_tag(simple_node):
                children.append(simple_node)

        simplified_transitions = simplify_tags(children)
        if len(simplified_transitions) > 1:
            transitions.append(simplified_transitions)
    return transitions
Exemplo n.º 6
0
def get_leaf_transitions():
    """Load the leaf transition counts from cache, rebuilding them if needed.

    Tries to unpickle ``cache/penn_leaf_transition_counts.data``. When the
    file cannot be opened (``IOError``) or holds no complete pickle
    (``EOFError``), the function iterates over
    ``nltk.corpus.treebank.parsed_sents()``, then pickles
    ``store_transitions._counts`` into the cache file and returns it.

    Raises ``IOError`` if writing the cache file fails.
    """
    file_name = 'penn_leaf_transition_counts.data'
    cache_path = os.path.join('cache', file_name)

    try:
        # NOTE(security): unpickling runs arbitrary code from the file; only
        # trust a cache this program produced itself.
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    except (IOError, EOFError):
        from tag_utils import is_valid_tag
        cmd_utils.log("Building leaf counts from Penn Treebank corpus", 1)

        for sentence in nltk.corpus.treebank.parsed_sents():
            # Nodes whose first child is a plain string (the token itself).
            token_parents = sentence.subtrees(
                lambda x: len(x) > 0 and isinstance(x[0], basestring))
            # Read the tag from the node, not from its string child: the old
            # n[0].node lookup raised AttributeError because n[0] is a string.
            # NOTE(review): the old condition was
            # `n.node not in is_valid_tag(n[0].node)`; filtering on
            # is_valid_tag(n.node) is the assumed intent -- confirm.
            leaves = [n.node.split("-")[0] for n in token_parents if is_valid_tag(n.node)]
            leaves = ['START'] + leaves
            # NOTE(review): nothing consumes `leaves`; the call that should
            # record it in store_transitions is not visible here -- confirm.

        cmd_utils.log("Finished building tag counts", 1)
        counts = store_transitions._counts
        # Write the cache after the build so a failure cannot leave an empty
        # file behind; `with` closes the handle even if dump raises.
        with open(cache_path, 'wb') as f:
            pickle.dump(counts, f)
        return counts