Example #1
def test_empty(self):
    original = 'What is witch hazel?'
    result = json.dumps(cky.cky(original.split(' '), 'SBARQ', self.prepared[0], self.prepared[1]))
    print result  # A side effect, but I'd like to see the parse in the test output.
    self.assertEqual(result,
                     '["SBARQ", ["WHNP+PRON", "What"], ["SQ+VP", ["VERB", "is"], ["NP", ["NOUN", "_RARE_"], '
                     '["NOUN", "_RARE_"]]]]')
Example #2
def insideOutside(sentence, grammar, count):
    """
    insideOutside() finds the expected number of counts for rules in our
    grammar, given a sentence.

    @params: sentence (list of strings), grammar and count dictionary.
    @return: n/a (updates count dictionary).
    """
    n = len(sentence)
    
    trees = cky.cky(grammar, sentence)
    trees_top = []
    for tree in trees:
        if tree.root == 'TOP':
            trees_top.append(tree)

    # cky.printParseTrees(trees_top)
    inside = getAlpha(sentence, grammar, trees_top)
    # print(inside)
    outside = getBeta(sentence, grammar, trees_top, inside)
    
    Z = inside[grammar.start_symbol][0][n-1]
    # Build fresh rows per index; [[0]*n]*n would alias one row object n times.
    mu = {lhs: [[0] * n for _ in range(n)] for lhs in inside}
    
    for lhs in mu:
        for i in range(n):
            for j in range(n):
                mu[lhs][i][j] = inside[lhs][i][j]*outside[lhs][i][j]
    
    gamma = {}
    for lhs in grammar.NR:
        for rule in grammar.NR[lhs].values():
            # Allocate independent sub-lists (no *-replication aliasing).
            gamma[rule] = [[[0] * n for _ in range(n)] for _ in range(n)]
            for i in range(n-1):
                for j in range(i+1, n):
                    for k in range(i,j):
                        gamma[rule][i][k][j] = (outside[rule.lhs][i][j] * rule.prob
                                                * inside[rule.rhs[0]][i][k]
                                                * inside[rule.rhs[1]][k+1][j])
    
    for lhs in grammar.NR:
        if lhs not in count:
            count[lhs] = {}
        for rule in grammar.NR[lhs].values():
            if tuple(rule.rhs) not in count[lhs]:
                count[lhs][tuple(rule.rhs)] = 0
            for i in range(n-1):
                for j in range(i+1, n):
                    for k in range(i, j):
                        count[lhs][tuple(rule.rhs)] += gamma[rule][i][k][j]/Z
    
    for term in grammar.TR:
        for lhs in grammar.TR[term]:
            if lhs not in count:
                count[lhs] = {}
              
    for i in range(n):
        for lhs in grammar.TR[sentence[i]]:
            if tuple([sentence[i]]) in count[lhs]:
                count[lhs][tuple([sentence[i]])] += mu[lhs][i][i]/Z
            else:
                count[lhs][tuple([sentence[i]])] = mu[lhs][i][i]/Z
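getAlpha and getBeta above are the inside and outside passes of the algorithm. For reference, below is a minimal, self-contained sketch of the inside (alpha) recursion over a toy CNF grammar; the dict-based rule encoding and the function name inside_probs are assumptions made for illustration and do not reflect the Grammar class used in these examples.

from collections import defaultdict

# Toy CNF PCFG: binary rules A -> B C with probabilities, plus a small lexicon.
binary = {('S', ('NP', 'VP')): 1.0,
          ('VP', ('V', 'NP')): 1.0}
lexical = {('NP', 'she'): 0.5, ('NP', 'fish'): 0.5,
           ('V', 'eats'): 1.0}

def inside_probs(words):
    n = len(words)
    alpha = defaultdict(float)            # alpha[(A, i, j)] = P(A =>* words[i..j])
    for i, w in enumerate(words):         # base case: spans of length one
        for (A, word), p in lexical.items():
            if word == w:
                alpha[(A, i, i)] += p
    for span in range(2, n + 1):          # widen the span, trying every split point k
        for i in range(n - span + 1):
            j = i + span - 1
            for (A, (B, C)), p in binary.items():
                for k in range(i, j):
                    alpha[(A, i, j)] += p * alpha[(B, i, k)] * alpha[(C, k + 1, j)]
    return alpha

alpha = inside_probs(['she', 'eats', 'fish'])
print(alpha[('S', 0, 2)])   # inside probability of the whole sentence under S: 0.25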
Example #3
from PCFG import PCFG
from cky import load_sents_to_parse, cky

if __name__ == '__main__':
    pcfg = PCFG.from_file_assert_cnf('grammar3-CNF.txt')
    good_sents = load_sents_to_parse('sents_3.txt')
    bad_sents = load_sents_to_parse('sents_bad.txt')
    print '========'
    print 'Checking good sentences!'
    print '========'
    failures = 0
    for sent in good_sents:
        if "FAILED" in cky(pcfg, sent):
            print 'FAILURE!!! Failed to parse %s' % sent
            failures += 1
    print 'Succeeded %d out of %d' % (len(good_sents) - failures,
                                      len(good_sents))

    print '========'
    print 'Checking bad sentences!'
    print '========'
    failures = 0
    for sent in bad_sents:
        if "FAILED" not in cky(pcfg, sent):
            print 'FAILURE!!! Parsed when it should have failed: %s' % sent
            failures += 1
    print 'Succeeded %d out of %d' % (len(bad_sents) - failures,
                                      len(bad_sents))
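The script above uses cky() purely as a recognizer: a sentence counts as parsed unless the result contains the "FAILED" marker. Below is a minimal sketch of that kind of membership check, a boolean CKY chart over a CNF grammar; the list-based grammar encoding and the name can_parse are illustrative assumptions, not the PCFG/cky API used above.

def can_parse(words, lexical, binary, start='S'):
    """Boolean CKY: True iff the CNF grammar derives the word sequence."""
    n = len(words)
    # chart[i][j] holds the non-terminals that span words[i..j].
    chart = [[set() for _ in range(n)] for _ in range(n)]
    for i, w in enumerate(words):
        for lhs, word in lexical:
            if word == w:
                chart[i][i].add(lhs)
    for span in range(2, n + 1):
        for i in range(n - span + 1):
            j = i + span - 1
            for k in range(i, j):
                for lhs, (b, c) in binary:
                    if b in chart[i][k] and c in chart[k + 1][j]:
                        chart[i][j].add(lhs)
    return start in chart[0][n - 1]

# Toy grammar: S -> NP VP, VP -> V NP, plus a tiny lexicon.
binary = [('S', ('NP', 'VP')), ('VP', ('V', 'NP'))]
lexical = [('NP', 'she'), ('V', 'eats'), ('NP', 'fish')]
print(can_parse(['she', 'eats', 'fish'], lexical, binary))   # True
print(can_parse(['eats', 'she'], lexical, binary))           # False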
Example #4
import os
import pickle
import sys

import cky
import count_cfg
import grammar


def main():
    if len(sys.argv) == 3:
        sentences = sys.argv[2].split(' ')

        # Get the grammar.
        file_path = sys.argv[1]

        trees = []
        print 'Parsing trees in file...'
        f = open(file_path, 'rb')
        trees.extend(count_cfg.read_trees(f))

        print 'Converting trees to grammar...'
        g = grammar.Grammar(nodes = trees)

        print 'Converting to CNF...'
        g.convertToCNF()

        # Parse and get nodes back.
        print 'Running CKY...'
        nodes_back = cky.cky(g, sentences)

        # Only get the nodes back that have a TOP.
        nodes_back_top = []
        for tree in nodes_back:
            if tree.root == 'TOP':
                nodes_back_top.append(tree)

        print 'Getting best and worst tree...'
        if nodes_back_top == []:
            print('No tree could be constructed for the sentence.')
            sys.exit()
        elif len(nodes_back_top) == 1:
            print('Only one valid tree found for the sentence.')
            print(cky.getParseTree(nodes_back_top[0], 5))

        max_pot = float('-inf')
        min_pot = float('inf')
        max_tree = nodes_back_top[0]
        min_tree = nodes_back_top[0]

        for tree in nodes_back_top:
            pot_tree = potential(tree, g)
            if pot_tree > max_pot:
                max_pot = pot_tree
                max_tree = tree
            if pot_tree < min_pot:  # if, not elif: a single tree can be both the max and the min
                min_pot = pot_tree
                min_tree = tree

        print('Max tree:')
        print(cky.getParseTree(max_tree, 5))

        print('Min tree:')
        print(cky.getParseTree(min_tree, 5))
    elif len(sys.argv) == 2:
        if os.path.isfile('grammar.p'):
            g = pickle.load(open('grammar.p', 'rb'))
        else:
            trees = []
            print 'Parsing trees'
            for path in os.listdir(sys.argv[1]):
                if not path.endswith('.prd'):  # skip everything that is not a .prd file
                    continue
                    
                file_path = sys.argv[1]+'/'+path
                f = open(file_path, 'rb')
                trees.extend(count_cfg.read_trees(f))

            print 'Converting trees to grammar'
            g = grammar.Grammar(nodes = trees)

            print 'Converting to CNF'
            g.convertToCNF()

            pickle.dump(g, open('grammar.p', 'wb'))
        

        print 'Parsing Sentence'

        sentences = [['His', 'tall', 'frame'], 
                     ['the', 'dog', 'saved'], 
                     ['discover', 'the', 'first', 'snail'],
                     ['it', 'is', 'juxtaposed', 'well'],
                     ['Her', 'handling', 'of', 'paint'],
                     ['He', 'glowered', 'down', 'at', 'her']]

        for t in range(5):
            # num_t = [len(g.TR[lhs]) for lhs in g.TR]
            # num_n = [len(g.NR[lhs]) for lhs in g.NR]

            # print sum(num_t), sum(num_n)
            to_del = []
            count = {}
            for sent in sentences:
                insideOutside(sent, g, count)      
            
            for lhs in count:
                lhs_sum = sum(count[lhs].values())
                if lhs_sum == 0:
                    to_del.append(lhs)
                else:
                    for key,val in count[lhs].items():
                        count[lhs][key] = val/lhs_sum

            for lhs in to_del:
                del count[lhs]
            
            for lhs in count:
                for key,val in count[lhs].items():
                    rule_dat = [lhs,]
                    rule_dat.extend(list(key))
                    rule_dat.append(val)
                    g.add_rule(grammar.Rule(vals = rule_dat))

        def isTop(node):
            return node.root == 'TOP'

        for s in sentences:
            nodes_back = cky.cky(g, s)
            node_back = filter(isTop, nodes_back)
            node_back = [(node, potential(node, g)) for node in node_back]
            node_back.sort(key=lambda node: -1*node[1])
            cky.printParseTrees([node_back[0][0]])
    else:
        print('Error. Invalid number of arguments.')
        print('Two options for running:')
        print('Usage: $ inside_out.py <directory>')
        print 'Note: only files ending in .prd in the directory provided', \
            'will be read into a grammar.'
        print 'Note: .prd files need to be in s-expression form.'
        print 'OR:'
        print('Usage: $ inside_out.py <grammar file> <string to be parsed>')
        print 'Note: grammar file needs to be in s-expression form.'
        sys.exit()
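The for t in range(5) loop above is the EM re-estimation: insideOutside accumulates expected rule counts, which are then normalized per left-hand side and written back into the grammar via g.add_rule. Here is a standalone sketch of just that normalization step, using a plain nested dict in place of the count structure built above; the numbers are made up for illustration.

def normalize_counts(count):
    """Turn expected rule counts into one probability distribution per LHS."""
    probs = {}
    for lhs, rules in count.items():
        total = sum(rules.values())
        if total == 0:
            continue                        # drop LHS symbols with no observed mass
        probs[lhs] = {rhs: c / total for rhs, c in rules.items()}
    return probs

# Hypothetical expected counts for two NP rules and one VP rule.
count = {'NP': {('DT', 'NN'): 3.0, ('NP', 'PP'): 1.0},
         'VP': {('V', 'NP'): 2.0}}
print(normalize_counts(count))
# NP -> DT NN gets 0.75, NP -> NP PP gets 0.25, VP -> V NP gets 1.0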