Exemplo n.º 1
0
def test_batch_basic():
    """Check the bump entries logged when parsing in 'batch' mode.

    A single model (PARSE='batch', CHUNK_THRESHOLD=2) parses a three-token
    and then a four-token utterance. For each utterance the parse log must
    contain exactly one 'bump x -> y' entry per adjacent token pair.
    """
    model = Numila(PARSE='batch', CHUNK_THRESHOLD=2)

    # Counts are compared with `==`, not `is`: identity comparison against
    # an int literal only works by accident of CPython's small-int cache.
    log = utils.log_parse(model, 'a b c')
    assert log.count('bump a -> b') == 1
    assert log.count('bump b -> c') == 1

    # The same model is reused, so this also covers a second parse.
    log = utils.log_parse(model, 'a b c d')
    assert log.count('bump a -> b') == 1
    assert log.count('bump b -> c') == 1
    assert log.count('bump c -> d') == 1
Exemplo n.º 2
0
def test_inc_basic():
    """Check the bump entries logged when parsing in 'incremental' mode.

    A single model (PARSE='incremental', CHUNK_THRESHOLD=2) parses
    utterances of three, four and five tokens. The expected number of
    'bump x -> y' log entries per adjacent pair is two for the three-token
    utterance and three for the four- and five-token utterances.
    """
    model = Numila(PARSE='incremental', CHUNK_THRESHOLD=2)

    # Counts are compared with `==`, not `is`: identity comparison against
    # an int literal only works by accident of CPython's small-int cache.
    log = utils.log_parse(model, 'a b c')
    assert log.count('bump a -> b') == 2
    assert log.count('bump b -> c') == 2

    log = utils.log_parse(model, 'a b c d')
    assert log.count('bump a -> b') == 3
    assert log.count('bump b -> c') == 3
    assert log.count('bump c -> d') == 3

    # The per-pair count stays at three for the longer utterance.
    log = utils.log_parse(model, 'a b c d e')
    assert log.count('bump a -> b') == 3
    assert log.count('bump b -> c') == 3
    assert log.count('bump c -> d') == 3
    assert log.count('bump d -> e') == 3
Exemplo n.º 3
0
def test_easy(model):
    """Sanity-check edge weights learned from one repeated utterance.

    The model is first given the utterance 'a b a c a b d' once, then fitted
    on a corpus of 50 copies of it. The test checks that:

    * every adjacent pair has a truthy 'ftp' weight in the forward direction
      and a truthy 'btp' weight in the reverse direction after one exposure;
    * parsing utterances that do not contain a given pair leaves that pair's
      weight (almost) unchanged;
    * pairs that occur more often are weighted more highly.

    Args:
        model: the model under test; it must expose ``params``, ``parse``,
            ``fit`` and a ``graph`` indexable by token (presumably supplied
            by a pytest fixture -- confirm against the test configuration).
    """
    model.params['CHUNK_THRESHOLD'] = 2
    # One simple utterance 50 times.
    utterance = 'a b a c a b d'
    corpus = [utterance] * 50
    model.parse(corpus[0])
    print(utils.log_parse(model, corpus[0]))
    a, b, c, d = (model.graph[x] for x in 'abcd')  # node objects

    def weight(edge, n1, n2):
        """Return the weight of the `edge`-typed connection from n1 to n2."""
        return n1.edge_weight(n2, edge)

    # Check that all connections are positive after one utterance
    for x, y in utils.neighbors(utterance.split(' ')):
        assert weight('ftp', model.graph[x], model.graph[y])
        assert weight('btp', model.graph[y], model.graph[x])

    # Equal conditional probability, but more evidence
    #assert weight('btp', b, a) < weight('btp', c, a)

    model.fit(corpus)

    # Check that weights don't change when they shouldn't change.
    # The absolute difference is used so that an unexpected increase is
    # caught as well as an unexpected decrease.
    w1 = weight('ftp', a, b)
    model.parse('b c')
    w2 = weight('ftp', a, b)
    assert abs(w1 - w2) < .001

    w1 = weight('btp', b, a)
    model.parse('d a d a d a d a d a d a')
    w2 = weight('btp', b, a)
    assert abs(w1 - w2) < .001

    # Check that more common edges are more highly weighted.
    # We vary the conditional (ab | a) and raw (ab) probabilities.
    # Reference: a b a c a b d

    # Higher conditional, higher raw.
    assert weight('ftp', a, b) > weight('ftp', a, c)

    # Higher conditional, equal raw.
    assert weight('ftp', c, a) > weight('ftp', b, d)

    return  # TODO

    # NOTE: everything below is unreachable until the TODO above is
    # resolved. As written, the call with `verbose=True` would raise
    # TypeError because the local `weight` helper takes no such parameter,
    # and `assert 0` would fail unconditionally.

    # Equal conditional, higher raw. But lots of evidence for both.
    print()
    print(weight('btp', c, a, verbose=True))
    assert 0

    assert weight('btp', b, a) - weight('btp', c, a) < 0.001

    # This always fails for vector. The edge weights do not really
    # represent probabilities. They are more sensitive to the raw
    # occurrence counts.
    # p(ab | a) = 0.66
    # p(ca | c) = 1
    # p(ab) = 0.4
    # p(ca) = 0.2
    #assert weight('ftp', c, a) > weight('ftp', a, b)

    assert weight('ftp', a, a) < 0.05
    assert weight('ftp', b, b) < 0.05
    assert weight('ftp', c, c) < 0.05
    assert weight('ftp', b, c) < 0.05