Exemplo n.º 1
0
def _burn_in(model):
    """Run BURN_IN_SAMPLES Gibbs sampler sweeps on *model*, logging each one."""
    for i in range(BURN_IN_SAMPLES):
        sys.stderr.write("Burn in iteration %i\n" % (i, ))
        model.runGibbsSampler()


def run(options):
    """Train an HPYP model on one integer file and print the loss on another.

    Attributes read from *options*:
      alpha       -- passed to lp.SimpleParameters together with DISCOUNTS.
      train_file  -- file loaded with lp.pushIntFileToVec as training data.
      test_file   -- file appended to the same sequence as test data.
      prediction  -- 2 selects HPYPModel.BELOW, 1 selects HPYPModel.FRAGMENT,
                     anything else selects HPYPModel.ABOVE; the value 3
                     additionally switches the loss computation to
                     model.computeLosses instead of model.predictSequence.
      inference   -- 1 runs BURN_IN_SAMPLES Gibbs sweeps before predicting;
                     2 (with prediction != 3) runs the burn-in and then
                     averages predictions over PREDICT_SAMPLES further
                     sweeps.

    Progress messages go to stderr; the final loss is printed to stdout.
    Nothing is returned.
    """
    restaurant = lp.ReinstantiatingCompactRestaurant()
    nodeManager = lp.SimpleNodeManager(restaurant.getFactory())
    discounts = lp.VectorDouble(DISCOUNTS)
    parameters = lp.SimpleParameters(discounts, options.alpha)

    seq = lp.VectorInt()
    lp.pushIntFileToVec(options.train_file, seq)
    # sys.stderr.write emits the same bytes as the former
    # ``print >> sys.stderr`` statements but is valid on Python 2 and 3.
    sys.stderr.write("Train seq length: %i\n" % (seq.size(), ))

    # initialize model
    model = lp.HPYPModel(seq, nodeManager, restaurant, parameters, NUM_TYPES)

    # insert training observations into model using particle filter
    model.computeLosses(0, seq.size())

    # add test observations to underlying sequence
    testOffset = seq.size()
    lp.pushIntFileToVec(options.test_file, seq)
    sys.stderr.write("Test seq length: %i\n" % (seq.size() - testOffset, ))

    if options.prediction == 2:
        predictMode = lp.HPYPModel.BELOW
    elif options.prediction == 1:
        predictMode = lp.HPYPModel.FRAGMENT
    else:
        predictMode = lp.HPYPModel.ABOVE

    if options.inference == 1:
        _burn_in(model)

    # NOTE(review): when inference == 2 and prediction != 3 this value is
    # overwritten below. The call is kept in place because it is not clear
    # from here whether predictSequence affects the model state -- confirm
    # before removing it.
    if options.prediction != 3:
        loss = float(
            lp.prob2loss(
                model.predictSequence(testOffset, seq.size(), predictMode)))
    else:
        loss = float(np.mean(model.computeLosses(testOffset, seq.size())))

    if options.inference == 2 and options.prediction != 3:
        # One row of predictions per posterior sample, one column per test
        # position.
        losses = np.zeros((PREDICT_SAMPLES, seq.size() - testOffset))
        _burn_in(model)
        for i in range(PREDICT_SAMPLES):
            sys.stderr.write("Prediction iteration %i\n" % (i, ))
            model.runGibbsSampler()
            losses[i, :] = model.predictSequence(testOffset, seq.size(),
                                                 predictMode)
        # Average over samples first (axis 0), then take the mean of the
        # negative base-2 logs over test positions.
        loss = float(np.mean(-np.log2(np.mean(losses, 0))))

    # Single-argument print() prints the same text on Python 2 and 3.
    print(loss)

    # make sure destructors are called in correct order
    del model
    del nodeManager
Exemplo n.º 2
0
def buildModel(fn):
    """Build a byte-level SM model from the file *fn*.

    The restaurant, node manager, parameters, sequence and model are all
    stored in module-level globals rather than returned. The sequence is
    filled from *fn* with libplump.pushCharFileToVec and the model is created
    with 256 types; computeLosses is then called once over the whole
    sequence.
    """
    global seq, model, nodeManager, parameters, restaurant

    # Restaurant implementation currently in use. Alternatives that were
    # previously tried here: SimpleFullRestaurant, HistogramRestaurant,
    # KneserNeyRestaurant, StirlingCompactRestaurant.
    restaurant = libplump.ReinstantiatingCompactRestaurant()
    nodeManager = libplump.SimpleNodeManager(restaurant.getFactory())
    parameters = libplump.SimpleParameters(DISCOUNTS, CONCENTRATION)

    seq = libplump.VectorInt()
    libplump.pushCharFileToVec(fn, seq)

    # Byte-level model: one type per possible byte value.
    byte_type_count = 256
    model = libplump.HPYPModel(
        seq, nodeManager, restaurant, parameters, byte_type_count)

    # Run the loss computation over the full sequence; the returned losses
    # are discarded.
    model.computeLosses(0, seq.size())
Exemplo n.º 3
0
# Small end-to-end libplump demo: build an HPYP model over a five-symbol
# sequence, run the Gibbs sampler, print predictive distributions and save
# the model to "model.dump".
#
# NOTE(review): `restaurant` is not assigned anywhere in this excerpt (every
# candidate below is commented out). It is presumably bound earlier in the
# file -- confirm, otherwise the SimpleNodeManager line raises NameError.
#restaurant = libplump.FractionalRestaurant()
#restaurant = libplump.HistogramRestaurant()
#restaurant = libplump.KneserNeyRestaurant()
#restaurant = libplump.ReinstantiatingCompactRestaurant()
#restaurant = libplump.StirlingCompactRestaurant()

nodeManager = libplump.SimpleNodeManager(restaurant.getFactory())
parameters = libplump.SimpleParameters()

#seq = libplump.vectori(range(10))
seq = libplump.VectorInt([0, 1, 2, 1, 2])
#seq = libplump.VectorInt(map(ord,'oacac'))
#numTypes = max(seq)
numTypes = 3

model = libplump.HPYPModel(seq, nodeManager, restaurant, parameters, numTypes)
# Single-argument print(...) prints the same text on Python 2 and Python 3.
print(model.computeLosses(0, len(seq)))

# One Gibbs sweep per sequence element, dumping the model before each sweep.
for _ in range(seq.size()):
    print(model.toString())
    model.runGibbsSampler()

print("Predictions after training:")
for i in range(len(seq)):
    #print model.predict(0,i,i)
    dist = model.predictiveDistribution(0, i)
    # Explicit formatting reproduces the old ``print dist, sum(dist)`` output
    # (two values separated by one space) on both Python versions.
    print("%s %s" % (dist, sum(dist)))

# save model to file
serializer = libplump.Serializer("model.dump")
serializer.saveNodesAndPayloads(nodeManager, restaurant.getFactory())