Example #1
    def setUp(self):
        # SimpleSeq2Vec and MatchScore are teafacto blocks
        enc = SimpleSeq2Vec(indim=100, inpembdim=10, innerdim=20)
        # batch of 33 random sequences of 5 word ids from a 100-word vocabulary
        x = np.random.randint(0, 100, (33, 5))
        o = enc.autobuild(x)
        self.o = o[1][0]            # first output of the built encoder graph
        m = MatchScore(enc, enc)
        mo = m.autobuild(x, x)
        self.mo = mo[1][0]          # first output of the built match-score graph
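
A hypothetical companion test (not part of the original snippet) showing how such a fixture could be exercised, assuming, as the other examples here suggest, that built blocks expose predict:

    def test_encoding_shape(self):
        # hypothetical: with innerdim=20, each of the 33 input sequences
        # should encode to a single 20-dimensional vector
        enc = SimpleSeq2Vec(indim=100, inpembdim=10, innerdim=20)
        x = np.random.randint(0, 100, (33, 5))
        self.assertEqual(enc.predict(x).shape, (33, 20))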
Example #2
    def test_ns_training(self):
        num = 2000
        self.expshape = (num, 50)
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        self.glove = Glove(self.expshape[1], self.expshape[0])
        self.cemb = VectorEmbed(indim=self.expshape[0]+1, dim=self.expshape[1])
        # indices beyond the embedding range must raise
        self.assertRaises(Exception, self.glove.block.predict, [num+1])
        self.assertRaises(Exception, self.cemb.predict, [num+1])

        m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
        mg = MatchScore(self.glove.block, self.glove.block)     # TODO factor out matchscore tests
        idxs = np.arange(num+1)

        # glove against glove: `%` retrieves a glove vector, and a vector
        # matched against itself scores its squared L2 norm
        self.assertTrue(np.allclose(mg.predict([num, 100], [num, 100]),
                                    [np.linalg.norm(self.glove % num)**2, np.linalg.norm(self.glove % 100)**2]))

        class NegIdxGen():
            """Negative sampling: keep the left input, corrupt the right with random indices."""
            def __init__(self, num): self.n = num
            def __call__(self, l, r): return l, np.random.randint(0, self.n, r.shape)

        # negative-sampling training: 5 negatives per positive, adagrad with lr=0.1
        m = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(5)\
            .adagrad(lr=0.1)\
            .train(numbats=50, epochs=50)

        # sanity check: the last pair is a mismatch and should score lower
        print m.predict([num, num-1, num-2, num-1], [num, num-1, num-2, num-2])

        mrr = 0.0
        recat10 = 0.0
        recat1 = 0.0
        tot = num + 1
        # exhaustive ranking evaluation: score every index against all candidates
        for a in range(tot):
            abc = zip(range(num+1), list(m.predict([a]*(num+1), np.arange(0, num+1))))
            abc = sorted(abc, key=lambda (x, y): y, reverse=True)
            for i in range(len(abc)):
                if abc[i][0] == a:      # true index found at 0-based rank i
                    mrr += 1./(1+i)
                    if i < 10:
                        recat10 += 1
                    if i < 1:
                        recat1 += 1
                    break

        mrr /= tot
        recat10 /= tot
        recat1 /= tot
        print "%.3f MRR,\t%.3f [email protected],\t%.3f [email protected]" % (mrr, recat10, recat1)
        self.assertGreater(mrr, 0.85)
        self.assertGreater(recat10, 0.9)
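
The loop above computes the standard ranking metrics; as a reference, a minimal standalone sketch of the same definitions (plain Python, no teafacto dependency):

def ranking_metrics(ranks, k=10):
    # ranks: 1-based positions of the true item in each ranked candidate list
    n = float(len(ranks))
    mrr = sum(1.0 / r for r in ranks) / n               # mean reciprocal rank
    recall_at_k = sum(1 for r in ranks if r <= k) / n   # fraction ranked in the top k
    recall_at_1 = sum(1 for r in ranks if r == 1) / n   # fraction ranked first
    return mrr, recall_at_k, recall_at_1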
Example #3
def run(
    epochs=10,
    numbats=100,
    negrate=1,
    lr=0.1,
    embdim=50,
    encdim=50,
    wreg=0.00005,
    marginloss=False,
    margin=1.0,
    cosine=False,
    bidir=False,
):
    tt = ticktock("script")
    # get glove words
    g = Glove(encdim)
    words = g.D.keys()
    maxwordlen = 0
    for word in words:
        maxwordlen = max(maxwordlen, len(word))
    chars = set("".join(words))
    chars.add(" ")
    print "{} words, maxlen {}, {} characters in words".format(len(words), maxwordlen, len(chars))
    # get char word matrix
    chardic = dict(zip(chars, range(len(chars))))
    with open("glove2c2w.chardic.pkl", "wb") as f:      # binary mode for pickle
        pickle.dump(chardic, f)
    charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32")     # -1 = mask/filler id
    charwordmat[0, 0] = chardic[" "]    # row 0 has no word; it holds a single space
    for i in range(0, len(words)):
        word = words[i]
        charwordmat[i + 1, : len(word)] = [chardic[x] for x in word]
    print charwordmat[0]
    # encode characters
    cwenc = SimpleSeq2Vec(
        indim=len(chars), inpembdim=embdim,
        innerdim=encdim / 2 if bidir else encdim,   # halve per direction so the concat stays encdim
        maskid=-1, bidir=bidir,
    )
    dist = CosineDistance() if cosine else EuclideanDistance()  # DotDistance()
    print "using " + str(dist)
    scorer = MatchScore(cwenc, g.block, scorer=dist)

    """
    scorer.train([charwordmat, np.arange(len(words)+1)], np.ones((charwordmat.shape[0],), dtype="int32") * (-1 if cosine else 1))\
        .linear_objective().adagrad(lr=lr).l2(wreg)\
        .train(numbats=numbats, epochs=epochs)

    #embed()
    """

    class NegIdxGen(object):
        """Generates negatives by replacing the gold index with a uniform random one."""
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32")

    if marginloss:
        # max-margin ranking loss: only penalize negatives scoring within `margin` of the positive
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        # plain score difference
        obj = lambda p, n: n - p

    # negative-sampling training over all (char-word row, glove index) pairs
    nscorer = (
        scorer.nstrain([charwordmat, np.arange(len(words) + 1)])
        .negsamplegen(NegIdxGen(len(words)))
        .negrate(negrate)
        .objective(obj)
        .adagrad(lr=lr)
        .l2(wreg)
        .train(numbats=numbats, epochs=epochs)
    )

    cwenc.save("glove2c2w.block")
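
The two objectives above behave quite differently once a positive already outranks its negative; a small numpy illustration with made-up scores (independent of teafacto):

import numpy as np

margin = 1.0
p = np.array([2.0, 2.0])    # positive-pair scores
n = np.array([0.5, 1.8])    # negative-pair scores

plain = n - p                                   # keeps pushing pairs apart forever
hinge = (n - p + margin).clip(0, np.infty)      # zero loss once p > n + margin

print plain     # [-1.5 -0.2]
print hinge     # [ 0.   0.8]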
Example #4
def run(
        epochs=10,
        numbats=100,
        negrate=1,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.word.mem.fb2m.pkl",
        embdim=100,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        mem=False,
        membidir=False,
        memlayers=1,
        layers=1,
        testfirst=False,
        rankingloss=False,
        rlmargin=1.,
        charlevel=False,
        pool=False,
        resultsave=False,
        resultsavep="subjdetns.res.pkl",
        ):

    tt = ticktock("script")
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat\
        = readdata(datap, charlevel)

    print entmat.shape
    print traindata.shape, traingold.shape, testdata.shape, testgold.shape

    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    if bidir:
        encinnerdim = [innerdim/2]*layers   # halve per direction so the concat stays innerdim
    else:
        encinnerdim = [innerdim]*layers

    # question representation:
    # encodes the question word (or char) sequence to a vector
    # (disabled char-level variant: embdim = None if charlevel else embdim)
    qenc = SimpleSeq2Vec(indim=numwords,
                        inpembdim=embdim,
                        innerdim=encinnerdim,
                        maskid=-1,
                        bidir=bidir,
                        pool=pool)

    # entity representation:
    if mem:
        # encodes label to vector
        if membidir:
            innerdim = [innerdim/2]*memlayers   # halved per direction, as for the question encoder
        else:
            innerdim = [innerdim]*memlayers
        memembdim = embdim
        # (disabled char-level variant: meminpemb = None if charlevel else qenc.inpemb,
        #  memembdim = None if charlevel else memembdim)
        meminpemb = qenc.inpemb     # chars are embedded too; embeddings are always shared
        lenc = SimpleSeq2Vec(indim=numwords,
                                inpembdim=memembdim,
                                inpemb=meminpemb,
                                innerdim=innerdim,
                                maskid=-1,
                                bidir=membidir)
    else:
        # embeds entity id to vector
        lenc = VectorEmbed(indim=numents, dim=innerdim)

    # question-entity score computation:
    scorer = MatchScore(qenc, lenc)       # batched dot

    # trainer config preparation
    class PreProcf(object):
        """Replaces gold entity ids with their label rows from entmat (a row gather)."""
        def __init__(self, entmat):
            self.em = Val(entmat)               # entmat: idx[word]^(numents, len(ent.name))

        def __call__(self, datas, gold):        # gold: idx^(batsize,)
            return (datas, self.em[gold, :]), {}

    class NegIdxGen(object):
        """Samples a uniform random entity id as the negative for each gold id."""
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):        # gold: idx^(batsize,)
            return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32")

    if testfirst:
        eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic, metrics=[ClassAccuracy(), RecallAt(5)])
        evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat))
        for e in evalres:
            print e
        tt.msg("tested dummy")
        sys.exit()
    # trainer config and training
    obj = lambda p, n: n - p                    # plain score difference
    if rankingloss:
        # max-margin ranking loss
        obj = lambda p, n: (n - p + rlmargin).clip(0, np.infty)

    # negative-sampling training with entity-label preprocessing and gradient-norm clipping
    nscorer = scorer.nstrain([traindata, traingold]).transform(PreProcf(entmat))\
        .negsamplegen(NegIdxGen(numents)).negrate(negrate).objective(obj)\
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
        .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)

    # evaluation
    eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic, metrics=[ClassAccuracy(), RecallAt(1), RecallAt(2), RecallAt(5), RecallAt(10)])

    evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat),
                        savep=None if not resultsave else resultsavep)
    for evalre in evalres:
        print evalre
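
The PreProcf hook used above is just a row gather: gold entity ids select the matching label rows of entmat before scoring. A tiny numpy illustration with made-up values:

import numpy as np

# three entities, labels padded with the -1 filler id
entmat = np.array([[ 4,  7, -1],
                   [ 2, -1, -1],
                   [ 9,  3,  5]], dtype="int32")

gold = np.array([2, 0], dtype="int32")      # gold entity ids for a batch of 2
print entmat[gold, :]                       # rows 2 and 0: the label sequences the
                                            # right-hand encoder will consume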