Exemplo n.º 1
0
 def __init__(self,
              inpemb,
              encdim=100,
              scadim=100,
              maskid=0,
              bidir=False,
              scalayers=1,
              enclayers=1,
              outdim=100,
              **kw):
     """Create the scalar block, the two encoders and their projections.

     ``inpemb`` (with ``inpemb.outdim`` as ``inpembdim``), ``maskid`` and
     ``bidir`` are handed unchanged to all three sequence blocks.

     :param inpemb: embedding block; must expose ``outdim``
     :param encdim: ``innerdim`` of both ``SimpleSeq2Vec`` encoders
     :param scadim: ``innerdim`` of the ``SimpleSeq2Sca`` block
     :param maskid: mask id forwarded to every sequence block
     :param bidir: ``bidir`` flag forwarded to every sequence block
     :param scalayers: ``layers`` of the ``SimpleSeq2Sca`` block
     :param enclayers: ``layers`` of both ``SimpleSeq2Vec`` encoders
     :param outdim: second argument of both ``MatDot`` blocks
     :param kw: forwarded to the parent constructor
     """
     super(CustomSeq2Pair, self).__init__(**kw)
     # keyword arguments common to every sequence block built below
     shared = dict(inpemb=inpemb,
                   inpembdim=inpemb.outdim,
                   maskid=maskid,
                   bidir=bidir)
     self.tosca = SimpleSeq2Sca(innerdim=scadim, layers=scalayers, **shared)
     # subject and predicate encoders are configured identically
     encargs = dict(shared, innerdim=encdim, layers=enclayers)
     self.subjenc = SimpleSeq2Vec(**encargs)
     self.predenc = SimpleSeq2Vec(**encargs)
     self.subjmd = MatDot(self.subjenc.outdim, outdim)
     self.predmd = MatDot(self.predenc.outdim, outdim)
Exemplo n.º 2
0
 def setUp(self):
     """Build an encoder and a self-match scorer and keep their outputs.

     ``self.o`` and ``self.mo`` receive element ``[1][0]`` of the
     ``autobuild`` result of the encoder and of ``MatchScore(enc, enc)``
     respectively, both built on the same random id matrix.
     """
     encoder = SimpleSeq2Vec(indim=100, inpembdim=10, innerdim=20)
     ids = np.random.randint(0, 100, (33, 5))
     self.o = encoder.autobuild(ids)[1][0]
     matcher = MatchScore(encoder, encoder)
     self.mo = matcher.autobuild(ids, ids)[1][0]
Exemplo n.º 3
0
 def test_mask(self):
     """Predict with a two-layer encoder on right-padded ids and print it.

     Every row of the random id matrix is overwritten with ``-1`` (the
     encoder's ``maskid``) from a random position onwards.  The cut
     positions, the padded ids, the prediction and its shape are printed;
     nothing is asserted.
     """
     np.random.seed(1337)
     encoder = SimpleSeq2Vec(indim=100,
                             inpembdim=10,
                             innerdim=4,
                             maskid=-1,
                             layers=2).all_outputs()
     ids = np.random.randint(0, 100, (33, 5))
     cutoffs = np.random.randint(1, ids.shape[1], (ids.shape[0],))
     # iterating a 2-D array yields row views, so this pads ``ids`` in place
     for row, cutoff in zip(ids, cutoffs):
         row[cutoff:] = -1
     pred = encoder.predict(ids)
     for shown in (cutoffs, ids, pred, pred.shape):
         print(shown)
Exemplo n.º 4
0
    def test_auto_mask_within_seq2vec(self):
        """Check that per-step outputs stop changing once the input is padding.

        The input has ``seqlen`` real positions followed by ``seqblank``
        positions filled with ``-1``; each position holds one word id
        followed by ``wordlen`` char ids and ``wordblank`` ``-1`` fillers.
        Consecutive output steps must be ``allclose`` exactly when the later
        step index is ``>= seqlen``.
        """
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        batsize, numchars, numwords = 11, 20, 100
        seqlen, seqblank = 3, 2
        wordlen, wordblank = 3, 2
        encdim, embdim, innerdim = 4, 50, 2

        def filler(*shape):
            # int32 block of -1, the mask id used throughout this test
            return np.zeros(shape).astype("int32") - 1

        # word ids: real positions first, then masked positions
        words = np.random.randint(0, numwords, (batsize, seqlen, 1))
        words = np.concatenate([words, filler(batsize, seqblank, 1)], axis=1)
        # char ids: pad every word on the char axis, then add masked positions
        chars = np.random.randint(0, numchars, (batsize, seqlen, wordlen))
        chars = np.concatenate(
            [chars, filler(batsize, seqlen, wordblank)], axis=2)
        chars = np.concatenate(
            [chars, filler(batsize, seqblank, wordlen + wordblank)], axis=1)
        data = np.concatenate([words, chars], axis=2)

        wordemb = WordEncoderPlusGlove(numchars=numchars,
                                       numwords=numwords,
                                       encdim=encdim,
                                       embdim=embdim,
                                       maskid=-1,
                                       embtrainfrac=0)
        rnn, lastdim = SimpleSeq2Vec.makernu(embdim + encdim,
                                             innerdim,
                                             bidir=False)
        enc = Seq2Vec(wordemb, rnn, maskid=-1)
        enc.enc.with_outputs()
        finalpred, pred = enc.predict(data)
        #print pred.shape, finalpred.shape
        #print pred[0], finalpred[0]
        for step in range(1, pred.shape[1]):
            unchanged = np.allclose(pred[:, step - 1, :], pred[:, step, :])
            self.assertEqual(unchanged, step >= seqlen)
Exemplo n.º 5
0
def run(epochs=50,
        numbats=25,
        lr=0.1,
        layers=1,
        embdim=100,
        encdim=200,
        bidir=False,
        mode="wordchar",        # "char" or "word" or "wordchar"
        maxlen=75,
        maxwordlen=15,
        ):
    """Train a two-output classifier on the hate-speech CSVs and save it.

    Reads train and test data through ``readdata``, prints the share of
    positive labels in both splits, builds an encoder chosen by ``mode``,
    wraps it in ``SMOWrap(outdim=2)``, trains it and saves the result.

    :param epochs: passed to the final ``train`` call
    :param numbats: passed to the final ``train`` call
    :param lr: learning rate given to ``adadelta``
    :param layers: number of encoder layers (``encdim`` is repeated this often)
    :param embdim: dimension of the input ``VectorEmbed``
    :param encdim: inner dimension of each encoder layer
    :param bidir: forwarded to the encoder
    :param mode: forwarded to ``readdata``; ``"wordchar"`` selects
        ``WordCharSentEnc``, anything else ``SimpleSeq2Vec``
    :param maxlen: forwarded to ``readdata``
    :param maxwordlen: not read by this function
    """
    maskid = -1
    (traindata, traingold), (testdata, testgold), dic = \
        readdata("../../../data/hatespeech/train.csv",
                 "../../../data/hatespeech/test.csv",
                 masksym=maskid, mode=mode, maxlen=maxlen)

    # data stats: the message is labelled "%", so scale the fraction by 100
    # (presumably gold labels are 0/1 -- verify against readdata)
    for split, gold in (("train", traingold), ("test", testgold)):
        print("class distribution in {}: {}% positive".format(
            split, 100. * np.sum(gold) / np.size(gold)))

    inpemb = VectorEmbed(indim=len(dic), dim=embdim)
    # one inner dimension per layer; kept separate from the scalar parameter
    encdims = [encdim] * layers
    if mode == "wordchar":
        enc = WordCharSentEnc(charemb=inpemb, charinnerdim=embdim,
                              wordemb=False, wordinnerdim=encdims,
                              maskid=maskid, bidir=bidir)
    else:
        enc = SimpleSeq2Vec(inpemb=inpemb, innerdim=encdims, maskid=maskid, bidir=bidir)

    m = SMOWrap(enc, outdim=2, nobias=True)
    #print enc.predict(traindata[:5, :])
    m = m.train([traindata], traingold)\
        .adadelta(lr=lr).grad_total_norm(1.0)\
        .cross_entropy().split_validate(6, random=True).cross_entropy().accuracy()\
        .train(numbats=numbats, epochs=epochs)

    # NOTE(review): the file name embeds the per-layer list (e.g. "Enc[200]D");
    # kept unchanged so previously saved model names still match.
    m.save("hatemodel.{}.Emb{}D.Enc{}D.{}L.model".format(mode, embdim, encdims, layers))
Exemplo n.º 6
0
    def test_auto_mask_within_seq2vec(self):
        """Verify that masked trailing positions leave the outputs unchanged.

        Builds a batch whose first ``seqlen`` positions carry random word and
        char ids and whose last ``seqblank`` positions are entirely ``-1``.
        Two neighbouring output steps must be ``allclose`` if and only if the
        later one has index ``>= seqlen``.
        """
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        batsize = 11
        seqlen, seqblank = 3, 2
        wordlen, wordblank = 3, 2
        numchars, numwords = 20, 100
        encdim, embdim, innerdim = 4, 50, 2

        def masked(shape):
            # block of int32 -1 values (the mask id)
            return np.zeros(shape).astype("int32") - 1

        wordpart = np.concatenate(
            [np.random.randint(0, numwords, (batsize, seqlen, 1)),
             masked((batsize, seqblank, 1))],
            axis=1)
        charpart = np.random.randint(0, numchars, (batsize, seqlen, wordlen))
        # first pad along the char axis, then along the sequence axis
        for pad, axis in ((masked((batsize, seqlen, wordblank)), 2),
                          (masked((batsize, seqblank, wordlen+wordblank)), 1)):
            charpart = np.concatenate([charpart, pad], axis=axis)
        data = np.concatenate([wordpart, charpart], axis=2)

        wordemb = WordEncoderPlusGlove(numchars=numchars, numwords=numwords,
                                       encdim=encdim, embdim=embdim,
                                       maskid=-1, embtrainfrac=0)
        rnn, lastdim = SimpleSeq2Vec.makernu(embdim + encdim, innerdim, bidir=False)
        enc = Seq2Vec(wordemb, rnn, maskid=-1)
        # NOTE(review): bare attribute access, no call -- presumably a
        # side-effecting property in this library version; if ``with_outputs``
        # is a plain method this line does nothing. TODO confirm.
        enc.enc.with_outputs
        finalpred, pred = enc.predict(data)
        #print pred.shape, finalpred.shape
        #print pred[0], finalpred[0]
        for i in range(1, pred.shape[1]):
            same = np.allclose(pred[:, i-1, :], pred[:, i, :])
            self.assertEqual(same, i >= seqlen)
Exemplo n.º 7
0
def run(
    epochs=10,
    numbats=100,
    numsam=10000,
    lr=0.1,
    datap="../../../data/simplequestions/datamat.char.pkl",
    innerdim=200,
    wreg=0.00005,
    bidir=False,
    keepmincount=5,
    mem=False,
    sameenc=False,
    memaddr="dot",
    memattdim=100,
    membidir=False,
    memlayers=1,
    memmaxwords=5,
    memmaxchars=20,
    layers=1,
):
    """Train a ``Seq2Idx`` model on the data at ``datap`` and print its test score.

    The encoder is a ``SimpleSeq2Vec`` over char ids.  With ``mem`` set the
    decoder is a ``MemVec2Idx`` over encoded memory data, otherwise a
    ``SimpleVec2Idx``.  ``numsam``, ``keepmincount`` and ``sameenc`` are not
    read by this function.
    """
    train, valid, test, chardic, entdic = readdata(datap)
    traindata, traingold = train
    validdata, validgold = valid
    testdata, testgold = test

    if mem:
        memdata = getcharmemdata(entdic, chardic,
                                 maxwords=memmaxwords, maxchar=memmaxchars)

    print("{} {}".format(traindata.shape, testdata.shape))

    numchars = max(chardic.values()) + 1
    numrels = max(entdic.values()) + 1
    print("{} {}".format(numchars, numrels))

    # a bidirectional encoder gets half the width per layer
    encinnerdim = [innerdim / 2 if bidir else innerdim] * layers

    enc = SimpleSeq2Vec(indim=numchars, inpembdim=None, innerdim=encinnerdim,
                        maskid=-1, bidir=bidir)

    if not mem:
        dec = SimpleVec2Idx(indim=innerdim, outdim=numrels)
    else:
        memdims = [innerdim / 2 if membidir else innerdim] * memlayers
        memenc = SimpleSeq2Vec(indim=numchars, inpembdim=None,
                               innerdim=memdims, maskid=-1, bidir=membidir)
        # map the string option to an addressing class; other values pass through
        if memaddr == "lin":
            memaddr = LinearGateMemAddr
        elif memaddr is None or memaddr == "dot":
            memaddr = DotMemAddr
        dec = MemVec2Idx(memenc, memdata, memdim=memdims,
                         memaddr=memaddr, memattdim=memattdim)

    m = Seq2Idx(enc, dec)

    trainer = m.train([traindata], traingold)
    trainer = trainer.adagrad(lr=lr).l2(wreg).grad_total_norm(1.0).cross_entropy()
    trainer = trainer.validate_on([validdata], validgold)
    trainer = trainer.accuracy().cross_entropy().takebest()
    m = trainer.train(numbats=numbats, epochs=epochs)

    pred = m.predict(testdata)
    print(pred.shape)
    evalres = evaluate(np.argmax(pred, axis=1), testgold)
    print(str(evalres) + "%")
Exemplo n.º 8
0
def run(
    epochs=10,
    numbats=100,
    negrate=1,
    lr=0.1,
    datap="../../../data/simplequestions/datamat.word.mem.fb2m.pkl",
    embdim=100,
    innerdim=200,
    wreg=0.00005,
    bidir=False,
    mem=False,
    membidir=False,
    memlayers=1,
    layers=1,
    testfirst=False,
    rankingloss=False,
    rlmargin=1.,
    charlevel=False,
    pool=False,
    resultsave=False,
    resultsavep="subjdetns.res.pkl",
):
    """Train a question/entity ``MatchScore`` with negative sampling and evaluate it.

    Questions are encoded by a ``SimpleSeq2Vec``.  Entities are encoded from
    their label rows in ``entmat`` when ``mem`` is set, otherwise embedded by
    id.  With ``testfirst`` the untrained scorer is evaluated and the process
    exits.  Otherwise the scorer is trained and then ranked on the test split,
    optionally saving results to ``resultsavep``.
    """
    tt = ticktock("script")
    tt.tick()
    train, valid, test, worddic, entdic, entmat = readdata(datap, charlevel)
    traindata, traingold = train
    validdata, validgold = valid
    testdata, testgold = test

    print(entmat.shape)
    print("{} {} {} {}".format(traindata.shape, traingold.shape,
                               testdata.shape, testgold.shape))

    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print("%d words, %d entities" % (numwords, numents))

    # a bidirectional encoder gets half the width per layer
    encinnerdim = [innerdim / 2 if bidir else innerdim] * layers

    # question representation: encodes the question sequence to a vector
    qenc = SimpleSeq2Vec(indim=numwords, inpembdim=embdim,
                         innerdim=encinnerdim, maskid=-1,
                         bidir=bidir, pool=pool)

    # entity representation
    if mem:
        # encode the entity label to a vector, sharing the question embeddings
        memdims = [innerdim / 2 if membidir else innerdim] * memlayers
        lenc = SimpleSeq2Vec(indim=numwords, inpembdim=embdim,
                             inpemb=qenc.inpemb, innerdim=memdims,
                             maskid=-1, bidir=membidir)
    else:
        # embed the entity id to a vector
        lenc = VectorEmbed(indim=numents, dim=innerdim)

    # question-entity score computation
    scorer = MatchScore(qenc, lenc)  # batched dot

    class PreProcf(object):
        """Replace gold entity ids by their rows of ``entmat``."""

        def __init__(self, entmat):
            self.em = Val(entmat)  # entmat: idx[word]^(numents, len(ent.name))

        def __call__(self, datas, gold):  # gold: idx^(batsize, )
            labels = self.em[gold, :]
            return (datas, labels), {}

    class NegIdxGen(object):
        """Draw random int32 ids in ``[0, rng)``, one per gold entry."""

        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):  # gold: idx^(batsize,)
            negatives = np.random.randint(self.min, self.max, gold.shape)
            return datas, negatives.astype("int32")

    if testfirst:
        evaluator = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                                 metrics=[ClassAccuracy(), RecallAt(5)])
        evalres = evaluator.eval(testdata, testgold,
                                 transform=PreProcf(entmat))
        for metric in evalres:
            print(metric)
        tt.msg("tested dummy")
        sys.exit()
    #embed()
    # trainer config and training
    if rankingloss:
        def obj(p, n):
            return (n - p + rlmargin).clip(0, np.infty)
    else:
        def obj(p, n):
            return n - p

    nstrainer = scorer.nstrain([traindata, traingold])
    nstrainer = nstrainer.transform(PreProcf(entmat))
    nstrainer = nstrainer.negsamplegen(NegIdxGen(numents))
    nstrainer = nstrainer.negrate(negrate).objective(obj)
    nstrainer = nstrainer.adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)
    nstrainer = nstrainer.validate_on([validdata, validgold])
    nscorer = nstrainer.train(numbats=numbats, epochs=epochs)

    # evaluation
    metrics = [ClassAccuracy()] + [RecallAt(k) for k in (1, 2, 5, 10)]
    evaluator = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                             metrics=metrics)

    evalres = evaluator.eval(testdata, testgold,
                             transform=PreProcf(entmat),
                             savep=resultsavep if resultsave else None)
    for evalre in evalres:
        print(evalre)
Exemplo n.º 9
0
def run(epochs=10,
        numbats=100,
        numsam=10000,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.wordchar.pkl",
        embdim=50,
        encdim=50,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        keepmincount=5,
        sameenc=False,
        memaddr="dot",
        memattdim=100,
        layers=1,
        embtrainfrac=0.0,
        mem=False,
        membidir=False,
        memlayers=1,
        sharedwordenc=False):
    """ Memory match-based glove-based word-level relation classification

    Word ids (channel 0 of every data matrix) are remapped through ``d2g``
    before training.  ``numsam``, ``keepmincount`` and ``sameenc`` are not
    read by this function.
    """
    train, valid, test, worddic, chardic, entdic = readdata(datap)
    traindata, traingold = train
    validdata, validgold = valid
    testdata, testgold = test

    # get words from relation names, update word dic
    memdata = getmemdata(entdic, worddic, chardic)

    # get glove and transform word mats to glove index space
    d2g, newdic, glove = getdic2glove(worddic,
                                      dim=embdim,
                                      trainfrac=embtrainfrac)

    def to_glove_space(mat):
        # remap channel 0 (word ids) and leave the remaining channels as they are
        wordids = np.vectorize(d2g)(mat[..., 0]).reshape(mat.shape[:2] + (1,))
        return np.concatenate([wordids, mat[..., 1:]], axis=2)

    traindata, validdata, testdata, memdata = [
        to_glove_space(mat)
        for mat in (traindata, validdata, testdata, memdata)
    ]

    print("{} {}".format(traindata.shape, testdata.shape))
    #embed()

    numwords = max(worddic.values()) + 1  # don't use this, use glove
    numchars = max(chardic.values()) + 1
    numrels = max(entdic.values()) + 1

    # a bidirectional encoder gets half the width per layer
    encinnerdim = [innerdim / 2 if bidir else innerdim] * layers

    def make_wordemb():
        return WordEncoderPlusGlove(numchars=numchars,
                                    encdim=encdim,
                                    embdim=embdim,
                                    maskid=-1,
                                    embtrainfrac=embtrainfrac)

    wordemb = make_wordemb()
    rnn, lastdim = SimpleSeq2Vec.makernu(embdim + encdim,
                                         encinnerdim,
                                         bidir=bidir)
    enc = Seq2Vec(wordemb, rnn, maskid=-1)

    if not mem:
        dec = SimpleVec2Idx(indim=innerdim, outdim=numrels)
    else:
        memdims = [innerdim / 2 if membidir else innerdim] * memlayers
        # reuse the question word encoder only when asked to
        memwordemb = wordemb if sharedwordenc else make_wordemb()
        memrnn, memlastdim = SimpleSeq2Vec.makernu(embdim + encdim,
                                                   memdims,
                                                   bidir=membidir)
        memenc = Seq2Vec(memwordemb, memrnn, maskid=-1)
        # map the string option to an addressing class; other values pass through
        if memaddr == "lin":
            memaddr = LinearGateMemAddr
        elif memaddr is None or memaddr == "dot":
            memaddr = DotMemAddr
        dec = MemVec2Idx(memenc,
                         memdata,
                         memdim=memdims,
                         memaddr=memaddr,
                         memattdim=memattdim)

    m = Seq2Idx(enc, dec)

    trainer = m.train([traindata], traingold)
    trainer = trainer.adagrad(lr=lr).l2(wreg).grad_total_norm(1.0).cross_entropy()
    trainer = trainer.validate_on([validdata], validgold)
    trainer = trainer.accuracy().cross_entropy().takebest()
    m = trainer.train(numbats=numbats, epochs=epochs)

    pred = m.predict(testdata)
    print(pred.shape)
    evalres = evaluate(np.argmax(pred, axis=1), testgold)
    print(str(evalres) + "%")
Exemplo n.º 10
0
def run(
    epochs=50,
    mode="char",  # "char" or "word" or "charword"
    numbats=1000,
    lr=0.1,
    wreg=0.000001,
    bidir=False,
    layers=1,
    encdim=200,
    decdim=200,
    embdim=100,
    negrate=1,
    margin=1.,
    hingeloss=False,
    debug=False,
    preeval=False,
    sumhingeloss=False,
    checkdata=False,  # starts interactive shell for data inspection
    printpreds=False,
    subjpred=False,
    predpred=False,
    specemb=-1,
    usetypes=False,
    evalsplits=50,
    cosine=False,
    loadmodel=False,
):
    """Train (or load) a ``SeqMatchScore`` ranker, evaluate it and save the results.

    Steps, in order: read the data with ``readdata``; build the entity
    encoder (``CustomEntEnc``) and the input encoder (``CustomSeq2Pair``);
    combine them in a ``SeqMatchScore``; optionally evaluate before training
    (``preeval``); train with negative sampling unless ``loadmodel`` is set,
    in which case the scorer is loaded from "customfullrank.scorer.save";
    run ``CustomRankSearch`` on the test split; write the command line and the
    evaluation results to the first unused ``<script>.results/<i>.res`` file.

    ``debug`` overrides several of the other arguments (see below).
    """
    if debug:  # debug settings
        sumhingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 10
        printpreds = True
        # whatpred is fixed to "all", so neither branch below is taken
        whatpred = "all"
        if whatpred == "pred":
            predpred = True
        elif whatpred == "subj":
            subjpred = True
        preeval = True
        #specemb = 100
        margin = 1.
        evalsplits = 1
        #usetypes=True
        #mode = "charword"
        #checkdata = True
    # load the right file
    maskid = -1
    tt = ticktock("script")
    # NOTE(review): readdata below is called with specids=True regardless of
    # this flag; the flag only decides whether EntEmbEnc wraps subjenc.
    specids = specemb > 0
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, relstarts, canids, wordmat, chardic\
        = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True,
                   usetypes=usetypes, maskid=maskid)
    entmat = entmat.astype("int32")

    if checkdata:
        # reverse dictionaries (id -> key) for inspection in the shell
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def p(xids):
            # join the keys of all non-negative ids; space-separated in "word" mode
            return (" " if mode == "word" else "").join(
                [rwd[xid] if xid > -1 else "" for xid in xids])

        embed()

    print traindata.shape, traingold.shape, testdata.shape, testgold.shape

    tt.tock("data loaded")

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    # per-layer inner dimensions; halved when bidirectional
    if bidir:
        encinnerdim = [encdim / 2] * layers
    else:
        encinnerdim = [encdim] * layers

    # the entity-side encoder reuses the input-side layer count and direction
    memembdim = embdim
    memlayers = layers
    membidir = bidir
    if membidir:
        decinnerdim = [decdim / 2] * memlayers
    else:
        decinnerdim = [decdim] * memlayers

    emb = VectorEmbed(numwords, embdim)

    subjenc = EntEnc(
        SimpleSeq2Vec(invocsize=numwords,
                      inpembdim=embdim,
                      innerdim=decinnerdim,
                      maskid=maskid,
                      bidir=membidir))

    numentembs = len(np.unique(entmat[:, 0]))
    repsplit = entmat[relstarts, 0]
    if specids:  # include vectorembedder
        subjenc = EntEmbEnc(subjenc, numentembs, specemb)
    predenc = VectorEmbed(indim=numents - relstarts + 1,
                          dim=subjenc.outdim,
                          init="zero")
    entenc = CustomEntEnc(subjenc, predenc, repsplit)

    inpenc = CustomSeq2Pair(inpemb=emb,
                            encdim=encinnerdim,
                            scadim=encinnerdim,
                            enclayers=layers,
                            scalayers=layers,
                            bidir=bidir,
                            maskid=maskid,
                            outdim=subjenc.outdim)

    # adjust params for enc/dec construction
    # encinnerdim[-1] += specemb
    # innerdim[-1] += specemb

    dist = DotDistance() if not cosine else CosineDistance()
    scorerkwargs = {"argproc": lambda x, y: ((x, ), (y, )), "scorer": dist}
    if sumhingeloss:
        scorerkwargs["aggregator"] = lambda x: x  # no aggregation of scores
    scorer = SeqMatchScore(inpenc, entenc, **scorerkwargs)

    class PreProc(object):
        # Applies PreProcL to the encoder input (only if a wordmat was given)
        # and PreProcE to the gold ids. PreProcE/PreProcL are defined further
        # down; they are looked up when an instance is created.
        def __init__(self, entmat, wordmat=None):
            self.f = PreProcE(entmat)
            self.w = PreProcL(wordmat) if wordmat is not None else wordmat

        def __call__(self, encdata, decgold):  # gold: idx^(batsize, seqlen)
            if self.w is not None:
                encdata = self.w(encdata)[0][0]
            if self.f is not None:
                decgold = self.f(decgold)[0][0]
            return (encdata, decgold), {}

    class PreProcE(object):
        # Looks up rows of entmat by id.
        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    class PreProcL(object):
        # Looks up rows of wordmat by id.
        def __init__(self, wordmat):
            self.em = Val(wordmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    # no wordmat is passed, so only the gold ids are transformed
    transf = PreProc(entmat)

    class NegIdxGen(object):
        # Draws, per gold row, one random id in [min, midsplit) and one in
        # [midsplit, max) and returns them as an int32 (batsize, 2) matrix.
        def __init__(self, rng, midsplit):
            self.min = 0
            self.max = rng
            self.midsplit = midsplit

        def __call__(self, datas, gold):
            entrand = np.random.randint(self.min, self.midsplit,
                                        (gold.shape[0], 1))
            relrand = np.random.randint(self.midsplit, self.max,
                                        (gold.shape[0], 1))
            ret = np.concatenate([entrand, relrand], axis=1)
            return datas, ret.astype("int32")

    #embed()

    # objective on (positive score p, negative score n); later assignments
    # override earlier ones, so sumhingeloss wins over hingeloss
    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    if sumhingeloss:  #
        obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1)

    # embed()
    # eval
    if preeval:
        # evaluate the scorer before any training
        tt.tick("pre-evaluating")
        s = CustomRankSearch(inpenc,
                             entenc,
                             scorer.s,
                             scorer.agg,
                             relstarts=relstarts)
        eval = FullRankEval()
        pred, scores = s.search(testdata,
                                testgold.shape[1],
                                candata=entmat,
                                canids=canids,
                                split=evalsplits,
                                transform=transf.f,
                                debug=printpreds)
        evalres = eval.eval(pred, testgold, debug=debug)
        for k, evalre in evalres.items():
            print("{}:\t{}".format(k, evalre))
        tt.tock("pre-evaluated")

    if not loadmodel:
        tt.tick("training")
        nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \
            .negsamplegen(NegIdxGen(numents, relstarts)).negrate(negrate).objective(obj) \
            .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
            .validate_on([validdata, validgold]) \
            .train(numbats=numbats, epochs=epochs)
        tt.tock("trained")
        scorer.save("customfullrank.scorer.save")
    else:
        # NOTE(review): the search below still receives the inpenc/entenc
        # objects built above, not ones taken from the loaded scorer --
        # confirm this is intended.
        scorer = SeqMatchScore.load("customfullrank.scorer.save")

    # eval
    tt.tick("evaluating")

    s = CustomRankSearch(inpenc,
                         entenc,
                         scorer.s,
                         scorer.agg,
                         relstarts=relstarts)
    eval = FullRankEval()
    pred, scores = s.search(testdata,
                            testgold.shape[1],
                            candata=entmat,
                            canids=canids,
                            split=evalsplits,
                            transform=transf.f,
                            debug=printpreds)
    if printpreds:
        print pred
    # subjpred takes precedence over predpred; False when neither is set
    debugarg = "subj" if subjpred else "pred" if predpred else False
    evalres = eval.eval(pred, testgold, debug=debugarg)
    for k, evalre in evalres.items():
        print("{}:\t{}".format(k, evalre))
    tt.tock("evaluated")

    # save
    # results go to "<script name without extension>.results/<i>.res"
    basename = os.path.splitext(os.path.basename(__file__))[0]
    dirname = basename + ".results"
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    savenamegen = lambda i: "{}/{}.res".format(dirname, i)
    savename = None
    # pick the first index in 0..999 whose file does not exist yet
    for i in xrange(1000):
        savename = savenamegen(i)
        if not os.path.exists(savename):
            break
        savename = None
    if savename is None:
        raise Exception("exceeded number of saved results")
    with open(savename, "w") as f:
        # first line records the command line, then one line per metric
        f.write("{}\n".format(" ".join(sys.argv)))
        for k, evalre in evalres.items():
            f.write("{}:\t{}\n".format(k, evalre))
Exemplo n.º 11
0
def run(
        epochs=10,
        numbats=100,
        numsam=10000,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.wordchar.pkl",
        embdim=50,
        encdim=50,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        keepmincount=5,
        sameenc=False,
        memaddr="dot",
        memattdim=100,
        layers=1,
        embtrainfrac=0.0,
        mem=False,
        membidir=False,
        memlayers=1,
        sharedwordenc=False
        ):
    """ Memory match-based glove-based word-level relation classification

    Reads the splits from ``datap``, remaps the word-id channel of every data
    matrix through ``d2g``, trains a ``Seq2Idx`` model and prints its test
    result.  ``numsam``, ``keepmincount`` and ``sameenc`` are not read here.
    """
    splits = readdata(datap)
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        worddic, chardic, entdic = splits

    # get words from relation names, update word dic
    memdata = getmemdata(entdic, worddic, chardic)

    # get glove and transform word mats to glove index space
    d2g, newdic, glove = getdic2glove(worddic, dim=embdim, trainfrac=embtrainfrac)
    remapped = []
    for mat in [traindata, validdata, testdata, memdata]:
        # channel 0 holds the word ids; the other channels are kept untouched
        gloveids = np.vectorize(d2g)(mat[..., 0]).reshape(mat.shape[:2] + (1,))
        remapped.append(np.concatenate([gloveids, mat[..., 1:]], axis=2))
    traindata, validdata, testdata, memdata = remapped

    print("{} {}".format(traindata.shape, testdata.shape))
    #embed()

    numwords = max(worddic.values()) + 1    # don't use this, use glove
    numchars = max(chardic.values()) + 1
    numrels = max(entdic.values()) + 1

    # a bidirectional encoder gets half the width per layer
    layerdim = innerdim/2 if bidir else innerdim
    encinnerdim = [layerdim]*layers

    gloveargs = dict(numchars=numchars, encdim=encdim, embdim=embdim,
                     maskid=-1, embtrainfrac=embtrainfrac)
    wordemb = WordEncoderPlusGlove(**gloveargs)
    rnn, lastdim = SimpleSeq2Vec.makernu(embdim+encdim, encinnerdim, bidir=bidir)
    enc = Seq2Vec(wordemb, rnn, maskid=-1)

    if mem:
        memlayerdim = innerdim/2 if membidir else innerdim
        memdims = [memlayerdim]*memlayers
        if sharedwordenc:
            memwordemb = wordemb
        else:
            memwordemb = WordEncoderPlusGlove(**gloveargs)
        memrnn, memlastdim = SimpleSeq2Vec.makernu(embdim+encdim, memdims, bidir=membidir)
        memenc = Seq2Vec(memwordemb, memrnn, maskid=-1)
        # map the string option to an addressing class; other values pass through
        if memaddr == "lin":
            memaddr = LinearGateMemAddr
        elif memaddr is None or memaddr == "dot":
            memaddr = DotMemAddr
        dec = MemVec2Idx(memenc, memdata, memdim=memdims, memaddr=memaddr, memattdim=memattdim)
    else:
        dec = SimpleVec2Idx(indim=innerdim, outdim=numrels)

    m = Seq2Idx(enc, dec)

    training = m.train([traindata], traingold).adagrad(lr=lr).l2(wreg)
    training = training.grad_total_norm(1.0).cross_entropy()
    training = training.validate_on([validdata], validgold)
    training = training.accuracy().cross_entropy().takebest()
    m = training.train(numbats=numbats, epochs=epochs)

    pred = m.predict(testdata)
    print(pred.shape)
    evalres = evaluate(np.argmax(pred, axis=1), testgold)
    print(str(evalres) + "%")
Exemplo n.º 12
0
def run(
    epochs=50,
    mode="char",  # "char" or "word" or "charword"
    numbats=1000,
    lr=0.1,
    wreg=0.000001,
    bidir=False,
    layers=1,
    encdim=200,
    decdim=200,
    embdim=100,
    negrate=1,
    margin=1.,
    hingeloss=False,
    debug=False,
    preeval=False,
    sumhingeloss=False,
    checkdata=False,  # starts interactive shell for data inspection
    printpreds=False,
    subjpred=False,
    predpred=False,
    specemb=-1,
    usetypes=False,
    evalsplits=50,
    cosine=False,
    loadmodel=False,
):
    """Train a sequence-vs-relation matching scorer and rank relations on test.

    The input sequences are encoded with a ``SimpleSeq2Vec`` over a
    ``VectorEmbed`` word embedding; relations are embedded directly with a
    second ``VectorEmbed``.  Both sides are compared by a ``MatchScore`` using
    dot or cosine distance, trained with negative sampling.  After training,
    every test encoding is scored against every relation embedding and
    accuracy, MRR and recall@10/50/100 are printed.  The function then opens
    an interactive shell (``embed()``) before returning.

    Parameters used by this variant:
        epochs, numbats, lr, wreg: training schedule / adagrad / L2 settings.
        mode: forwarded to ``readdata``; also picks the join character in the
            ``checkdata`` pretty-printer.
        bidir, layers, encdim: shape of the input encoder (``encdim`` is
            halved per direction when ``bidir`` is set).
        decdim: dimension of the relation embeddings.
        embdim: dimension of the input word embeddings.
        negrate: number of negative samples per positive.
        margin, hingeloss: ``hingeloss`` switches the objective from ``n - p``
            to ``max(0, n - p + margin)``.
        debug: overrides several settings with small values (see below) and
            is forwarded to ``readdata``.
        checkdata: open an interactive shell right after loading the data.
        usetypes: forwarded to ``readdata``.
        cosine: use ``CosineDistance`` instead of ``DotDistance``.

    ``preeval``, ``sumhingeloss``, ``printpreds``, ``subjpred``, ``predpred``,
    ``specemb``, ``evalsplits`` and ``loadmodel`` are accepted for interface
    compatibility but are not read by this function.
    """
    if debug:  # debug settings
        hingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 1
        printpreds = True
        preeval = True
        # specemb = 100
        margin = 1.
        evalsplits = 1
        # usetypes=True
        mode = "charword"
        # checkdata = True

    # load the right file
    maskid = -1
    tt = ticktock("script")
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, relstarts, canids, wordmat, chardic \
        = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True,
                   usetypes=usetypes, maskid=maskid)
    entmat = entmat.astype("int32")
    # transform for predpred: keep only the second gold column and re-base the
    # ids so that the first relation gets index 0
    traingold = traingold[:, 1] - relstarts
    validgold = validgold[:, 1] - relstarts
    testgold = testgold[:, 1] - relstarts

    if checkdata:
        # reverse dictionaries and a pretty-printer, for use in the shell
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def p(xids):
            return (" " if mode == "word" else "").join(
                [rwd[xid] if xid > -1 else "" for xid in xids])

        embed()

    print("{} {} {} {}".format(traindata.shape, traingold.shape,
                               testdata.shape, testgold.shape))

    tt.tock("data loaded")

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print("%d words, %d entities" % (numwords, numents))

    # a bidirectional encoder splits the requested size over both directions
    if bidir:
        encinnerdim = [encdim // 2] * layers
    else:
        encinnerdim = [encdim] * layers

    emb = VectorEmbed(numwords, embdim)
    predemb = VectorEmbed(numents - relstarts + 1, decdim, init="uniform")
    inpenc = SimpleSeq2Vec(inpemb=emb,
                           inpembdim=emb.outdim,
                           innerdim=encinnerdim,
                           maskid=maskid,
                           bidir=bidir,
                           layers=layers)

    dist = DotDistance() if not cosine else CosineDistance()
    scorerkwargs = {"argproc": lambda x, y: ((x, ), (y, )), "scorer": dist}
    scorer = MatchScore(inpenc, predemb, **scorerkwargs)

    class PreProc(object):
        """Maps gold ids through ``entmat`` and, optionally, data through
        ``wordmat``."""

        def __init__(self, entmat, wordmat=None):
            self.f = PreProcE(entmat)
            self.w = PreProcL(wordmat) if wordmat is not None else wordmat

        def __call__(self, encdata, decgold):  # gold: idx^(batsize, seqlen)
            if self.w is not None:
                encdata = self.w(encdata)[0][0]
            if self.f is not None:
                decgold = self.f(decgold)[0][0]
            return (encdata, decgold), {}

    class PreProcE(object):
        """Index lookup into a ``Val``-wrapped entity matrix."""

        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    class PreProcL(object):
        """Index lookup into a ``Val``-wrapped word matrix."""

        def __init__(self, wordmat):
            self.em = Val(wordmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    # NOTE: built but not attached to the trainer below
    transf = PreProc(entmat)

    class NegIdxGen(object):
        """Replaces each gold id by a uniformly drawn id in ``[0, rng)``."""

        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            predrand = np.random.randint(self.min, self.max, (gold.shape[0], ))
            return datas, predrand.astype("int32")

    # embed()

    # p = score of the positive pair, n = score of the negative pair
    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.inf)

    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingold]) \
        .negsamplegen(NegIdxGen(numents - relstarts))\
        .negrate(negrate).objective(obj) \
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
        .validate_on([validdata, validgold]) \
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")

    # eval: score every test example against every relation embedding
    canids = np.arange(start=0, stop=numents - relstarts)
    predembs = predemb.predict(canids)  # (numrels, embdim)
    tt.tick("evaluating")
    predencs = inpenc.predict(testdata)  # (batsize, embdim)
    numexamples = predencs.shape[0]
    numcans = predembs.shape[0]
    scores = np.zeros((numexamples, numcans))
    for i in range(numexamples):
        # tile the i-th encoding so it lines up with every candidate embedding
        scores[i, :] = \
            scorer.s.predict(np.repeat(predencs[np.newaxis, i],
                                       numcans, axis=0),
                             predembs)
        tt.progress(i, numexamples, live=True)
    best = np.argmax(scores, axis=1)
    # per example: (candidate id, score) pairs, best score first
    sortedbest = [
        sorted(zip(np.arange(scores.shape[1]), list(scores[i])),
               reverse=True,
               key=lambda pair: pair[1]) for i in range(scores.shape[0])
    ]
    sortedbestmat = np.array([[x for (x, y) in z] for z in sortedbest],
                             dtype="int32")
    # MRR: column i of sortedbestmat holds the rank-(i+1) candidate
    mrr = 0.0
    for i in range(sortedbestmat.shape[1]):
        mrr += np.sum(sortedbestmat[:, i] == testgold) * 1. / (i + 1)
    mrr /= testgold.shape[0]
    # Accuracy
    accuracy = np.sum(best == testgold) * 1. / testgold.shape[0]

    # R@X
    def ratx(ratnum):
        return rat(ratnum, sortedbestmat, testgold)

    def rat(ratnum, sortedpred, gold):
        """Fraction of rows of ``gold`` found in the first ``ratnum`` columns
        of ``sortedpred``."""
        acc = 0.0
        # bound and normalise by the arguments, not by the enclosing locals
        for i in range(min(ratnum, sortedpred.shape[1])):
            acc += 1.0 * np.sum(sortedpred[:, i] == gold)
        acc /= gold.shape[0]
        return acc

    print("Accuracy: {}%".format(accuracy * 100))
    print("MRR: {}".format(mrr))
    print("Recall: @10: {}%\t @50: {}%\t @100: {}%".format(
        ratx(10) * 100,
        ratx(50) * 100,
        ratx(100) * 100))
    embed()

    tt.tock("evaluated")
Exemplo n.º 13
0
def run(
    epochs=10,
    numbats=100,
    negrate=1,
    lr=0.1,
    embdim=50,
    encdim=50,
    wreg=0.00005,
    marginloss=False,
    margin=1.,
    cosine=False,
    bidir=False,
):
    """Train a character-level word encoder to match Glove word vectors.

    Builds a character-id matrix for every word in ``Glove(encdim).D`` (row 0
    is a single space, remaining cells are padded with -1), pickles the
    character dictionary to ``glove2c2w.chardic.pkl``, trains a
    ``SimpleSeq2Vec`` character encoder against ``g.block`` through a
    ``MatchScore`` with negative sampling, and saves the encoder to
    ``glove2c2w.block``.

    Parameters:
        epochs, numbats, lr, wreg: training schedule / adagrad / L2 settings.
        negrate: number of negative samples per positive.
        embdim: character embedding dimension.
        encdim: Glove dimension and encoder output size (halved per direction
            when ``bidir`` is set).
        marginloss, margin: ``marginloss`` switches the objective from
            ``n - p`` to ``max(0, n - p + margin)``.
        cosine: use ``CosineDistance`` instead of ``EuclideanDistance``.
        bidir: make the character encoder bidirectional.
    """
    tt = ticktock("script")
    # get glove words; materialised as a list because rows are assigned by
    # position further down
    g = Glove(encdim)
    words = list(g.D.keys())
    maxwordlen = 0
    for word in words:
        maxwordlen = max(maxwordlen, len(word))
    chars = set("".join(words))
    chars.add(" ")
    print("{} words, maxlen {}, {} characters in words".format(
        len(words), maxwordlen, len(chars)))
    # get char word matrix
    chardic = dict(zip(chars, range(len(chars))))
    # binary mode as required by pickle; the with-block closes the handle
    with open("glove2c2w.chardic.pkl", "wb") as chardicfile:
        pickle.dump(chardic, chardicfile)
    charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32")
    charwordmat[0, 0] = chardic[" "]
    for i, word in enumerate(words):
        # row 0 is reserved for the single-space entry, hence the +1 offset
        charwordmat[i + 1, :len(word)] = [chardic[x] for x in word]
    print(charwordmat[0])
    # encode characters
    cwenc = SimpleSeq2Vec(indim=len(chars),
                          inpembdim=embdim,
                          innerdim=encdim // 2 if bidir else encdim,
                          maskid=-1,
                          bidir=bidir)
    dist = CosineDistance() if cosine else EuclideanDistance()  #DotDistance()
    print("using " + str(dist))
    scorer = MatchScore(cwenc, g.block, scorer=dist)
    '''
    scorer.train([charwordmat, np.arange(len(words)+1)], np.ones((charwordmat.shape[0],), dtype="int32") * (-1 if cosine else 1))\
        .linear_objective().adagrad(lr=lr).l2(wreg)\
        .train(numbats=numbats, epochs=epochs)

    #embed()
    '''
    class NegIdxGen(object):
        """Replaces each gold id by a uniformly drawn id in ``[0, rng)``."""

        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            return datas, np.random.randint(self.min, self.max,
                                            gold.shape).astype("int32")

    # p = score of the positive pair, n = score of the negative pair
    if marginloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.inf)
    else:
        obj = lambda p, n: n - p

    nscorer = scorer.nstrain([charwordmat, np.arange(len(words)+1)])\
        .negsamplegen(NegIdxGen(len(words))).negrate(negrate)\
        .objective(obj).adagrad(lr=lr).l2(wreg)\
        .train(numbats=numbats, epochs=epochs)

    cwenc.save("glove2c2w.block")
Exemplo n.º 14
0
def run(epochs=50,
        numbats=700,
        lr=1.,
        wreg=0.000001,
        bidir=False,
        layers=1,
        embdim=200,
        encdim=400,
        decdim=400,
        negrate=1,
        margin=1.,
        hingeloss=False,
        debug=False,
        checkdata=False,
        predencode=False,
        closenegsam=False,
        glove=False,
        atleastcan=0,
        wordchar=False,
        charencmode="rnn",  # rnn or cnn
        totalrandomtest=False,
        rarewords=0,
        ):
    """Train a question-vs-predicate cosine matcher and report test accuracy.

    A question encoder (word-level ``SimpleSeq2Vec``, or a word+character
    encoder when ``wordchar`` is set) is matched against a predicate
    representation (``VectorEmbed``, or a ``MemVec``-wrapped sequence encoder
    when ``predencode`` is set) with ``CosineDistance`` and negative sampling.
    At test time each question is scored against its candidate list from
    ``testsubjsrels`` and the best-scoring candidate is compared to the gold.

    Parameters:
        epochs, numbats, lr, wreg: training schedule / adagrad / L2 settings.
        bidir, layers, encdim: question encoder shape (``encdim`` is halved
            per direction when ``bidir`` is set).
        embdim: word embedding dimension.
        decdim: predicate representation dimension.
        negrate: number of negative samples per positive.
        margin, hingeloss: ``hingeloss`` switches the objective from ``n - p``
            to ``max(0, n - p + margin)``.
        debug: open an interactive shell after every scored test example.
        checkdata: open an interactive shell right after loading the data.
        predencode: encode predicates from ``entmat`` instead of embedding
            their ids.
        closenegsam: draw negatives from ``buildsamplespace`` neighbourhoods
            when the gold has more than 5 of them, uniformly otherwise.
        glove: use adapted Glove embeddings (not combinable with
            ``rarewords``).
        atleastcan: pad candidate lists with random ids up to this size.
        wordchar, charencmode: enable the word+character question encoder and
            pick its character encoder ("rnn" or "cnn").
        totalrandomtest: use only the gold id as the initial candidate list.
        rarewords: words seen at most this many times are mapped to
            ``<RARE>``; 0 disables.

    Raises:
        Exception: if ``glove`` and ``rarewords`` are combined, or if the word
            embedding matrix cannot be located.
    """
    maskid = -1
    tt = ticktock("predpred")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, testsubjsrels = readdata(wordchar=wordchar)

    if closenegsam:
        revsamplespace, revind = buildsamplespace(entmat, worddic)

    tt.tock("data loaded")
    if checkdata:
        # reverse dictionaries and a pretty-printer, for use in the shell
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}
        def pp(widxs):
            print(" ".join([rwd[x] if x in rwd else "" for x in widxs]))
        embed()

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1

    if rarewords > 0:
        rwd = {v: k for k, v in worddic.items()}
        print("doing rare words")
        trainwordcounts = getmatrixvaluecounts(traindata, entmat)
        # (word id, count) pairs, most frequent first
        stwc = sorted(trainwordcounts.items(), key=lambda pair: pair[1],
                      reverse=True)
        # a real list: its length is needed just below
        fstwc = [(k, v) for k, v in stwc if v > rarewords]
        redwdic = dict(zip([rwd[k] for k, v in fstwc if k != maskid and k in rwd],
                           range(1, len(fstwc)+1)))
        redwdic["<RARE>"] = 0
        #embed()
    # a bidirectional encoder splits the requested size over both directions
    if bidir:
        encdim = [encdim // 2] * layers
    else:
        encdim = [encdim] * layers

    # question-side model
    if glove:
        if rarewords > 0:
            raise Exception("glove with rare words currently not supported")
        wordemb = Glove(embdim).adapt(worddic)
    else:
        if rarewords > 0:
            wordemb = WordEmb(dim=embdim, worddic=redwdic).adapt(worddic)
            #embed()
        else:
            wordemb = WordEmb(dim=embdim, worddic=worddic)
    if wordchar:
        print("wordchar model")
        numchars = 256
        if charencmode == "cnn":
            print("using CNN char encoder")
            charenc = CNNSeqEncoder(indim=numchars, inpembdim=50, innerdim=[embdim]*2,
                                    maskid=maskid, stride=1)
            wordenc = RNNSeqEncoder(inpemb=False, inpembdim=wordemb.outdim+embdim,
                                    innerdim=encdim, bidir=bidir).maskoptions(MaskMode.NONE)
            question_enc = TwoLevelEncoder(l1enc=charenc, l2emb=wordemb,
                                           l2enc=wordenc, maskid=maskid)
        else:
            question_enc = WordCharSentEnc(numchars=256, charembdim=50, charinnerdim=embdim,
                                           wordemb=wordemb, wordinnerdim=encdim, maskid=maskid,
                                           bidir=bidir)
    else:
        question_enc = SimpleSeq2Vec(inpemb=wordemb,
                                     inpembdim=wordemb.outdim,
                                     innerdim=encdim,
                                     maskid=maskid,
                                     bidir=bidir,
                                     layers=layers)

    # predicate-side model
    if predencode:
        predemb = MemVec(SimpleSeq2Vec(inpemb=wordemb,
                                inpembdim=wordemb.outdim,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=bidir,
                                layers=layers)
                         )
        predemb.load(entmat)
        """
        predemb = SimpleSeq2Vec(inpemb=wordemb,
                                inpembdim=wordemb.outdim,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=bidir,
                                layers=layers)

        class PreProc(object):
            def __init__(self, entmat):
                self.f = PreProcE(entmat)

            def __call__(self, encdata, decgold):
                return (encdata, self.f(decgold)[0][0]), {}

        class PreProcE(object):
            def __init__(self, entmat):
                self.em = Val(entmat)

            def __call__(self, x):
                return (self.em[x],), {}

        transf = PreProc(entmat)
        predtransf = transf.f
        """
    else:
        predemb = VectorEmbed(numents, decdim)
        """transf = None
        predtransf = None"""

    # scoring
    scorer = MatchScore(question_enc, predemb, scorer=CosineDistance())

    class NegIdxGen(object):
        """Replaces each gold id by a uniformly drawn id in ``[0, rng)``."""

        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            predrand = np.random.randint(self.min, self.max, gold.shape)
            return datas, predrand.astype("int32")

    class NegIdxGenClose(object):
        """Draws each negative from the gold's sample set when that set has
        more than 5 members, uniformly from ``[0, rng)`` otherwise."""

        def __init__(self, revsamsp, rng):
            self.revsamsp = revsamsp
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            ret = np.zeros_like(gold)
            for i in range(gold.shape[0]):
                sampleset = self.revsamsp[gold[i]]
                if len(sampleset) > 5:
                    ret[i] = random.sample(sampleset, 1)[0]
                else:
                    ret[i] = np.random.randint(self.min, self.max)
            #embed()
            return datas, ret.astype("int32")

    # p = score of the positive pair, n = score of the negative pair
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.inf)
    else:
        obj = lambda p, n: n - p

    if closenegsam:
        tt.msg("using close neg sampler")
        negidxgen = NegIdxGenClose(revsamplespace, numents)
    else:
        negidxgen = NegIdxGen(numents)

    def _embedding_values():
        """Return the current value of the word embedding matrix."""
        embvar = wordemb.W
        if embvar is None:
            # wrapped embedders keep the matrix on their inner block
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        return embvar.d.get_value()

    # snapshot the embeddings so we can report whether training moved them
    checkembschange = True
    if checkembschange:
        #embed()
        embvals = _embedding_values()
    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingold]) \
                .negsamplegen(negidxgen) \
                .negrate(negrate) \
                .objective(obj) \
                .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
                .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")
    if checkembschange:
        newembvals = _embedding_values()
        embschanged = not np.allclose(embvals, newembvals)
        sumsqdiff = np.sum((newembvals - embvals)**2)
        print("Embeddings {}: {} sum of square diffs"
              .format("changed" if embschanged else "did not change", sumsqdiff))

    # evaluation
    tt.tick("evaluating")
    qenc_pred = question_enc.predict(testdata)
    scores = []
    dontembed = True    # flip by hand to inspect each candidate list
    if atleastcan > 0:
        print("ensuring at least {} cans".format(atleastcan))
    if totalrandomtest:
        print("total randomness")
    for i in range(qenc_pred.shape[0]):
        if totalrandomtest:
            cans = [testgold[i]]
        else:
            cans = testsubjsrels[i][0] #+ testsubjsrels[i][1]
        if len(cans) < atleastcan:
            # oversample by 50 so enough ids survive de-duplication
            extracans = list(np.random.randint(0, numents, (atleastcan+50,)))
            extracans = list(set(extracans).difference(set(cans)))
            cans = cans + extracans[:max(0, min(len(extracans), atleastcan - len(cans)))]
            #print len(cans), cans
        if not dontembed:
            embed()
        #cans = set(cans)
        #if atleastcan > 0:
        #    while len(cans) < atleastcan:
        #        rancan = np.random.randint(0, numents)
        #        if rancan not in cans:
        #            cans.add(rancan)
        #cans = list(cans)
        if len(cans) == 0:
            # no candidates: record a sentinel prediction of -1
            scores.append([(-1, -np.inf)])
            continue
        #canembs = predemb.predict.transform(predtransf)(cans)
        canembs = predemb.predict(cans)
        # tile the i-th question encoding so it lines up with every candidate
        scoresi = scorer.s.predict(np.repeat(qenc_pred[np.newaxis, i],
                                             canembs.shape[0], axis=0),
                                   canembs)
        scores.append(list(zip(cans, scoresi)))
        if debug:
            embed()
        tt.progress(i, qenc_pred.shape[0], live=True)
    # per example: (candidate id, score) pairs, best score first
    sortedbest = [sorted(cansi, key=lambda pair: pair[1], reverse=True)
                  for cansi in scores]
    best = [sortedbesti[0][0] for sortedbesti in sortedbest]
    # Accuracy
    accuracy = np.sum(np.asarray(best) == testgold) * 1. / testgold.shape[0]


    print("Accuracy: {}%".format(accuracy * 100))
Exemplo n.º 15
0
def run(
    epochs=50,
    mode="char",  # "char" or "word" or "charword"
    numbats=1000,
    lr=0.1,
    wreg=0.000001,
    bidir=False,
    layers=1,
    encdim=200,
    decdim=200,
    embdim=100,
    negrate=1,
    margin=1.,
    hingeloss=False,
    debug=False,
    preeval=False,
    sumhingeloss=False,
    checkdata=False,  # starts interactive shell for data inspection
    printpreds=False,
    subjpred=False,
    predpred=False,
    specemb=-1,
    balancednegidx=False,
    usetypes=False,
    evalsplits=50,
    relembrep=False,
):
    """Train an attention encoder-decoder ranker over entity/relation
    sequences, evaluate it on the test set and write the results to disk.

    A ``SimpleSeqEncDecAtt`` reads the input sequence and is matched, through
    a ``SeqMatchScore`` with ``GenDotDistance``, against entity encodings
    produced from ``entmat``.  Training uses negative sampling over the gold
    sequence.  After training the test set is decoded with
    ``SeqEncDecRankSearch``, scored with ``FullRankEval``, printed, and saved
    to the first free ``<script>.results/<i>.res`` file (i < 1000).

    Parameters:
        epochs, numbats, lr, wreg: training schedule / adagrad / L2 settings.
        mode: forwarded to ``readdata``; also picks the join character in the
            ``checkdata`` pretty-printer.
        bidir, layers, encdim, decdim: encoder/decoder sizes (halved per
            direction when ``bidir`` is set).
        embdim: input word embedding dimension.
        negrate: number of negative samples per positive.
        margin, hingeloss, sumhingeloss: objective selection; ``sumhingeloss``
            takes precedence over ``hingeloss`` and also disables score
            aggregation in the scorer.
        debug: overrides several settings with small values (see below).
        preeval: also run the evaluation once before training.
        checkdata: open an interactive shell right after loading the data.
        printpreds: forwarded to the decoder as ``debug`` and prints the
            final predictions.
        subjpred, predpred: when exactly one is set, keep only gold column 0
            or 1 respectively.
        specemb: when > 0, wrap the entity encoder in an ``EntEmbEnc`` of
            that size.
        balancednegidx: draw negatives half from below and half from at or
            above ``relstarts``.
        usetypes: forwarded to ``readdata``.
        evalsplits: forwarded to the decoder as ``split``.
        relembrep: wrap the entity encoder in an ``EntEncRep``.

    Raises:
        Exception: if all 1000 result file names are already taken.
    """
    if debug:  # debug settings
        sumhingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 10
        printpreds = True
        whatpred = "all"
        if whatpred == "pred":
            predpred = True
        elif whatpred == "subj":
            subjpred = True
        #preeval = True
        specemb = 100
        margin = 1.
        balancednegidx = True
        evalsplits = 1
        relembrep = True
        #usetypes=True
        #mode = "charword"
        #checkdata = True
    # load the right file
    maskid = -1
    tt = ticktock("script")
    specids = specemb > 0
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, relstarts, canids, wordmat, chardic\
        = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True,
                   usetypes=usetypes, maskid=maskid)
    entmat = entmat.astype("int32")

    #embed()

    # restrict the gold to a single column; list indexing keeps it 2-D
    if subjpred is True and predpred is False:
        traingold = traingold[:, [0]]
        validgold = validgold[:, [0]]
        testgold = testgold[:, [0]]
    if predpred is True and subjpred is False:
        traingold = traingold[:, [1]]
        validgold = validgold[:, [1]]
        testgold = testgold[:, [1]]

    if checkdata:
        # reverse dictionaries and a pretty-printer, for use in the shell
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def p(xids):
            return (" " if mode == "word" else "").join(
                [rwd[xid] if xid > -1 else "" for xid in xids])

        embed()

    print("{} {} {} {}".format(traindata.shape, traingold.shape,
                               testdata.shape, testgold.shape))

    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print("%d words, %d entities" % (numwords, numents))

    # a bidirectional network splits the requested size over both directions
    if bidir:
        encinnerdim = [encdim // 2] * layers
    else:
        encinnerdim = [encdim] * layers

    memembdim = embdim
    memlayers = layers
    membidir = bidir
    if membidir:
        decinnerdim = [decdim // 2] * memlayers
    else:
        decinnerdim = [decdim] * memlayers

    entenc = EntEnc(
        SimpleSeq2Vec(indim=numwords,
                      inpembdim=memembdim,
                      innerdim=decinnerdim,
                      maskid=maskid,
                      bidir=membidir))

    numentembs = len(np.unique(entmat[:, 0]))
    if specids:  # include vectorembedder
        entenc = EntEmbEnc(entenc, numentembs, specemb)
    if relembrep:
        repsplit = entmat[relstarts, 0]
        entenc = EntEncRep(entenc, numentembs, repsplit)

        # adjust params for enc/dec construction
        #encinnerdim[-1] += specemb
        #innerdim[-1] += specemb

    encdec = SimpleSeqEncDecAtt(inpvocsize=numwords,
                                inpembdim=embdim,
                                encdim=encinnerdim,
                                bidir=bidir,
                                outembdim=entenc,
                                decdim=decinnerdim,
                                vecout=True,
                                statetrans="matdot")

    scorerargs = ([encdec, SeqUnroll(entenc)], {
        "argproc": lambda x, y, z: ((x, y), (z, )),
        "scorer": GenDotDistance(decinnerdim[-1], entenc.outdim)
    })
    if sumhingeloss:
        scorerargs[1]["aggregator"] = lambda x: x  # no aggregation of scores
    scorer = SeqMatchScore(*scorerargs[0], **scorerargs[1])

    #scorer.save("scorer.test.save")

    # TODO: below this line, check and test
    class PreProc(object):
        """Maps the shifted and the plain gold ids through ``entmat``."""

        def __init__(self, entmat):
            self.f = PreProcE(entmat)

        def __call__(self, encdata, decsg,
                     decgold):  # gold: idx^(batsize, seqlen)
            return (encdata, self.f(decsg), self.f(decgold)), {}

    class PreProcE(object):
        """Index lookup into a ``Val``-wrapped entity matrix."""

        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            return self.em[x]

    transf = PreProc(entmat)

    class NegIdxGen(object):
        """Corrupts the whole target sequence.

        Without ``midsplit`` (or with ``balancednegidx`` off) ids are drawn
        from ``[0, rng)``, i.e. entities and relations together.  Otherwise
        each position is drawn with equal probability from ``[0, midsplit)``
        or ``[midsplit, rng)``.
        """

        def __init__(self, rng, midsplit=None):
            self.min = 0
            self.max = rng
            self.midsplit = midsplit

        def __call__(self, datas, sgold, gold):
            if self.midsplit is None or not balancednegidx:
                return datas, sgold, np.random.randint(
                    self.min, self.max, gold.shape).astype("int32")
            else:
                entrand = np.random.randint(self.min, self.midsplit,
                                            gold.shape)
                relrand = np.random.randint(self.midsplit, self.max,
                                            gold.shape)
                # 0/1 mask choosing per position between the two draws
                mask = np.random.randint(0, 2, gold.shape)
                ret = entrand * mask + relrand * (1 - mask)
                return datas, sgold, ret.astype("int32")

    # p = score of the positive pair, n = score of the negative pair
    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.inf)
    if sumhingeloss:  #
        obj = lambda p, n: T.sum((n - p + margin).clip(0, np.inf), axis=1)

    traingoldshifted = shiftdata(traingold)
    validgoldshifted = shiftdata(validgold)

    def _decode_and_eval(evaldebug, showpred=False):
        """Decode the test set with the current model, print and return the
        ``FullRankEval`` results."""
        searcher = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg)
        evaluator = FullRankEval()
        pred, _ = searcher.decode(testdata,
                                  testgold.shape[1],
                                  candata=entmat,
                                  canids=canids,
                                  split=evalsplits,
                                  transform=transf.f,
                                  debug=printpreds)
        if showpred:
            print(pred)
        results = evaluator.eval(pred, testgold, debug=evaldebug)
        for k, evalre in results.items():
            print("{}:\t{}".format(k, evalre))
        return results

    #embed()
    # eval
    if preeval:
        tt.tick("pre-evaluating")
        _decode_and_eval(debug)
        tt.tock("pre-evaluated")

    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingoldshifted, traingold]).transform(transf) \
        .negsamplegen(NegIdxGen(numents, midsplit=relstarts)).negrate(negrate).objective(obj) \
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
        .validate_on([validdata, validgoldshifted, validgold]) \
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")

    #scorer.save("scorer.test.save")

    # eval
    tt.tick("evaluating")
    debugarg = "subj" if subjpred else "pred" if predpred else False
    evalres = _decode_and_eval(debugarg, showpred=printpreds)
    tt.tock("evaluated")

    # save: take the first unused <script>.results/<i>.res name
    basename = os.path.splitext(os.path.basename(__file__))[0]
    dirname = basename + ".results"
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    savename = None
    for i in range(1000):
        candidate = "{}/{}.res".format(dirname, i)
        if not os.path.exists(candidate):
            savename = candidate
            break
    if savename is None:
        raise Exception("exceeded number of saved results")
    with open(savename, "w") as f:
        # first line records the command line that produced these results
        f.write("{}\n".format(" ".join(sys.argv)))
        for k, evalre in evalres.items():
            f.write("{}:\t{}\n".format(k, evalre))
Exemplo n.º 16
0
def run(
    negsammode="closest",  # "closest" or "close"; any other value (e.g. "random") builds no sample spaces
    usetypes=True,
    mode="concat",  # "seq" or "concat" or "multi" or "multic" or "bino"
    glove=True,
    embdim=100,
    charencdim=100,
    charembdim=50,
    encdim=400,
    bidir=False,
    layers=1,
    charenc="rnn",  # "cnn" or "rnn"
    margin=0.5,
    lr=0.1,
    numbats=700,
    epochs=15,
    gradnorm=1.0,
    wreg=0.0001,
    loadmodel="no",
    debug=False,
    debugtest=False,
    forcesubjincl=False,
    randsameval=0,
    numtestcans=5,
    multiprune=-1,
    checkdata=False,
    testnegsam=False,
    testmodel=False,
    sepcharembs=False,
):
    """Train (or load) a question -> (subject, relation) scorer, evaluate it, then open a shell.

    Steps, in order:
      1. load data through ``readdata`` and optionally prepend a type matrix
         to the subject matrix;
      2. build the question encoder (left block) and the subject / predicate
         encoders (right block) and wrap them in a ``SeqMatchScore`` that
         uses ``CosineDistance``;
      3. train with negative sampling when ``epochs > 0`` and
         ``loadmodel == "no"``; when ``loadmodel != "no"`` load
         ``"fullrank<loadmodel>.model"`` instead and take the encoders from it;
      4. predict on the test set with ``CustomPredictor`` and print total,
         subject and predicate accuracy;
      5. call ``embed()`` so the local inspection helpers can be used
         interactively (presumably IPython's embed -- confirm against the
         file's imports).

    Args:
        negsammode: "closest" or "close" builds the relation / subject sample
            spaces handed to ``NegIdxGen``; only "closest" additionally hands
            over ``relsperent``. Any other value leaves all of them ``None``.
        usetypes: build a type matrix with ``buildtypmat``, concatenate it in
            front of ``subjmat`` and encode subjects with ``TypedSubjBlock``.
        mode: selects the left block and the decoder dimension; "seq" and
            "multi" use ``encdim``, "concat", "multic" and "bino" use half.
        glove: use ``Glove(embdim).adapt(worddic)`` instead of a fresh
            ``WordEmb``.
        embdim: word embedding dimension.
        charencdim: inner dimension of the character encoder.
        charembdim: character embedding dimension.
        encdim: question encoder dimension (halved when ``bidir``).
        bidir: passed to the question encoder as its ``bidir`` flag.
        layers: number of question encoder layers.
        charenc: "cnn" or "rnn" character encoder.
        margin: margin of the hinge objective.
        lr: learning rate passed to ``adagrad``.
        numbats: number of batches passed to ``train``.
        epochs: number of epochs; training is skipped when ``<= 0``.
        gradnorm: value passed to ``grad_total_norm``.
        wreg: value passed to ``l2``.
        loadmodel: "no", or the id substituted into "fullrank{}.model".
        debug: passed to ``readdata``; also triggers ``embed()`` before
            training.
        debugtest: passed to ``CustomPredictor`` as ``debug``.
        forcesubjincl: append the gold subject to each test candidate list
            that does not contain it.
        randsameval: when ``> 0``, evaluate against the gold id plus this many
            randomly drawn subject and relation ids.
        numtestcans: passed to ``readdata`` (``None`` when ``<= 0``).
        multiprune: passed to ``predictor.predict`` in the non-random path.
        checkdata: call ``embed()`` right after data loading.
        testnegsam: build a ``NegIdxGen`` and call ``embed()``.
        testmodel: call ``embed()`` after the encoders are built.
        sepcharembs: give the subject encoder its own character embedding.

    Raises:
        Exception: for an unrecognized ``mode`` or ``charenc``.
    """
    tt = ticktock("script")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    (subjmat, relmat), (subjdic, reldic), worddic, \
    subjinfo, (testsubjcans, relsperent) = readdata(
        debug=debug,
        numtestcans=numtestcans if numtestcans > 0 else None)

    if usetypes:
        print("building type matrix")
        typmat = buildtypmat(subjmat, subjinfo, worddic)
        # type columns go first; typlen tells TypedSubjBlock where they end
        subjmat = np.concatenate([typmat, subjmat], axis=1)
        typlen = typmat.shape[1]

    relsamplespace = None
    subjsamplespace = None
    if negsammode == "closest" or negsammode == "close":
        relsamplespace, revind = buildrelsamplespace(relmat, worddic)
        subjsamplespace = loadsubjsamplespace()
    tt.tock("data loaded")

    if checkdata:
        embed()

    numwords = max(worddic.values()) + 1
    numsubjs = max(subjdic.values()) + 1
    numrels = max(reldic.values()) + 1
    maskid = -1
    numchars = 256

    nsrelsperent = relsperent if negsammode == "closest" else None

    if testnegsam:
        nig = NegIdxGen(numsubjs - 1,
                        numrels - 1,
                        relclose=relsamplespace,
                        subjclose=subjsamplespace,
                        relsperent=nsrelsperent)
        embed()

    # explicit floor division keeps the dimensions integral
    if mode in ("seq", "multi"):
        decdim = encdim
    elif mode in ("concat", "multic", "bino"):
        decdim = encdim // 2
    else:
        raise Exception("unrecognized mode")

    print("{} mode: {} decdim".format(mode, decdim))

    # defining model
    if glove:
        wordemb = Glove(embdim).adapt(worddic)
    else:
        wordemb = WordEmb(dim=embdim, indim=numwords)

    charemb = VectorEmbed(indim=numchars, dim=charembdim)
    charemb2 = VectorEmbed(indim=numchars, dim=charembdim)
    # NOTE: from here on `charenc` holds the encoder object, not the mode string
    if charenc == "cnn":
        print("using CNN char encoder")
        charenc = CNNSeqEncoder(inpemb=charemb,
                                innerdim=[charencdim] * 2,
                                maskid=maskid,
                                stride=1)
    elif charenc == "rnn":
        print("using RNN char encoder")
        charenc = RNNSeqEncoder(inpemb=charemb, innerdim=charencdim) \
            .maskoptions(maskid, MaskMode.AUTO)
    else:
        raise Exception("no other character encoding modes available")

    if bidir:
        encdim = encdim // 2

    if mode != "bino":
        if mode in ("multi", "multic"):
            wordenc = \
                SimpleSeq2MultiVec(inpemb=False, inpembdim=wordemb.outdim + charencdim,
                                   innerdim=encdim, bidir=bidir, numouts=2, mode="seq")
        else:
            encdim = [encdim] * layers
            wordenc = RNNSeqEncoder(inpemb=False,
                                    inpembdim=wordemb.outdim + charencdim,
                                    innerdim=encdim,
                                    bidir=bidir).maskoptions(MaskMode.NONE)

        question_encoder = TwoLevelEncoder(l1enc=charenc,
                                           l2emb=wordemb,
                                           l2enc=wordenc,
                                           maskid=maskid)

    else:
        question_encoder = BinoEncoder(charenc=charenc,
                                       wordemb=wordemb,
                                       maskid=maskid,
                                       scadim=100,
                                       encdim=encdim // 2,
                                       bidir=bidir,
                                       enclayers=layers,
                                       outdim=decdim,
                                       scabidir=True)

    # encode predicate on word level
    predemb = SimpleSeq2Vec(inpemb=wordemb,
                            innerdim=decdim,
                            maskid=maskid,
                            bidir=False,
                            layers=1)

    #predemb.load(relmat)

    scharemb = charemb2 if sepcharembs else charemb
    if usetypes:
        # encode subj type on word level
        subjtypemb = SimpleSeq2Vec(inpemb=wordemb,
                                   innerdim=int(np.ceil(decdim * 1. / 2)),
                                   maskid=maskid,
                                   bidir=False,
                                   layers=1)
        # encode subject on character level
        charbidir = True
        charencinnerdim = int(np.floor(decdim * 1. / 2))
        charenclayers = 1
        if charbidir:
            charencinnerdim //= 2
            charenclayers = 2
        subjemb = SimpleSeq2Vec(inpemb=scharemb,
                                innerdim=charencinnerdim,
                                maskid=maskid,
                                bidir=charbidir,
                                layers=charenclayers)
        subjemb = TypedSubjBlock(typlen, subjemb, subjtypemb)
    else:
        # encode subject on character level
        subjemb = SimpleSeq2Vec(inpemb=scharemb,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=False,
                                layers=1)
    #subjemb.load(subjmat)
    if testmodel:
        embed()
    # package
    if mode == "seq":
        lb = SeqLeftBlock(question_encoder)
        rb = RightBlock(subjemb, predemb)
    elif mode == "concat":
        lb = ConcatLeftBlock(question_encoder)
        rb = RightBlock(subjemb, predemb)
    elif mode == "multi" or mode == "multic":
        lb = MultiLeftBlock(question_encoder, mode)
        rb = RightBlock(subjemb, predemb)
    elif mode == "bino":
        lb = question_encoder
        rb = RightBlock(subjemb, predemb)
    else:
        raise Exception("unrecognized mode")
    scorer = SeqMatchScore(lb,
                           rb,
                           scorer=CosineDistance(),
                           aggregator=lambda x: x,
                           argproc=lambda x, y, z: ((x, ), (y, z)))

    # hinge objective: sum over axis 1 of max(0, n - p + margin);
    # np.inf is used because the np.infty alias was removed in NumPy 2.0
    obj = lambda p, n: T.sum((n - p + margin).clip(0, np.inf), axis=1)

    class PreProc(object):
        """Training transform: replaces gold indices by rows of the subject / relation matrices."""

        def __init__(self, subjmat, relmat):
            self.ef = PreProcEnt(subjmat)
            self.rf = PreProcEnt(relmat)

        def __call__(self, data, gold):  # gold: idxs-(batsize, 2)
            st = self.ef(gold[:, 0])[0][0]
            rt = self.rf(gold[:, 1])[0][0]
            return (data, st, rt), {}

    class PreProcE(object):
        """Like PreProc, but for an index matrix only (no question data)."""

        def __init__(self, subjmat, relmat):
            self.ef = PreProcEnt(subjmat)
            self.rf = PreProcEnt(relmat)

        def __call__(self, x):
            subjslice = self.ef(x[:, 0])[0][0]
            relslice = self.rf(x[:, 1])[0][0]
            return (subjslice, relslice), {}

    class PreProcEnt(object):
        """Wraps a matrix in ``Val`` and indexes it; returns ``((rows,), {})``."""

        def __init__(self, mat):
            self.entmat = Val(mat)

        def __call__(self, x):
            return (self.entmat[x], ), {}

    transf = PreProc(subjmat, relmat)

    if debug:
        embed()

    if epochs > 0 and loadmodel == "no":
        tt.tick("training")
        # random 4-digit id used in the checkpoint file name
        saveid = "".join([str(np.random.randint(0, 10)) for _ in range(4)])
        print("CHECKPOINTING AS: {}".format(saveid))
        nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \
            .negsamplegen(NegIdxGen(numsubjs-1, numrels-1,
                                    relclose=relsamplespace,
                                    subjclose=subjsamplespace,
                                    relsperent=nsrelsperent)) \
            .objective(obj).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm) \
            .validate_on([validdata, validgold]) \
            .autosavethis(scorer, "fullrank{}.model".format(saveid)) \
            .train(numbats=numbats, epochs=epochs)
        tt.tock("trained").tick()

        # saving
        #scorer.save("fullrank{}.model".format(saveid))
        print("SAVED AS: {}".format(saveid))

    # compare by value: an identity test (`is not`) against a string literal
    # is not reliable for strings that come from outside this module
    if loadmodel != "no":
        tt.tick("loading model")
        m = SeqMatchScore.load("fullrank{}.model".format(loadmodel))
        #embed()
        lb = m.l
        subjemb = m.r.subjenc
        predemb = m.r.predenc
        tt.tock("loaded model")

    # evaluation
    predictor = CustomPredictor(
        questionencoder=lb,
        entityencoder=subjemb,
        relationencoder=predemb,
        #mode=mode,
        enttrans=transf.ef,
        reltrans=transf.rf,
        debug=debugtest,
        subjinfo=subjinfo)

    tt.tick("predicting")
    if forcesubjincl:  # forces the intended subject entity to be among candidates
        for i, cans in enumerate(testsubjcans):
            if testgold[i, 0] not in cans:
                cans.append(testgold[i, 0])

    if randsameval > 0:  # generate random sampling eval data
        testsubjcans = np.random.randint(0, numsubjs,
                                         (testgold.shape[0], randsameval))
        testrelcans = np.random.randint(0, numrels,
                                        (testgold.shape[0], randsameval))
        # gold id goes into the first column of each candidate row
        testsubjcans = np.concatenate([testgold[:, 0:1], testsubjcans], axis=1)
        testrelcans = np.concatenate([testgold[:, 1:2], testrelcans], axis=1)
        testsubjcans = testsubjcans.tolist()
        testrelcans = testrelcans.tolist()
        prediction = predictor.predict(testdata,
                                       entcans=testsubjcans,
                                       relcans=testrelcans)
    else:
        prediction = predictor.predict(testdata,
                                       entcans=testsubjcans,
                                       relsperent=relsperent,
                                       multiprune=multiprune)
    tt.tock("predicted")
    tt.tick("evaluating")
    evalmat = prediction == testgold
    subjacc = np.sum(evalmat[:, 0]) * 1. / evalmat.shape[0]
    predacc = np.sum(evalmat[:, 1]) * 1. / evalmat.shape[0]
    # a row counts as fully correct only when both columns match
    totalacc = np.sum(np.sum(evalmat, axis=1) == 2) * 1. / evalmat.shape[0]
    print("Test results ::::::::::::::::")
    print("Total Acc: \t {}".format(totalacc))
    print("Subj Acc: \t {}".format(subjacc))
    print("Pred Acc: \t {}".format(predacc))
    tt.tock("evaluated")

    def subjinspect(subjrank, gold):
        """Return (description, score) pairs for a subject ranking; ids missing from subjinfo pass through."""
        ret = [
            (("GOLD - " if gold == x else "       ") + subjinfo[x][0] + " (" +
             " ".join(subjinfo[x][1]) + ")" + str(subjinfo[x][3]) + " rels",
             y) if x in subjinfo else (x, y) for x, y in subjrank
        ]
        return ret

    def inspectboth(hidecorrect=False, hidenotincan=False):
        """Walk over test examples applying the hide filters.

        NOTE(review): nothing is printed or returned for the examples that
        pass the filters, so this currently has no visible effect -- it looks
        like an unfinished stub.
        """
        for i in range(len(predictor.subjranks)):
            subjx = testgold[i, 0]
            predx = testgold[i, 1]
            subjrank = predictor.subjranks[i]
            predrank = predictor.relranks[i]
            if hidecorrect and subjx == subjrank[0][0] and predrank[0][
                    0] == predx:
                continue
            if subjx not in [k for k, v in subjrank]:
                if hidenotincan:
                    continue

    def inspectsubjs(hidecorrect=False,
                     hidenotincan=False,
                     shownotincan=False):
        """Print the subject ranking of each test question, pausing via ``inp()``.

        hidecorrect skips questions whose top-ranked subject is the gold one,
        hidenotincan skips questions whose gold subject is not ranked at all,
        shownotincan skips questions whose gold subject is ranked.
        """
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.subjranks)):
            subjx = testgold[i, 0]
            subjrank = predictor.subjranks[i]
            if subjx == subjrank[0][0] and hidecorrect:  # only look for errors
                continue
            goldranked = subjx in [k for k, v in subjrank]
            if hidenotincan and not goldranked:
                continue
            if shownotincan and goldranked:
                continue
            question = wordids2string(testdata[i, :, 0], rwd)
            if subjx in subjinfo:
                goldfields = [
                    subjinfo[subjx][0], subjinfo[subjx][1],
                    subjinfo[subjx][3], subjinfo[subjx][2]
                ]
            else:
                goldfields = ["<UNK>", "<UNK>", "<UNK>", "<UNK>"]
            golddesc = "{} ({}) - {} rels --- {}".format(*goldfields)
            print("test question {}: {} \t GOLD: {}".format(
                i, question, golddesc))
            inspres = subjinspect(subjrank, subjx)
            # separate counter so the question index is not overwritten
            for rank, inspre in enumerate(inspres, 1):
                print("{}:\t{}\t{}".format(rank, inspre[1], inspre[0]))
                if rank % 50 == 0:  # pause after every 50 printed entries
                    inp()
            inp()

    def inspectpreds(hidecorrect=False):
        """Print the relation ranking of each test question, pausing via ``inp()``.

        hidecorrect skips questions whose top-ranked relation is the gold one.
        """
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.relranks)):
            relx = testgold[i, 1]
            subjx = testgold[i, 0]
            relrank = predictor.relranks[i]
            if relx == relrank[0][0] and hidecorrect:
                continue
            print("test question {}: {} \t GOLD: {}".format(
                i, wordids2string(testdata[i, :, 0], rwd),
                wordids2string(relmat[relx, :], rwd)))
            inspres = [(("GOLD - " if relx == x else "        ") +
                        wordids2string(relmat[x], rwd), y) for x, y in relrank]
            # separate counter so the question index is not overwritten
            for rank, inspre in enumerate(inspres, 1):
                print("{}:\t{}\t{}".format(rank, inspre[1], inspre[0]))
                if rank % 50 == 0:  # pause after every 50 printed entries
                    inp()
            inp()

    # leave the user in a shell with all locals above available
    embed()