Exemplo n.º 1
0
 def test_shape(self):
     """CosineDistance between a (batsize, ldim) left input and a
     (batsize, seqlen, rdim) right input should produce one score per
     sequence position, each strictly below 1."""
     batsize = 10
     seqlen = 3
     ldim = 5
     rdim = 5
     l = np.random.random((batsize, ldim))
     r = np.random.random((batsize, seqlen, rdim))
     b = CosineDistance()
     # _extra_outs additionally returns the intermediate norm tensors
     pred, extra = b.predict(l, r, _extra_outs=["lnorms", "rnorms"])
     print extra
     print pred
     self.assertEqual(pred.shape, (batsize, seqlen))
     # cosine similarity of distinct random vectors stays strictly under 1
     self.assertTrue(np.all((pred - np.ones_like(pred)) < 0))
Exemplo n.º 2
0
 def test_shapes(self):
     """AttGen over a (batsize, dim) criterion and a (batsize, seqlen, dim)
     data tensor should yield attention weights of shape (batsize, seqlen)
     that sum to 1 over the sequence axis."""
     batsize, seqlen = 100, 7
     criterionshape = (batsize, 10)
     datashape = (batsize, seqlen, 10)
     attgen = AttGen(CosineDistance())
     # generate data
     criterion = np.random.random(criterionshape)
     data = np.random.random(datashape)
     # predict and test
     pred = attgen.predict(criterion, data)
     self.assertEqual(pred.shape, (batsize, seqlen))
     # attention weights must be a distribution per example
     self.assertTrue(np.allclose(pred.sum(axis=1), np.ones((pred.shape[0],))))
Exemplo n.º 3
0
    def test_ns_training(self):
        """End-to-end negative-sampling training smoke test.

        Trains a MatchScore between Glove embeddings and a fresh
        VectorEmbed on an identity-matching task, then checks that
        ranking metrics (MRR, recall@k) are high and that the final
        stored validation metrics agree with a test-time evaluation.
        """
        num = 2000
        self.expshape = (num, 50)
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        self.glove = Glove(self.expshape[1], self.expshape[0])
        self.cemb = VectorEmbed(indim=self.expshape[0] + 1,
                                dim=self.expshape[1])
        # out-of-vocabulary indices must raise
        self.assertRaises(Exception, self.glove.block.predict, [num + 1])
        self.assertRaises(Exception, self.cemb.predict, [num + 1])

        m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
        mg = MatchScore(self.glove.block,
                        self.glove.block)  # TODO factor out matchscore tests
        idxs = np.arange(num + 1)

        # glove against glove
        # score of a vector against itself equals its squared L2 norm
        self.assertTrue(
            np.allclose(mg.predict([num, 100], [num, 100]), [
                np.linalg.norm(self.glove % num)**2,
                np.linalg.norm(self.glove % 100)**2
            ]))

        class NegIdxGen():
            # draws random negative indices in [0, n) shaped like gold
            def __init__(self, num):
                self.n = num

            def __call__(self, l, r):
                return l, np.random.randint(0, self.n, r.shape)

        vdata = np.arange(num)
        negrate = 5

        def obj(p, n):
            # ranking objective: negative score minus positive score
            return n - p
        m, err, verr, _, _ = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(negrate)\
            .adagrad(lr=0.1).objective(obj) \
            .validate_on([vdata, vdata]).extvalid(geteval(m.predict, num, negrate)).validinter(30) \
            .train(numbats=50, epochs=29, returnerrors=True)
        #.writeresultstofile("testingresultswriter.tsv") \

        tdata = np.arange(num)
        tt = ticktock("eval")
        tt.tick()
        mrr, recat1, recat10 = geteval(m.predict, num, 1)(tdata)
        tt.tock("evaluated test data")
        print "%.4f MRR,\t%.4f MR@10,\t%.4f MR@1" % (mrr, recat10, recat1)
        self.assertGreater(mrr, 0.85)
        self.assertGreater(recat10, 0.9)
        print verr
        # last validation record must match the test-time evaluation
        self.assertTrue(
            np.allclose(np.asarray([mrr, recat1, recat10]),
                        np.asarray(verr[-1][1:])))
Exemplo n.º 4
0
    def test_seq_scoring(self):
        """SeqMatchScore of a sequence against itself with CosineDistance
        should score 1.0 per position, summing to seqlen per example."""
        vocsize = 100
        dim = 10
        numsam = 17
        seqlen = 5
        ve = VectorEmbed(vocsize, dim)
        # same embedder on both sides -> identical vectors per position
        m = SeqMatchScore(SeqUnroll(ve), SeqUnroll(ve), scorer=CosineDistance())

        data = np.random.randint(0, vocsize, (numsam, seqlen))
        #print data.shape
        pred = m.predict(data, data)
        #print pred
        # cosine(x, x) == 1 at every position, aggregated over seqlen positions
        self.assertTrue(np.allclose(np.ones_like(pred)*seqlen*1., pred))
Exemplo n.º 5
0
 def test_mask(self):
     """Attention weights must be exactly zero at masked positions and
     nonzero elsewhere."""
     batsize, seqlen = 100, 7
     criterionshape = (batsize, 10)
     datashape = (batsize, seqlen, 10)
     attgen = AttGen(CosineDistance())
     # generate data
     criterion = np.random.random(criterionshape)
     data = np.random.random(datashape)
     mask = np.ones((batsize, seqlen))
     # each row keeps at least 2 leading positions, masks the tail
     maskids = np.random.randint(2, seqlen+1, (batsize,))
     for i in range(maskids.shape[0]):
         mask[i, maskids[i]:] = 0
     # predict and test
     pred = attgen.predict(criterion, data, mask)
     # positions with nonzero attention must coincide with the mask
     maskthrough = np.not_equal(pred, 0)
     self.assertTrue(np.all(maskthrough == mask))
Exemplo n.º 6
0
    def __init__(self,
                 inpvocsize=None,
                 inpembdim=None,
                 inpemb=None,
                 inpencinnerdim=None,
                 bidir=False,
                 maskid=None,
                 dropout=False,
                 rnu=GRU,
                 inpencoder=None,
                 memvocsize=None,
                 memembdim=None,
                 memembmat=None,
                 memencinnerdim=None,
                 memencoder=None,
                 inp_att_dist=CosineDistance(),
                 mem_att_dist=CosineDistance(),
                 inp_attention=None,
                 mem_attention=None,
                 coredims=None,
                 corernu=GRU,
                 core=None,
                 explicit_interface=False,
                 scalaraggdim=None,
                 write_value_dim=None,
                 nsteps=100,
                 posvecdim=None,
                 mem_pos_repr=None,
                 inp_pos_repr=None,
                 inp_addr_extractor=None,
                 mem_addr_extractor=None,
                 write_addr_extractor=None,
                 write_addr_generator=None,
                 write_addr_dist=CosineDistance(),
                 write_value_generator=None,
                 write_value_extractor=None,
                 mem_erase_generator=None,
                 mem_change_generator=None,
                 memsampler=None,
                 memsamplemethod=None,
                 memsampletemp=0.3,
                 **kw):
        """Builds all sub-blocks of the bulk NN and hands them to the parent.

        Every component can be passed in ready-made; any component left as
        None is constructed from the corresponding dimension arguments.
        Construction stages: input encoder, memory embedding/encoder,
        optional position vectors, core RNN, attentions, write-value
        generator, optional memory sampler, and finally the state
        interfaces (either learned Forward extractors or, with
        ``explicit_interface=True``, fixed vector slicers).

        NOTE(review): the distance defaults (``CosineDistance()``) are
        evaluated once at def-time and therefore shared across all
        instances — presumably these objects are stateless; confirm.

        BUGFIX: the ``mem_addr_extractor is None`` branch previously
        assigned its Forward block to ``inp_addr_extractor``, leaving the
        memory address extractor unset and clobbering the input one with
        memory dimensions; it now assigns to ``mem_addr_extractor``.
        """

        # INPUT ENCODING
        if inpencoder is None:
            inpencoder = SeqEncoder.RNN(indim=inpvocsize,
                                        inpembdim=inpembdim,
                                        inpemb=inpemb,
                                        innerdim=inpencinnerdim,
                                        bidir=bidir,
                                        maskid=maskid,
                                        dropout_in=dropout,
                                        dropout_h=dropout,
                                        rnu=rnu).all_outputs()
            # last layer's width; innerdim may be a single int or a list
            lastinpdim = inpencinnerdim if not issequence(
                inpencinnerdim) else inpencinnerdim[-1]
        else:
            lastinpdim = inpencoder.block.layers[-1].innerdim

        # MEMORY ENCODING
        if memembmat is None:
            memembmat = param((memvocsize, memembdim),
                              name="memembmat").glorotuniform()
        if memencoder is None:
            memencoder = SeqEncoder.RNN(inpemb=False,
                                        innerdim=memencinnerdim,
                                        bidir=bidir,
                                        dropout_in=dropout,
                                        dropout_h=dropout,
                                        rnu=rnu,
                                        inpembdim=memembdim).all_outputs()
            lastmemdim = memencinnerdim if not issequence(
                memencinnerdim) else memencinnerdim[-1]
        else:
            lastmemdim = memencoder.block.layers[-1].innerdim

        # POSITION VECTORS
        if posvecdim is not None and inp_pos_repr is None:
            inp_pos_repr = RNNWithoutInput(posvecdim, dropout=dropout)
        if posvecdim is not None and mem_pos_repr is None:
            mem_pos_repr = RNNWithoutInput(posvecdim, dropout=dropout)

        xtra_dim = posvecdim if posvecdim is not None else 0
        # CORE RNN - THE THINKER
        if core is None:
            # core consumes input read + memory read (+ two position vecs)
            corelayers, _ = MakeRNU.fromdims(
                [lastinpdim + lastmemdim + xtra_dim * 2] + coredims,
                rnu=corernu,
                dropout_in=dropout,
                dropout_h=dropout,
                param_init_states=True)
            core = RecStack(*corelayers)

        lastcoredim = core.get_statespec()[-1][0][1][0]

        # ATTENTIONS
        if mem_attention is None:
            mem_attention = Attention(mem_att_dist)
        if inp_attention is None:
            inp_attention = Attention(inp_att_dist)
        if write_addr_generator is None:
            write_addr_generator = AttGen(write_addr_dist)

        # WRITE VALUE
        if write_value_generator is None:
            write_value_generator = WriteValGenerator(write_value_dim,
                                                      memvocsize,
                                                      dropout=dropout)

        # MEMORY SAMPLER: explicit sampler and sample method are exclusive
        if memsampler is not None:
            assert (memsamplemethod is None)
        if memsamplemethod is not None:
            assert (memsampler is None)
            memsampler = GumbelSoftmax(temperature=memsampletemp)

        ################ STATE INTERFACES #################

        if not explicit_interface:
            # learned projections from core state to each interface
            if inp_addr_extractor is None:
                inp_addr_extractor = Forward(lastcoredim,
                                             lastinpdim + xtra_dim,
                                             dropout=dropout)
            if mem_addr_extractor is None:
                # was: inp_addr_extractor = Forward(...)  -- copy-paste bug
                mem_addr_extractor = Forward(lastcoredim,
                                             lastmemdim + xtra_dim,
                                             dropout=dropout)

            # WRITE INTERFACE
            if write_addr_extractor is None:
                write_addr_extractor = Forward(lastcoredim,
                                               lastmemdim + xtra_dim,
                                               dropout=dropout)
            if write_value_extractor is None:
                write_value_extractor = Forward(lastcoredim,
                                                write_value_dim,
                                                dropout=dropout)

            # MEM UPDATE INTERFACE
            if mem_erase_generator is None:
                mem_erase_generator = StateToScalar(lastcoredim, scalaraggdim)
            if mem_change_generator is None:
                mem_change_generator = StateToScalar(lastcoredim, scalaraggdim)
        else:
            # explicit interface: slice fixed regions out of the core state
            inp_addr_extractor, mem_addr_extractor, write_addr_extractor, \
            write_value_extractor, mem_erase_generator, mem_change_generator = \
                make_vector_slicers(0, lastinpdim + xtra_dim, lastmemdim + xtra_dim,
                                    lastmemdim + xtra_dim, write_value_dim, 1, 1)

        super(SimpleBulkNN,
              self).__init__(inpencoder=inpencoder,
                             memembmat=memembmat,
                             memencoder=memencoder,
                             inp_attention=inp_attention,
                             mem_attention=mem_attention,
                             core=core,
                             memsampler=memsampler,
                             nsteps=nsteps,
                             inp_addr_extractor=inp_addr_extractor,
                             mem_addr_extractor=mem_addr_extractor,
                             write_addr_extractor=write_addr_extractor,
                             write_addr_generator=write_addr_generator,
                             mem_erase_generator=mem_erase_generator,
                             mem_change_generator=mem_change_generator,
                             write_value_generator=write_value_generator,
                             write_value_extractor=write_value_extractor,
                             inp_pos_repr=inp_pos_repr,
                             mem_pos_repr=mem_pos_repr,
                             **kw)
Exemplo n.º 7
0
def run(
    epochs=50,
    mode="char",  # "char" or "word" or "charword"
    numbats=1000,
    lr=0.1,
    wreg=0.000001,
    bidir=False,
    layers=1,
    encdim=200,
    decdim=200,
    embdim=100,
    negrate=1,
    margin=1.,
    hingeloss=False,
    debug=False,
    preeval=False,
    sumhingeloss=False,
    checkdata=False,  # starts interactive shell for data inspection
    printpreds=False,
    subjpred=False,
    predpred=False,
    specemb=-1,
    usetypes=False,
    evalsplits=50,
    cosine=False,
    loadmodel=False,
):
    """Trains (or loads) a SeqMatchScore ranking model over question
    encodings and entity/relation encodings, evaluates it with a full
    candidate ranking, and appends the results to a per-script results
    directory.

    The flow is: read data -> build encoders -> (optional pre-eval) ->
    negative-sampling training (unless ``loadmodel``) -> full-rank eval
    -> write metrics to the first free ``<script>.results/<i>.res`` file.
    """
    if debug:  # debug settings
        sumhingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 10
        printpreds = True
        whatpred = "all"
        if whatpred == "pred":
            predpred = True
        elif whatpred == "subj":
            subjpred = True
        preeval = True
        #specemb = 100
        margin = 1.
        evalsplits = 1
        #usetypes=True
        #mode = "charword"
        #checkdata = True
    # load the right file
    maskid = -1
    tt = ticktock("script")
    specids = specemb > 0
    tt.tick()
    # NOTE(review): readdata is called with specids=True regardless of the
    # ``specids`` flag computed above — confirm this is intentional
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, relstarts, canids, wordmat, chardic\
        = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True,
                   usetypes=usetypes, maskid=maskid)
    entmat = entmat.astype("int32")

    if checkdata:
        # reverse dictionaries for interactive inspection via embed()
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def p(xids):
            # pretty-print a row of word ids (maskid entries become "")
            return (" " if mode == "word" else "").join(
                [rwd[xid] if xid > -1 else "" for xid in xids])

        embed()

    print traindata.shape, traingold.shape, testdata.shape, testgold.shape

    tt.tock("data loaded")

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    # bidirectional encoders split the width over the two directions
    if bidir:
        encinnerdim = [encdim / 2] * layers
    else:
        encinnerdim = [encdim] * layers

    memembdim = embdim
    memlayers = layers
    membidir = bidir
    if membidir:
        decinnerdim = [decdim / 2] * memlayers
    else:
        decinnerdim = [decdim] * memlayers

    emb = VectorEmbed(numwords, embdim)

    # entity-side encoder: encodes an entity's word sequence to a vector
    subjenc = EntEnc(
        SimpleSeq2Vec(invocsize=numwords,
                      inpembdim=embdim,
                      innerdim=decinnerdim,
                      maskid=maskid,
                      bidir=membidir))

    numentembs = len(np.unique(entmat[:, 0]))
    # first row index where relations start in entmat
    repsplit = entmat[relstarts, 0]
    if specids:  # include vectorembedder
        subjenc = EntEmbEnc(subjenc, numentembs, specemb)
    # relations get a plain embedding matching the subject encoder's outdim
    predenc = VectorEmbed(indim=numents - relstarts + 1,
                          dim=subjenc.outdim,
                          init="zero")
    entenc = CustomEntEnc(subjenc, predenc, repsplit)

    # question-side encoder producing a (subject, predicate) pair encoding
    inpenc = CustomSeq2Pair(inpemb=emb,
                            encdim=encinnerdim,
                            scadim=encinnerdim,
                            enclayers=layers,
                            scalayers=layers,
                            bidir=bidir,
                            maskid=maskid,
                            outdim=subjenc.outdim)

    # adjust params for enc/dec construction
    # encinnerdim[-1] += specemb
    # innerdim[-1] += specemb

    dist = DotDistance() if not cosine else CosineDistance()
    scorerkwargs = {"argproc": lambda x, y: ((x, ), (y, )), "scorer": dist}
    if sumhingeloss:
        scorerkwargs["aggregator"] = lambda x: x  # no aggregation of scores
    scorer = SeqMatchScore(inpenc, entenc, **scorerkwargs)

    class PreProc(object):
        # maps (question idxs, gold entity idxs) to their matrix rows
        def __init__(self, entmat, wordmat=None):
            self.f = PreProcE(entmat)
            self.w = PreProcL(wordmat) if wordmat is not None else wordmat

        def __call__(self, encdata, decgold):  # gold: idx^(batsize, seqlen)
            if self.w is not None:
                encdata = self.w(encdata)[0][0]
            if self.f is not None:
                decgold = self.f(decgold)[0][0]
            return (encdata, decgold), {}

    class PreProcE(object):
        # entity idx -> entity word-id row (symbolic lookup into entmat)
        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    class PreProcL(object):
        # word idx -> word row (symbolic lookup into wordmat)
        def __init__(self, wordmat):
            self.em = Val(wordmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    transf = PreProc(entmat)

    class NegIdxGen(object):
        # samples a (random entity, random relation) pair as the negative
        def __init__(self, rng, midsplit):
            self.min = 0
            self.max = rng
            self.midsplit = midsplit

        def __call__(self, datas, gold):
            entrand = np.random.randint(self.min, self.midsplit,
                                        (gold.shape[0], 1))
            relrand = np.random.randint(self.midsplit, self.max,
                                        (gold.shape[0], 1))
            ret = np.concatenate([entrand, relrand], axis=1)
            return datas, ret.astype("int32")

    #embed()

    # ranking objectives: plain difference, hinge, or per-position summed hinge
    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    if sumhingeloss:  #
        obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1)

    # embed()
    # eval
    if preeval:
        # evaluate the untrained model as a baseline
        tt.tick("pre-evaluating")
        s = CustomRankSearch(inpenc,
                             entenc,
                             scorer.s,
                             scorer.agg,
                             relstarts=relstarts)
        eval = FullRankEval()
        pred, scores = s.search(testdata,
                                testgold.shape[1],
                                candata=entmat,
                                canids=canids,
                                split=evalsplits,
                                transform=transf.f,
                                debug=printpreds)
        evalres = eval.eval(pred, testgold, debug=debug)
        for k, evalre in evalres.items():
            print("{}:\t{}".format(k, evalre))
        tt.tock("pre-evaluated")

    if not loadmodel:
        tt.tick("training")
        nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \
            .negsamplegen(NegIdxGen(numents, relstarts)).negrate(negrate).objective(obj) \
            .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
            .validate_on([validdata, validgold]) \
            .train(numbats=numbats, epochs=epochs)
        tt.tock("trained")
        scorer.save("customfullrank.scorer.save")
    else:
        scorer = SeqMatchScore.load("customfullrank.scorer.save")

    # eval
    tt.tick("evaluating")

    s = CustomRankSearch(inpenc,
                         entenc,
                         scorer.s,
                         scorer.agg,
                         relstarts=relstarts)
    eval = FullRankEval()
    pred, scores = s.search(testdata,
                            testgold.shape[1],
                            candata=entmat,
                            canids=canids,
                            split=evalsplits,
                            transform=transf.f,
                            debug=printpreds)
    if printpreds:
        print pred
    debugarg = "subj" if subjpred else "pred" if predpred else False
    evalres = eval.eval(pred, testgold, debug=debugarg)
    for k, evalre in evalres.items():
        print("{}:\t{}".format(k, evalre))
    tt.tock("evaluated")

    # save results under <scriptname>.results/<first free index>.res
    basename = os.path.splitext(os.path.basename(__file__))[0]
    dirname = basename + ".results"
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    savenamegen = lambda i: "{}/{}.res".format(dirname, i)
    savename = None
    for i in xrange(1000):
        savename = savenamegen(i)
        if not os.path.exists(savename):
            break
        savename = None
    if savename is None:
        raise Exception("exceeded number of saved results")
    with open(savename, "w") as f:
        # record the command line, then one metric per line
        f.write("{}\n".format(" ".join(sys.argv)))
        for k, evalre in evalres.items():
            f.write("{}:\t{}\n".format(k, evalre))
Exemplo n.º 8
0
def run(
    epochs=50,
    mode="char",  # "char" or "word" or "charword"
    numbats=1000,
    lr=0.1,
    wreg=0.000001,
    bidir=False,
    layers=1,
    encdim=200,
    decdim=200,
    embdim=100,
    negrate=1,
    margin=1.,
    hingeloss=False,
    debug=False,
    preeval=False,
    sumhingeloss=False,
    checkdata=False,  # starts interactive shell for data inspection
    printpreds=False,
    subjpred=False,
    predpred=False,
    specemb=-1,
    usetypes=False,
    evalsplits=50,
    cosine=False,
    loadmodel=False,
):
    """Predicate-prediction variant: trains a MatchScore between question
    encodings and relation embeddings with negative sampling, then scores
    every relation candidate for each test question and reports accuracy,
    MRR and recall@k.

    Gold labels are remapped to relation-local ids (``gold[:, 1] -
    relstarts``) right after loading.
    """
    if debug:  # debug settings
        hingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 1
        printpreds = True
        preeval = True
        # specemb = 100
        margin = 1.
        evalsplits = 1
        # usetypes=True
        mode = "charword"
        # checkdata = True

    # load the right file
    maskid = -1
    tt = ticktock("script")
    specids = specemb > 0
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, relstarts, canids, wordmat, chardic \
        = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True,
                   usetypes=usetypes, maskid=maskid)
    entmat = entmat.astype("int32")
    # transform for predpred: keep only the relation column, relation-local ids
    traingold = traingold[:, 1] - relstarts
    validgold = validgold[:, 1] - relstarts
    testgold = testgold[:, 1] - relstarts

    if checkdata:
        # reverse dictionaries for interactive inspection via embed()
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def p(xids):
            # pretty-print a row of word ids (maskid entries become "")
            return (" " if mode == "word" else "").join(
                [rwd[xid] if xid > -1 else "" for xid in xids])

        embed()

    print traindata.shape, traingold.shape, testdata.shape, testgold.shape

    tt.tock("data loaded")

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    # bidirectional encoders split the width over the two directions
    if bidir:
        encinnerdim = [encdim / 2] * layers
    else:
        encinnerdim = [encdim] * layers

    memembdim = embdim
    memlayers = layers
    membidir = bidir
    if membidir:
        decinnerdim = [decdim / 2] * memlayers
    else:
        decinnerdim = [decdim] * memlayers

    emb = VectorEmbed(numwords, embdim)
    # one embedding per relation (relation-local id space)
    predemb = VectorEmbed(numents - relstarts + 1, decdim, init="uniform")
    inpenc = SimpleSeq2Vec(inpemb=emb,
                           inpembdim=emb.outdim,
                           innerdim=encinnerdim,
                           maskid=maskid,
                           bidir=bidir,
                           layers=layers)

    dist = DotDistance() if not cosine else CosineDistance()
    scorerkwargs = {"argproc": lambda x, y: ((x, ), (y, )), "scorer": dist}
    scorer = MatchScore(inpenc, predemb, **scorerkwargs)

    class PreProc(object):
        # maps (question idxs, gold idxs) to their matrix rows
        def __init__(self, entmat, wordmat=None):
            self.f = PreProcE(entmat)
            self.w = PreProcL(wordmat) if wordmat is not None else wordmat

        def __call__(self, encdata, decgold):  # gold: idx^(batsize, seqlen)
            if self.w is not None:
                encdata = self.w(encdata)[0][0]
            if self.f is not None:
                decgold = self.f(decgold)[0][0]
            return (encdata, decgold), {}

    class PreProcE(object):
        # entity idx -> entity word-id row (symbolic lookup into entmat)
        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    class PreProcL(object):
        # word idx -> word row (symbolic lookup into wordmat)
        def __init__(self, wordmat):
            self.em = Val(wordmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    transf = PreProc(entmat)

    class NegIdxGen(object):
        # samples a uniformly random relation id as the negative
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            predrand = np.random.randint(self.min, self.max, (gold.shape[0], ))
            return datas, predrand.astype("int32")

    # embed()

    # ranking objective: plain score difference or hinge with margin
    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)

    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingold]) \
        .negsamplegen(NegIdxGen(numents - relstarts))\
        .negrate(negrate).objective(obj) \
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
        .validate_on([validdata, validgold]) \
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")

    # eval: score every relation candidate against every test question
    canids = np.arange(start=0, stop=numents - relstarts)
    predembs = predemb.predict(canids)  # (numrels, embdim)
    tt.tick("evaluating")
    predencs = inpenc.predict(testdata)  # (batsize, embdim)
    scores = np.zeros((predencs.shape[0], predembs.shape[0]))
    for i in range(predencs.shape[0]):
        # broadcast question i against all relation embeddings at once
        scores[i, :] = \
            scorer.s.predict(np.repeat(predencs[np.newaxis, i],
                                       predembs.shape[0], axis=0),
                             predembs)
        tt.progress(i, predencs.shape[0], live=True)
    best = np.argmax(scores, axis=1)
    # per question, relation ids sorted by descending score
    sortedbest = [
        sorted(zip(np.arange(scores.shape[1]), list(scores[i])),
               reverse=True,
               key=lambda (x, y): y) for i in range(scores.shape[0])
    ]
    sortedbestmat = np.array([[x for (x, y) in z] for z in sortedbest],
                             dtype="int32")
    # MRR
    mrr = 0.0
    for i in range(sortedbestmat.shape[1]):
        mrr += np.sum(sortedbestmat[:, i] == testgold) * 1. / (i + 1)
    mrr /= testgold.shape[0]
    # Accuracy
    accuracy = np.sum(best == testgold) * 1. / testgold.shape[0]

    # R@X
    def ratx(ratnum):
        return rat(ratnum, sortedbestmat, testgold)

    def rat(ratnum, sortedpred, gold):
        # fraction of questions whose gold id appears in the top ratnum
        # NOTE(review): bound uses the outer sortedbestmat rather than the
        # sortedpred parameter — identical as called here, but verify
        acc = 0.0
        for i in range(min(ratnum, sortedbestmat.shape[1])):
            acc += 1.0 * np.sum(sortedpred[:, i] == gold)
        acc /= testgold.shape[0]
        return acc

    print "Accuracy: {}%".format(accuracy * 100)
    print "MRR: {}".format(mrr)
    print "Recall: @10: {}%\t @50: {}%\t @100: {}%".format(
        ratx(10) * 100,
        ratx(50) * 100,
        ratx(100) * 100)
    embed()

    tt.tock("evaluated")
Exemplo n.º 9
0
def run(
    epochs=10,
    numbats=100,
    negrate=1,
    lr=0.1,
    embdim=50,
    encdim=50,
    wreg=0.00005,
    marginloss=False,
    margin=1.,
    cosine=False,
    bidir=False,
):
    """Trains a character-level word encoder to reproduce Glove word
    vectors via negative-sampling match scoring, then saves the encoder
    and its character dictionary to disk."""
    tt = ticktock("script")
    # get glove words
    g = Glove(encdim)
    words = g.D.keys()
    maxwordlen = 0
    for word in words:
        maxwordlen = max(maxwordlen, len(word))
    chars = set("".join(words))
    chars.add(" ")
    print "{} words, maxlen {}, {} characters in words".format(
        len(words), maxwordlen, len(chars))
    # get char word matrix
    chardic = dict(zip(chars, range(len(chars))))
    pickle.dump(chardic, open("glove2c2w.chardic.pkl", "w"))
    # -1 padding; row 0 is reserved for the blank (space) pseudo-word
    charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32")
    charwordmat[0, 0] = chardic[" "]
    for i in range(0, len(words)):
        word = words[i]
        charwordmat[i + 1, :len(word)] = [chardic[x] for x in word]
    print charwordmat[0]
    # encode characters
    cwenc = SimpleSeq2Vec(indim=len(chars),
                          inpembdim=embdim,
                          innerdim=encdim / 2 if bidir else encdim,
                          maskid=-1,
                          bidir=bidir)
    dist = CosineDistance() if cosine else EuclideanDistance()  #DotDistance()
    print "using " + str(dist)
    scorer = MatchScore(cwenc, g.block, scorer=dist)
    '''
    scorer.train([charwordmat, np.arange(len(words)+1)], np.ones((charwordmat.shape[0],), dtype="int32") * (-1 if cosine else 1))\
        .linear_objective().adagrad(lr=lr).l2(wreg)\
        .train(numbats=numbats, epochs=epochs)

    #embed()
    '''
    class NegIdxGen(object):
        # samples a uniformly random word index as the negative
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            return datas, np.random.randint(self.min, self.max,
                                            gold.shape).astype("int32")

    # ranking objective: hinge with margin, or plain score difference
    if marginloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        obj = lambda p, n: n - p

    nscorer = scorer.nstrain([charwordmat, np.arange(len(words)+1)])\
        .negsamplegen(NegIdxGen(len(words))).negrate(negrate)\
        .objective(obj).adagrad(lr=lr).l2(wreg)\
        .train(numbats=numbats, epochs=epochs)

    cwenc.save("glove2c2w.block")
Exemplo n.º 10
0
def run(epochs=50,
        numbats=700,
        lr=1.,
        wreg=0.000001,
        bidir=False,
        layers=1,
        embdim=200,
        encdim=400,
        decdim=400,
        negrate=1,
        margin=1.,
        hingeloss=False,
        debug=False,
        checkdata=False,
        predencode=False,
        closenegsam=False,
        glove=False,
        atleastcan=0,
        wordchar=False,
        charencmode="rnn",  # rnn or cnn
        totalrandomtest=False,
        rarewords=0,
        ):
    """Train and evaluate a question-to-predicate matching model.

    Encodes questions (word-level, optionally word+char level) and
    predicates into a shared vector space, trains a MatchScore with
    negative sampling, then scores candidate predicates per test
    question and prints accuracy of the top-ranked candidate.

    Notable flags:
        predencode  - encode predicates from their word sequences
                      (MemVec over SimpleSeq2Vec) instead of a plain
                      VectorEmbed lookup.
        closenegsam - sample negatives from a precomputed "close"
                      sample space rather than uniformly.
        hingeloss   - use margin hinge objective instead of raw n - p.
        rarewords   - replace words seen <= rarewords times by <RARE>.
        atleastcan  - pad candidate lists with random entities up to
                      this size at evaluation time.
    """
    maskid = -1
    tt = ticktock("predpred")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, testsubjsrels = readdata(wordchar=wordchar)

    if closenegsam:
        # sample space mapping gold idx -> set of "close" negative candidates
        revsamplespace, revind = buildsamplespace(entmat, worddic)

    tt.tock("data loaded")
    if checkdata:
        # drop into IPython with reverse dictionaries for manual inspection
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}
        def pp(widxs):
            print " ".join([rwd[x] if x in rwd else "" for x in widxs])
        embed()

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1

    if rarewords > 0:
        rwd = {v: k for k, v in worddic.items()}
        print "doing rare words"
        # count word occurrences in training data + entity matrix,
        # keep only words seen more than `rarewords` times
        trainwordcounts = getmatrixvaluecounts(traindata, entmat)
        stwc = sorted(trainwordcounts.items(), key=lambda (x, y): y, reverse=True)
        fstwc = filter(lambda (x, y): y > rarewords, stwc)
        # reduced word dic; index 0 reserved for the <RARE> token
        redwdic = dict(zip([rwd[k] for k, v in fstwc if k != maskid and k in rwd],
                           range(1, len(fstwc)+1)))
        redwdic["<RARE>"] = 0
        #embed()
    if bidir:
        # halve per-direction size so total encoder dim stays `encdim`
        encdim = [encdim / 2] * layers
    else:
        encdim = [encdim] * layers

    # question-side model
    if glove:
        if rarewords > 0:
            raise Exception("glove with rare words currently not supported")
        wordemb = Glove(embdim).adapt(worddic)
    else:
        if rarewords > 0:
            # embed over the reduced vocabulary, adapted to the full worddic
            wordemb = WordEmb(dim=embdim, worddic=redwdic).adapt(worddic)
            #embed()
        else:
            wordemb = WordEmb(dim=embdim, worddic=worddic)
    if wordchar:
        print "wordchar model"
        numchars = 256
        if charencmode == "cnn":
            print "using CNN char encoder"
            charenc = CNNSeqEncoder(indim=numchars, inpembdim=50, innerdim=[embdim]*2,
                                    maskid=maskid, stride=1)
            wordenc = RNNSeqEncoder(inpemb=False, inpembdim=wordemb.outdim+embdim,
                                    innerdim=encdim, bidir=bidir).maskoptions(MaskMode.NONE)
            question_enc = TwoLevelEncoder(l1enc=charenc, l2emb=wordemb,
                                           l2enc=wordenc, maskid=maskid)
        else:
            question_enc = WordCharSentEnc(numchars=256, charembdim=50, charinnerdim=embdim,
                                           wordemb=wordemb, wordinnerdim=encdim, maskid=maskid,
                                           bidir=bidir)
    else:
        question_enc = SimpleSeq2Vec(inpemb=wordemb,
                                     inpembdim=wordemb.outdim,
                                     innerdim=encdim,
                                     maskid=maskid,
                                     bidir=bidir,
                                     layers=layers)

    # predicate-side model
    if predencode:
        # encode each predicate from its word sequence (rows of entmat)
        predemb = MemVec(SimpleSeq2Vec(inpemb=wordemb,
                                inpembdim=wordemb.outdim,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=bidir,
                                layers=layers)
                         )
        predemb.load(entmat)
        """
        predemb = SimpleSeq2Vec(inpemb=wordemb,
                                inpembdim=wordemb.outdim,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=bidir,
                                layers=layers)

        class PreProc(object):
            def __init__(self, entmat):
                self.f = PreProcE(entmat)

            def __call__(self, encdata, decgold):
                return (encdata, self.f(decgold)[0][0]), {}

        class PreProcE(object):
            def __init__(self, entmat):
                self.em = Val(entmat)

            def __call__(self, x):
                return (self.em[x],), {}

        transf = PreProc(entmat)
        predtransf = transf.f
        """
    else:
        # plain per-predicate embedding lookup
        predemb = VectorEmbed(numents, decdim)
        """transf = None
        predtransf = None"""

    # scoring
    scorer = MatchScore(question_enc, predemb, scorer=CosineDistance())

    # uniform random negative sampling over [0, numents)
    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            predrand = np.random.randint(self.min, self.max, gold.shape)
            return datas, predrand.astype("int32")

    # negatives drawn from a precomputed "close" sample space per gold idx,
    # falling back to uniform sampling when the space is too small (<= 5)
    class NegIdxGenClose(object):
        def __init__(self, revsamsp, rng):
            self.revsamsp = revsamsp
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            ret = np.zeros_like(gold)
            for i in range(gold.shape[0]):
                sampleset = self.revsamsp[gold[i]]
                if len(sampleset) > 5:
                    ret[i] = random.sample(sampleset, 1)[0]
                else:
                    ret[i] = np.random.randint(self.min, self.max)
            #embed()
            return datas, ret.astype("int32")


    # ranking objective: p = positive score, n = negative score
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        obj = lambda p, n: n - p

    if closenegsam:
        tt.msg("using close neg sampler")
        negidxgen = NegIdxGenClose(revsamplespace, numents)
    else:
        negidxgen = NegIdxGen(numents)

    # snapshot word-embedding values so we can verify training updates them
    checkembschange = True
    if checkembschange:
        #embed()
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        embvals = embvar.d.get_value()
    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingold]) \
                .negsamplegen(negidxgen) \
                .negrate(negrate) \
                .objective(obj) \
                .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
                .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")
    if checkembschange:
        # compare post-training embedding values against the snapshot
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        newembvals = embvar.d.get_value()
        embschanged = not np.allclose(embvals, newembvals)
        sumsqdiff = np.sum((newembvals - embvals)**2)
        print "Embeddings {}: {} sum of square diffs"\
            .format("changed" if embschanged else "did not change", sumsqdiff)

    # evaluation
    tt.tick("evaluating")
    qenc_pred = question_enc.predict(testdata)
    scores = []
    dontembed = True
    if atleastcan > 0:
        print "ensuring at least {} cans".format(atleastcan)
    if totalrandomtest:
        print "total randomness"
    for i in range(qenc_pred.shape[0]):
        if totalrandomtest:
            # sanity mode: only the gold answer is a candidate
            cans = [testgold[i]]
        else:
            cans = testsubjsrels[i][0] #+ testsubjsrels[i][1]
        if len(cans) < atleastcan:
            # pad the candidate list with random (non-duplicate) entities
            extracans = list(np.random.randint(0, numents, (atleastcan+50,)))
            extracans = list(set(extracans).difference(set(cans)))
            cans = cans + extracans[:max(0, min(len(extracans), atleastcan - len(cans)))]
            #print len(cans), cans
        if not dontembed:
            embed()
        #cans = set(cans)
        #if atleastcan > 0:
        #    while len(cans) < atleastcan:
        #        rancan = np.random.randint(0, numents)
        #        if rancan not in cans:
        #            cans.add(rancan)
        #cans = list(cans)
        if len(cans) == 0:
            # no candidates: record a sentinel so ranking below still works
            scores.append([(-1, -np.infty)])
            continue
        #canembs = predemb.predict.transform(predtransf)(cans)
        canembs = predemb.predict(cans)
        # score the (repeated) question encoding against every candidate
        scoresi = scorer.s.predict(np.repeat(qenc_pred[np.newaxis, i],
                                             canembs.shape[0], axis=0),
                                   canembs)
        scores.append(zip(cans, scoresi))
        if debug:
            embed()
        tt.progress(i, qenc_pred.shape[0], live=True)
    # rank candidates per question by score, take the top one
    sortedbest = [sorted(cansi, key=lambda (x, y): y, reverse=True) for cansi in scores]
    best = [sortedbesti[0][0] for sortedbesti in sortedbest]
    # Accuracy
    accuracy = np.sum(best == testgold) * 1. / testgold.shape[0]


    print("Accuracy: {}%".format(accuracy * 100))
Exemplo n.º 11
0
    def test_ns_training(self):
        """Negative-sampling training of a VectorEmbed against frozen Glove
        vectors; afterwards checks retrieval quality (MRR, recall@10/@1)
        of each index matching itself."""
        num = 2000
        self.expshape = (num, 50)
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        self.glove = Glove(self.expshape[1], self.expshape[0])
        self.cemb = VectorEmbed(indim=self.expshape[0] + 1,
                                dim=self.expshape[1])
        # out-of-range index must raise on both embedders
        self.assertRaises(Exception, self.glove.block.predict, [num + 1])
        self.assertRaises(Exception, self.cemb.predict, [num + 1])

        m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
        mg = MatchScore(self.glove.block,
                        self.glove.block)  # TODO factor out matchscore tests
        idxs = np.arange(num + 1)

        # glove against glove
        # default (dot-product) score of a vector with itself == squared norm
        self.assertTrue(
            np.allclose(mg.predict([num, 100], [num, 100]), [
                np.linalg.norm(self.glove % num)**2,
                np.linalg.norm(self.glove % 100)**2
            ]))

        # uniform random negative index generator over [0, num)
        class NegIdxGen():
            def __init__(self, num):
                self.n = num

            def __call__(self, l, r):
                return l, np.random.randint(0, self.n, r.shape)

        # train cemb to match glove on identical indices
        m = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(5)\
            .adagrad(lr=0.1)\
            .train(numbats=50, epochs=50)

        print m.predict([num, num - 1, num - 2, num - 1],
                        [num, num - 1, num - 2, num - 2])

        # retrieval eval: for each index a, rank all indices by match score
        # and find the rank of a itself
        mrr = 0.0
        recat10 = 0.0
        recat1 = 0.0
        tot = num + 1
        for a in range(tot):
            abc = zip(range(num + 1),
                      list(m.predict([a] * (num + 1), np.arange(0, num + 1))))
            abc = sorted(abc, key=lambda (x, y): y, reverse=True)
            #print abc[:10]
            for i in range(len(abc)):
                if abc[i][0] == a:
                    #print i
                    mrr += 1. / (1 + i)
                    if i < 10:
                        recat10 += 1
                    if i < 1:
                        recat1 += 1
                    break

        mrr /= tot
        recat10 /= tot
        recat1 /= tot
        print "%.3f MRR,\t%.3f MR@10,\t%.3f MR@1" % (mrr, recat10, recat1)
        self.assertGreater(mrr, 0.85)
        self.assertGreater(recat10, 0.9)
Exemplo n.º 12
0
    def __init__(self,
                 inpvocsize=400,
                 inpembdim=None,
                 inpemb=None,
                 outvocsize=100,
                 outembdim=None,
                 outemb=None,
                 encdim=100,
                 decdim=100,
                 bidir=False,
                 rnu=GRU,
                 statetrans=None,
                 vecout=None,
                 inconcat=True,
                 outconcat=False,
                 maskid=-1,
                 dropout=False,
                 attdist=CosineDistance(),
                 sepatt=False,
                 encoder=None,
                 decoder=None,
                 attention=None,
                 **kw):
        """Build an attention-based sequence encoder-decoder.

        Constructs (unless explicitly provided) an encoder, an attention
        mechanism using `attdist` as attention distance, and a decoder,
        then wires them together through the superclass. `encdim`/`decdim`
        may be a single int or a sequence of per-layer sizes. `sepatt`
        selects a separate-attention encoder variant. `statetrans`
        controls how encoder state is transformed into initial decoder
        state (built via self._build_state_trans).
        """
        # normalize inner dims to per-layer lists
        self.encinnerdim = [encdim] if not issequence(encdim) else encdim
        self.decinnerdim = [decdim] if not issequence(decdim) else decdim
        self.dropout = dropout

        # encoder
        if encoder is None:
            if sepatt:
                enc = self._getencoder_sepatt(indim=inpvocsize,
                                              inpembdim=inpembdim,
                                              inpemb=inpemb,
                                              innerdim=self.encinnerdim,
                                              bidir=bidir,
                                              maskid=maskid,
                                              dropout_in=dropout,
                                              dropout_h=dropout,
                                              rnu=rnu)
            else:
                enc = self._getencoder(indim=inpvocsize,
                                       inpembdim=inpembdim,
                                       inpemb=inpemb,
                                       innerdim=self.encinnerdim,
                                       bidir=bidir,
                                       maskid=maskid,
                                       dropout_in=dropout,
                                       dropout_h=dropout,
                                       rnu=rnu)
        else:
            enc = encoder

        # attention (built from the attention distance unless given)
        if attention is None:
            attention = self._getattention(attdist, sepatt=sepatt)

        self.lastencinnerdim = enc.outdim
        if decoder is None:
            dec = self._getdecoder(outvocsize=outvocsize,
                                   outembdim=outembdim,
                                   outemb=outemb,
                                   maskid=maskid,
                                   attention=attention,
                                   lastencinnerdim=self.lastencinnerdim,
                                   decinnerdim=self.decinnerdim,
                                   inconcat=inconcat,
                                   outconcat=outconcat,
                                   softmaxout=vecout,
                                   dropout=dropout,
                                   rnu=rnu)
        else:
            dec = decoder

        self.lastdecinnerdim = self.decinnerdim[-1]
        self.statetrans_setting = statetrans
        statetrans = self._build_state_trans(self.statetrans_setting)

        super(SimpleSeqEncDecAtt, self).__init__(enc,
                                                 dec,
                                                 statetrans=statetrans,
                                                 **kw)
Exemplo n.º 13
0
def run(
    negsammode="closest",  # "close" or "random"
    usetypes=True,
    mode="concat",  # "seq" or "concat" or "multi" or "multic" or "bino"
    glove=True,
    embdim=100,
    charencdim=100,
    charembdim=50,
    encdim=400,
    bidir=False,
    layers=1,
    charenc="rnn",  # "cnn" or "rnn"
    margin=0.5,
    lr=0.1,
    numbats=700,
    epochs=15,
    gradnorm=1.0,
    wreg=0.0001,
    loadmodel="no",
    debug=False,
    debugtest=False,
    forcesubjincl=False,
    randsameval=0,
    numtestcans=5,
    multiprune=-1,
    checkdata=False,
    testnegsam=False,
    testmodel=False,
    sepcharembs=False,
):
    """Train and evaluate joint subject+relation ranking for simple QA.

    Encodes questions at word+char level, subjects at char (and
    optionally type-word) level and relations at word level, trains a
    SeqMatchScore with negative sampling over both subject and relation,
    then predicts (subject, relation) pairs for the test set and prints
    subject / predicate / total accuracy. Drops into an IPython shell at
    the end with inspection helpers defined.
    """
    tt = ticktock("script")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    (subjmat, relmat), (subjdic, reldic), worddic, \
    subjinfo, (testsubjcans, relsperent) = readdata(debug=debug,
                                                    numtestcans=numtestcans if numtestcans > 0 else None)

    if usetypes:
        # prepend subject-type word ids to each subject's char sequence
        print "building type matrix"
        typmat = buildtypmat(subjmat, subjinfo, worddic)
        subjmat = np.concatenate([typmat, subjmat], axis=1)
        typlen = typmat.shape[1]

    relsamplespace = None
    subjsamplespace = None
    if negsammode == "closest" or negsammode == "close":
        # precomputed "close" negative sample spaces for rels and subjects
        relsamplespace, revind = buildrelsamplespace(relmat, worddic)
        subjsamplespace = loadsubjsamplespace()
    tt.tock("data loaded")

    if checkdata:
        embed()

    numwords = max(worddic.values()) + 1
    numsubjs = max(subjdic.values()) + 1
    numrels = max(reldic.values()) + 1
    maskid = -1
    numchars = 256

    # relation candidates per entity, only used by the "closest" sampler
    nsrelsperent = relsperent if negsammode == "closest" else None

    if testnegsam:
        # build the negative sampler and drop into IPython to poke at it
        nig = NegIdxGen(numsubjs - 1,
                        numrels - 1,
                        relclose=relsamplespace,
                        subjclose=subjsamplespace,
                        relsperent=nsrelsperent)
        embed()

    # decoder-side dim: halved for modes that concatenate two halves
    if mode == "seq" or mode == "multi":
        decdim = encdim
    elif mode == "concat" or mode == "multic" or mode == "bino":
        decdim = encdim / 2
    else:
        raise Exception("unrecognized mode")

    print "{} mode: {} decdim".format(mode, decdim)

    # defining model
    if glove:
        wordemb = Glove(embdim).adapt(worddic)
    else:
        wordemb = WordEmb(dim=embdim, indim=numwords)

    charemb = VectorEmbed(indim=numchars, dim=charembdim)
    charemb2 = VectorEmbed(indim=numchars, dim=charembdim)
    if charenc == "cnn":
        print "using CNN char encoder"
        charenc = CNNSeqEncoder(inpemb=charemb,
                                innerdim=[charencdim] * 2,
                                maskid=maskid,
                                stride=1)
    elif charenc == "rnn":
        print "using RNN char encoder"
        charenc = RNNSeqEncoder(inpemb=charemb, innerdim=charencdim) \
            .maskoptions(maskid, MaskMode.AUTO)
    else:
        raise Exception("no other character encoding modes available")

    if bidir:
        # halve per-direction size so total stays `encdim`
        encdim = encdim / 2

    # question encoder: two-level (char -> word -> sentence) unless "bino"
    if mode != "bino":
        if mode == "multi" or mode == "multic":
            wordenc = \
                SimpleSeq2MultiVec(inpemb=False, inpembdim=wordemb.outdim + charencdim,
                                   innerdim=encdim, bidir=bidir, numouts=2, mode="seq")
        else:
            encdim = [encdim] * layers
            wordenc = RNNSeqEncoder(inpemb=False,
                                    inpembdim=wordemb.outdim + charencdim,
                                    innerdim=encdim,
                                    bidir=bidir).maskoptions(MaskMode.NONE)

        question_encoder = TwoLevelEncoder(l1enc=charenc,
                                           l2emb=wordemb,
                                           l2enc=wordenc,
                                           maskid=maskid)

    else:
        question_encoder = BinoEncoder(charenc=charenc,
                                       wordemb=wordemb,
                                       maskid=maskid,
                                       scadim=100,
                                       encdim=encdim / 2,
                                       bidir=bidir,
                                       enclayers=layers,
                                       outdim=decdim,
                                       scabidir=True)

    # encode predicate on word level
    predemb = SimpleSeq2Vec(inpemb=wordemb,
                            innerdim=decdim,
                            maskid=maskid,
                            bidir=False,
                            layers=1)

    #predemb.load(relmat)

    # optionally use a separate char embedding for the subject encoder
    scharemb = charemb2 if sepcharembs else charemb
    if usetypes:
        # encode subj type on word level
        subjtypemb = SimpleSeq2Vec(inpemb=wordemb,
                                   innerdim=int(np.ceil(decdim * 1. / 2)),
                                   maskid=maskid,
                                   bidir=False,
                                   layers=1)
        # encode subject on character level
        charbidir = True
        charencinnerdim = int(np.floor(decdim * 1. / 2))
        charenclayers = 1
        if charbidir:
            charencinnerdim /= 2
            charenclayers = 2
        subjemb = SimpleSeq2Vec(inpemb=scharemb,
                                innerdim=charencinnerdim,
                                maskid=maskid,
                                bidir=charbidir,
                                layers=charenclayers)
        # combined block: splits input into type part and char part
        subjemb = TypedSubjBlock(typlen, subjemb, subjtypemb)
    else:
        # encode subject on character level
        subjemb = SimpleSeq2Vec(inpemb=scharemb,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=False,
                                layers=1)
    #subjemb.load(subjmat)
    if testmodel:
        embed()
    # package
    # left block = question side, right block = (subject, predicate) side
    if mode == "seq":
        lb = SeqLeftBlock(question_encoder)
        rb = RightBlock(subjemb, predemb)
    elif mode == "concat":
        lb = ConcatLeftBlock(question_encoder)
        rb = RightBlock(subjemb, predemb)
    elif mode == "multi" or mode == "multic":
        lb = MultiLeftBlock(question_encoder, mode)
        rb = RightBlock(subjemb, predemb)
    elif mode == "bino":
        lb = question_encoder
        rb = RightBlock(subjemb, predemb)
    else:
        raise Exception("unrecognized mode")
    scorer = SeqMatchScore(lb,
                           rb,
                           scorer=CosineDistance(),
                           aggregator=lambda x: x,
                           argproc=lambda x, y, z: ((x, ), (y, z)))

    # per-sample hinge over the (subject, relation) score pair
    obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1)

    # transforms gold idx pairs into (subject-seq, relation-seq) slices
    class PreProc(object):
        def __init__(self, subjmat, relmat):
            self.ef = PreProcEnt(subjmat)
            self.rf = PreProcEnt(relmat)

        def __call__(self, data, gold):  # gold: idxs-(batsize, 2)
            st = self.ef(gold[:, 0])[0][0]
            rt = self.rf(gold[:, 1])[0][0]
            return (data, st, rt), {}

    class PreProcE(object):
        def __init__(self, subjmat, relmat):
            self.ef = PreProcEnt(subjmat)
            self.rf = PreProcEnt(relmat)

        def __call__(self, x):
            subjslice = self.ef(x[:, 0])[0][0]
            relslice = self.rf(x[:, 1])[0][0]
            return (subjslice, relslice), {}

    # idx -> matrix row lookup wrapped as a graph Val
    class PreProcEnt(object):
        def __init__(self, mat):
            self.entmat = Val(mat)

        def __call__(self, x):
            return (self.entmat[x], ), {}

    transf = PreProc(subjmat, relmat)

    if debug:
        embed()

    if epochs > 0 and loadmodel == "no":
        tt.tick("training")
        # random 4-digit id used for checkpoint file names
        saveid = "".join([str(np.random.randint(0, 10)) for i in range(4)])
        print("CHECKPOINTING AS: {}".format(saveid))
        nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \
            .negsamplegen(NegIdxGen(numsubjs-1, numrels-1,
                                    relclose=relsamplespace,
                                    subjclose=subjsamplespace,
                                    relsperent=nsrelsperent)) \
            .objective(obj).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm) \
            .validate_on([validdata, validgold]) \
            .autosavethis(scorer, "fullrank{}.model".format(saveid)) \
            .train(numbats=numbats, epochs=epochs)
        tt.tock("trained").tick()

        # saving
        #scorer.save("fullrank{}.model".format(saveid))
        print("SAVED AS: {}".format(saveid))

    # NOTE(review): `is not` on a string literal relies on interning;
    # `loadmodel != "no"` would be the safe comparison
    if loadmodel is not "no":
        tt.tick("loading model")
        m = SeqMatchScore.load("fullrank{}.model".format(loadmodel))
        #embed()
        lb = m.l
        subjemb = m.r.subjenc
        predemb = m.r.predenc
        tt.tock("loaded model")

    # evaluation
    predictor = CustomPredictor(
        questionencoder=lb,
        entityencoder=subjemb,
        relationencoder=predemb,
        #mode=mode,
        enttrans=transf.ef,
        reltrans=transf.rf,
        debug=debugtest,
        subjinfo=subjinfo)

    tt.tick("predicting")
    if forcesubjincl:  # forces the intended subject entity to be among candidates
        for i in range(len(testsubjcans)):
            if testgold[i, 0] not in testsubjcans[i]:
                testsubjcans[i].append(testgold[i, 0])

    if randsameval > 0:  # generate random sampling eval data
        # gold is prepended so it is always among the random candidates
        testsubjcans = np.random.randint(0, numsubjs,
                                         (testgold.shape[0], randsameval))
        testrelcans = np.random.randint(0, numrels,
                                        (testgold.shape[0], randsameval))
        testsubjcans = np.concatenate([testgold[:, 0:1], testsubjcans], axis=1)
        testrelcans = np.concatenate([testgold[:, 1:2], testrelcans], axis=1)
        testsubjcans = testsubjcans.tolist()
        testrelcans = testrelcans.tolist()
        prediction = predictor.predict(testdata,
                                       entcans=testsubjcans,
                                       relcans=testrelcans)
    else:
        prediction = predictor.predict(testdata,
                                       entcans=testsubjcans,
                                       relsperent=relsperent,
                                       multiprune=multiprune)
    tt.tock("predicted")
    tt.tick("evaluating")
    # column 0 = subject, column 1 = predicate; total = both correct
    evalmat = prediction == testgold
    subjacc = np.sum(evalmat[:, 0]) * 1. / evalmat.shape[0]
    predacc = np.sum(evalmat[:, 1]) * 1. / evalmat.shape[0]
    totalacc = np.sum(np.sum(evalmat, axis=1) == 2) * 1. / evalmat.shape[0]
    print "Test results ::::::::::::::::"
    print "Total Acc: \t {}".format(totalacc)
    print "Subj Acc: \t {}".format(subjacc)
    print "Pred Acc: \t {}".format(predacc)
    tt.tock("evaluated")

    # pretty-print a subject ranking, marking the gold entry
    def subjinspect(subjrank, gold):
        ret = [
            (("GOLD - " if gold == x else "       ") + subjinfo[x][0] + " (" +
             " ".join(subjinfo[x][1]) + ")" + str(subjinfo[x][3]) + " rels",
             y) if x in subjinfo else (x, y) for x, y in subjrank
        ]
        return ret

    # interactive error browser over both subject and relation rankings
    def inspectboth(hidecorrect=False, hidenotincan=False):
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.subjranks)):
            subjx = testgold[i, 0]
            predx = testgold[i, 1]
            subjrank = predictor.subjranks[i]
            predrank = predictor.relranks[i]
            if hidecorrect and subjx == subjrank[0][0] and predrank[0][
                    0] == predx:
                continue
            if subjx not in [k for k, v in subjrank]:
                if hidenotincan:
                    continue

    # interactive error browser over subject rankings
    def inspectsubjs(hidecorrect=False,
                     hidenotincan=False,
                     shownotincan=False):
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.subjranks)):
            subjx = testgold[i, 0]
            subjrank = predictor.subjranks[i]
            if subjx == subjrank[0][0] and hidecorrect:  # only look for errors
                continue
            if subjx not in [k for k, v in subjrank]:
                if hidenotincan:
                    continue
            if shownotincan and subjx in [k for k, v in subjrank]:
                continue
            print "test question {}: {} \t GOLD: {}".format(
                i, wordids2string(
                    testdata[i, :, 0], rwd), "{} ({}) - {} rels --- {}".format(
                        *([
                            subjinfo[subjx][0], subjinfo[subjx][1],
                            subjinfo[subjx][3], subjinfo[subjx][2]
                        ] if subjx in
                          subjinfo else ["<UNK>", "<UNK>", "<UNK>", "<UNK>"])))
            inspres = subjinspect(subjrank, subjx)
            i = 1
            for inspre in inspres:
                print "{}:\t{}\t{}".format(i, inspre[1], inspre[0])
                if i % 50 == 0:
                    inp()
                i += 1
            inp()

    # interactive error browser over relation rankings
    def inspectpreds(hidecorrect=False):
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.relranks)):
            relx = testgold[i, 1]
            subjx = testgold[i, 0]
            relrank = predictor.relranks[i]
            if relx == relrank[0][0] and hidecorrect:
                continue
            print "test question {}: {} \t GOLD: {}".format(
                i, wordids2string(testdata[i, :, 0], rwd),
                wordids2string(relmat[relx, :], rwd))
            inspres = [(("GOLD - " if relx == x else "        ") +
                        wordids2string(relmat[x], rwd), y) for x, y in relrank]
            i = 1
            for inspre in inspres:
                print "{}:\t{}\t{}".format(i, inspre[1], inspre[0])
                if i % 50 == 0:
                    inp()
                i += 1
            inp()

    # drop into IPython with the inspection helpers in scope
    embed()