예제 #1
0
    def init(self):
        """Wire up the memory, encoder and decoder sub-blocks.

        Reads hyperparameters already set on self (numchars, numwords,
        wordencdim, wordembdim, glovepath, encinnerdim, outdim, entembdim,
        memdata, memaddr, attdim, decinnerdim) -- assumed to be assigned
        before init() is called; TODO confirm against the enclosing class.
        """
        #memory: encode an entity's surface form (char encoder + frozen
        # glove word embeddings, embtrainfrac=0.0) with a GRU
        wencpg = WordEncoderPlusGlove(numchars=self.numchars, numwords=self.numwords, encdim=self.wordencdim, embdim=self.wordembdim, embtrainfrac=0.0, glovepath=self.glovepath)
        self.memenco = SeqEncoder(
            wencpg,
            GRU(dim=self.wordembdim + self.wordencdim, innerdim=self.encinnerdim)
        )

        # memory payload = entity embedding concatenated with its encoding
        entemb = VectorEmbed(indim=self.outdim, dim=self.entembdim)
        self.mempayload = ConcatBlock(entemb, self.memenco)
        self.memblock = MemoryBlock(self.mempayload, self.memdata, indim=self.outdim, outdim=self.encinnerdim+self.entembdim)

        #encoder: separate (untied) word encoder for the input sequence
        wencpg2 = WordEncoderPlusGlove(numchars=self.numchars, numwords=self.numwords, encdim=self.wordencdim, embdim=self.wordembdim, embtrainfrac=0.0, glovepath=self.glovepath)
        self.enc = SeqEncoder(
            wencpg2,
            GRU(dim=self.wordembdim + self.wordencdim, innerdim=self.encinnerdim)
        )

        #decoder: memory-addressed softmax over entities
        entemb2 = VectorEmbed(indim=self.outdim, dim=self.entembdim)
        self.softmaxoutblock = stack(self.memaddr(self.memblock, indim=self.decinnerdim, memdim=self.memblock.outdim, attdim=self.attdim), Softmax())
        self.dec = SeqDecoder(
            [entemb2,  #self.memblock,
             # NOTE(review): dim uses entemb.outdim (the memory-side embedder),
             # not entemb2 -- both are built with dim=self.entembdim so the
             # value should agree, but confirm this is intentional.
             GRU(dim=entemb.outdim + self.encinnerdim, innerdim=self.decinnerdim),             # GRU(dim=self.memblock.outdim + self.encinnerdim, innerdim=self.decinnerdim),
             ],
            inconcat=True,
            innerdim=self.decinnerdim,
            softmaxoutblock=self.softmaxoutblock
        )
예제 #2
0
 def test_word_encoder_output_shape(self):
     """Predicting a (word-id + char-ids) batch yields one
     (encdim + embdim)-dim vector per example.

     Fix: dropped the unused local ``seqlen`` (dead assignment).
     """
     Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
     batsize = 111
     wordlen = 37
     numchars = 200
     numwords = 1000
     encdim = 100
     embdim = 50
     # column 0 is the word id; columns 1..wordlen are char ids
     worddata = np.random.randint(0, numwords, (batsize, 1))
     chardata = np.random.randint(0, numchars, (batsize, wordlen))
     data = np.concatenate([worddata, chardata], axis=1)
     block = WordEncoderPlusGlove(numchars=numchars, numwords=numwords, encdim=encdim, embdim=embdim)
     pred = block.predict(data)
     self.assertEqual(pred.shape, (batsize, encdim+embdim))
예제 #3
0
 def test_char_auto_mask(self):
     """With maskid=-1, trailing -1 char columns must be masked: the char
     encoder's states stop changing once the step index reaches wordlen.

     Fixes: the local ``blank`` was shadowed (int count, then the padding
     array built from it) -- the count is now ``nblank``; dropped the unused
     local ``seqlen``; replaced the manual while-loop with ``for``.
     """
     Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
     batsize = 11
     wordlen = 3
     nblank = 2  # number of trailing masked (-1) char columns
     numchars = 20
     numwords = 100
     encdim = 4
     embdim = 50
     worddata = np.random.randint(0, numwords, (batsize, 1))
     chardata = np.random.randint(0, numchars, (batsize, wordlen))
     blank = np.zeros((batsize, nblank)).astype("int32") - 1
     data = np.concatenate([worddata, chardata, blank], axis=1)
     block = WordEncoderPlusGlove(numchars=numchars,
                                  numwords=numwords,
                                  encdim=encdim,
                                  embdim=embdim,
                                  maskid=-1)
     block.enc.enc.all_outputs()
     pred = block.enc.enc.predict(data[:, 1:])  # char columns only
     for i in range(1, pred.shape[1]):
         # consecutive states are identical exactly in the masked region
         self.assertEqual(np.allclose(pred[:, i - 1, :], pred[:, i, :]),
                          i >= wordlen)
예제 #4
0
    def __init__(self,
                 wordembdim=50,
                 wordencdim=100,
                 innerdim=200,
                 outdim=1e4,
                 numwords=4e5,
                 numchars=128,
                 glovepath=None,
                 **kw):
        """Sequence encoder over glove-embedded, char-encoded words,
        followed by a linear output layer of size outdim."""
        super(FBBasicCompositeEncoder, self).__init__(**kw)
        # remember hyperparameters
        self.wordembdim = wordembdim
        self.wordencdim = wordencdim
        self.indim = wordembdim + wordencdim
        self.innerdim = innerdim
        self.outdim = outdim

        # per-word representation: frozen glove embedding (embtrainfrac=0.0)
        # concatenated with a char-level encoding, then a GRU over the words
        wordenc = WordEncoderPlusGlove(numchars=numchars,
                                       numwords=numwords,
                                       encdim=self.wordencdim,
                                       embdim=self.wordembdim,
                                       embtrainfrac=0.0,
                                       glovepath=glovepath)
        self.enc = SeqEncoder(
            wordenc,
            GRU(dim=self.wordembdim + self.wordencdim,
                innerdim=self.innerdim))

        # linear read-out from the encoder's final state
        self.out = Lin(indim=self.innerdim, dim=self.outdim)
예제 #5
0
 def test_word_encoder_output_shape(self):
     """Predicting a (word-id + char-ids) batch yields one
     (encdim + embdim)-dim vector per example.

     Fix: dropped the unused local ``seqlen`` (dead assignment).
     """
     Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
     batsize = 111
     wordlen = 37
     numchars = 200
     numwords = 1000
     encdim = 100
     embdim = 50
     # column 0 is the word id; columns 1..wordlen are char ids
     worddata = np.random.randint(0, numwords, (batsize, 1))
     chardata = np.random.randint(0, numchars, (batsize, wordlen))
     data = np.concatenate([worddata, chardata], axis=1)
     block = WordEncoderPlusGlove(numchars=numchars,
                                  numwords=numwords,
                                  encdim=encdim,
                                  embdim=embdim)
     pred = block.predict(data)
     self.assertEqual(pred.shape, (batsize, encdim + embdim))
예제 #6
0
    def __init__(
            self,
            entembdim=50,
            wordembdim=50,
            wordencdim=100,
            memdata=None,
            attdim=100,
            numchars=128,  # number of different chars
            numwords=4e5,  # number of different words
            glovepath=None,
            innerdim=100,  # dim of memory payload encoder output
            outdim=1e4,  # number of entities
            memaddr=DotMemAddr,
            **kw):
        """Memory-match model: encodes phrases and addresses a memory of
        entity surface forms.

        memdata is indexed as memdata[0] (fed to the entity embedder) and
        memdata[1] (fills the memory block) -- exact layout assumed from
        usage below; TODO confirm against callers.
        """
        super(FBMemMatch, self).__init__(**kw)
        self.wordembdim = wordembdim
        self.wordencdim = wordencdim
        self.entembdim = entembdim
        self.attdim = attdim
        self.encinnerdim = innerdim
        self.outdim = outdim

        # NOTE(review): unconditionally overrides the memaddr argument, so
        # the parameter (default DotMemAddr) is dead code -- looks like a
        # debugging leftover; confirm before relying on the parameter.
        memaddr = TransDotMemAddr

        # memory encoder per word
        #wencpg = WordEmbed(indim=numwords, outdim=self.wordembdim, trainfrac=1.0)
        wordencoder = WordEncoderPlusGlove(numchars=numchars,
                                           numwords=numwords,
                                           encdim=self.wordencdim,
                                           embdim=self.wordembdim,
                                           embtrainfrac=0.0,
                                           glovepath=glovepath)

        # memory encoder for one cell
        self.phraseencoder = SeqEncoder(
            wordencoder,
            GRU(dim=self.wordembdim + self.wordencdim,
                innerdim=self.encinnerdim))
        # entity embedder (embeddings precomputed for memdata[0])
        entemb = VectorEmbed(indim=self.outdim, dim=self.entembdim)
        self.entembs = entemb(
            memdata[0])  #Val(np.arange(0, self.outdim, dtype="int32")))
        # memory block; payload is the phrase encoder alone -- the
        # entity-embedding concat variant is left commented out
        self.mempayload = self.phraseencoder  #ConcatBlock(entemb, self.phraseencoder)
        self.memblock = MemoryBlock(
            self.mempayload,
            memdata[1],
            indim=self.outdim,
            outdim=self.encinnerdim)  # + self.entembdim)
        # memory addressing
        self.mema = memaddr(self.memblock,
                            memdim=self.memblock.outdim,
                            attdim=attdim,
                            indim=self.encinnerdim)
예제 #7
0
    def init(self):
        """Wire up memory, encoder, attention and decoder sub-blocks.

        Relies on hyperparameters already set on self (numchars, numwords,
        wordencdim, wordembdim, glovepath, encinnerdim, outdim, entembdim,
        memdata, memaddr, attdim, decinnerdim) -- assumed assigned before
        this is called; TODO confirm against the enclosing class.
        """
        #MEMORY: encodes how entity is written + custom entity embeddings
        wencpg = WordEncoderPlusGlove(numchars=self.numchars,
                                      numwords=self.numwords,
                                      encdim=self.wordencdim,
                                      embdim=self.wordembdim,
                                      embtrainfrac=0.0,
                                      glovepath=self.glovepath)
        self.memenco = SeqEncoder(
            wencpg,
            GRU(dim=self.wordembdim + self.wordencdim,
                innerdim=self.encinnerdim))

        # memory payload = entity embedding concatenated with its encoding
        entemb = VectorEmbed(indim=self.outdim, dim=self.entembdim)
        self.mempayload = ConcatBlock(entemb, self.memenco)
        self.memblock = MemoryBlock(self.mempayload,
                                    self.memdata,
                                    indim=self.outdim,
                                    outdim=self.encinnerdim + self.entembdim)

        #ENCODER: uses the same language encoder as memory
        # (wencpg is reused, so word-encoder weights are tied with the
        # memory encoder; the separate wencpg2 below was commented out)
        #wencpg2 = WordEncoderPlusGlove(numchars=self.numchars, numwords=self.numwords, encdim=self.wordencdim, embdim=self.wordembdim, embtrainfrac=0.0, glovepath=glovepath)
        self.enc = RecStack(
            wencpg,
            GRU(dim=self.wordembdim + self.wordencdim,
                innerdim=self.encinnerdim))

        #ATTENTION: gate generator sees concatenated enc+dec states
        attgen = LinearGateAttentionGenerator(indim=self.encinnerdim +
                                              self.decinnerdim,
                                              innerdim=self.attdim)
        attcon = WeightedSumAttCon()

        #DECODER: memory-addressed softmax over entities
        #entemb2 = VectorEmbed(indim=self.outdim, dim=self.entembdim)
        self.softmaxoutblock = stack(
            self.memaddr(self.memblock,
                         indim=self.decinnerdim + self.encinnerdim,
                         memdim=self.memblock.outdim,
                         attdim=self.attdim), Softmax())

        # input feed comes from the memory block itself (not a plain embed);
        # outconcat=True appends the attention context to the decoder state
        self.dec = SeqDecoder([
            self.memblock,
            GRU(dim=self.entembdim + self.encinnerdim,
                innerdim=self.decinnerdim)
        ],
                              outconcat=True,
                              inconcat=False,
                              attention=Attention(attgen, attcon),
                              innerdim=self.decinnerdim + self.encinnerdim,
                              softmaxoutblock=self.softmaxoutblock)
예제 #8
0
    def test_auto_mask_within_seq2vec(self):
        """Masked (-1) word and char positions must freeze the seq2vec
        encoder: per-step outputs repeat exactly once t >= seqlen."""
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        batsize = 11
        seqlen = 3
        seqblank = 2
        wordlen = 3
        wordblank = 2
        numchars = 20
        numwords = 100
        encdim = 4
        embdim = 50
        innerdim = 2

        # word-id channel: seqlen real ids, padded with seqblank mask steps
        worddata = np.concatenate([
            np.random.randint(0, numwords, (batsize, seqlen, 1)),
            np.zeros((batsize, seqblank, 1)).astype("int32") - 1,
        ], axis=1)
        # char channel: pad each word with wordblank mask chars, then pad
        # the sequence with seqblank fully-masked words
        chardata = np.concatenate([
            np.random.randint(0, numchars, (batsize, seqlen, wordlen)),
            np.zeros((batsize, seqlen, wordblank)).astype("int32") - 1,
        ], axis=2)
        chardata = np.concatenate([
            chardata,
            np.zeros((batsize, seqblank,
                      wordlen + wordblank)).astype("int32") - 1,
        ], axis=1)
        data = np.concatenate([worddata, chardata], axis=2)

        wordemb = WordEncoderPlusGlove(numchars=numchars,
                                       numwords=numwords,
                                       encdim=encdim,
                                       embdim=embdim,
                                       maskid=-1,
                                       embtrainfrac=0)
        rnn, lastdim = SimpleSeq2Vec.makernu(embdim + encdim,
                                             innerdim,
                                             bidir=False)
        enc = Seq2Vec(wordemb, rnn, maskid=-1)
        enc.enc.with_outputs()
        finalpred, pred = enc.predict(data)
        #print pred.shape, finalpred.shape
        #print pred[0], finalpred[0]
        for t in range(1, pred.shape[1]):
            # consecutive states match exactly in the masked region
            self.assertEqual(np.allclose(pred[:, t - 1, :], pred[:, t, :]),
                             t >= seqlen)
예제 #9
0
    def __init__(self,
                 wordembdim=50,
                 wordencdim=50,
                 entembdim=200,
                 innerdim=200,
                 attdim=100,
                 outdim=1e4,
                 numwords=4e5,
                 numchars=128,
                 glovepath=None,
                 **kw):
        """Attention-based encoder-decoder from word sequences to entity
        sequences."""
        super(FBSeqCompEncDecAtt, self).__init__(**kw)
        # hyperparameters
        self.wordembdim = wordembdim
        self.wordencdim = wordencdim
        self.entembdim = entembdim
        self.indim = wordembdim + wordencdim
        self.outdim = outdim
        # encoder and decoder share one inner dimension
        self.encinnerdim = innerdim
        self.decinnerdim = innerdim

        # per-word representation: frozen glove embedding + char encoding
        self.wordencoder = WordEncoderPlusGlove(numchars=numchars,
                                                numwords=numwords,
                                                encdim=self.wordencdim,
                                                embdim=self.wordembdim,
                                                embtrainfrac=0.0,
                                                glovepath=glovepath)
        self.rnn = RecStack(
            self.wordencoder,
            GRU(dim=wordembdim + wordencdim, innerdim=self.encinnerdim))

        # attention: gate generator driven by the concatenated enc+dec
        # states, weighted-sum context consumer
        attention = Attention(
            LinearGateAttentionGenerator(
                indim=self.encinnerdim + self.decinnerdim, innerdim=attdim),
            WeightedSumAttCon())

        # decoder: entity embedding -> GRU, attention context appended to
        # the output (outconcat=True, inconcat=False)
        self.dec = SeqDecoder(
            [VectorEmbed(indim=self.outdim, dim=self.entembdim),
             GRU(dim=self.entembdim, innerdim=self.decinnerdim)],
            attention=attention,
            outconcat=True,
            inconcat=False,
            innerdim=self.encinnerdim + self.decinnerdim)
예제 #10
0
    def __init__(self, wordembdim=50, wordencdim=100, entembdim=200, innerdim=200, outdim=1e4, numwords=4e5, numchars=128, glovepath=None, **kw):
        """Plain (no-attention) encoder-decoder from word sequences to
        entity sequences."""
        super(FBSeqCompositeEncDec, self).__init__(**kw)
        # hyperparameters
        self.wordembdim = wordembdim
        self.wordencdim = wordencdim
        self.entembdim = entembdim
        self.indim = wordembdim + wordencdim
        self.outdim = outdim
        self.encinnerdim = innerdim
        self.decinnerdim = innerdim

        # encoder: char+glove word representation (glove frozen,
        # embtrainfrac=0.0) fed through a GRU over the sequence
        wordenc = WordEncoderPlusGlove(numchars=numchars,
                                       numwords=numwords,
                                       encdim=self.wordencdim,
                                       embdim=self.wordembdim,
                                       embtrainfrac=0.0,
                                       glovepath=glovepath)
        self.enc = SeqEncoder(
            wordenc,
            GRU(dim=self.wordembdim + self.wordencdim,
                innerdim=self.encinnerdim))

        # decoder: entity embedding with the encoder summary concatenated
        # onto each input step (inconcat=True)
        self.dec = SeqDecoder(
            [VectorEmbed(indim=self.outdim, dim=self.entembdim),
             GRU(dim=self.entembdim + self.encinnerdim,
                 innerdim=self.decinnerdim)],
            inconcat=True,
            innerdim=self.decinnerdim,
        )
예제 #11
0
def run(epochs=10,
        numbats=100,
        numsam=10000,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.wordchar.pkl",
        embdim=50,
        encdim=50,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        keepmincount=5,
        sameenc=False,
        memaddr="dot",
        memattdim=100,
        layers=1,
        embtrainfrac=0.0,
        mem=False,
        membidir=False,
        memlayers=1,
        sharedwordenc=False):
    """ Memory match-based glove-based word-level relation classification """
    # NOTE(review): numsam, keepmincount and sameenc are accepted but never
    # read below -- confirm whether they are vestigial.

    (traindata, traingold), (validdata, validgold), (testdata, testgold), worddic, chardic, entdic\
        = readdata(datap)

    # get words from relation names, update word dic
    memdata = getmemdata(entdic, worddic, chardic)

    # get glove and transform word mats to glove index space
    d2g, newdic, glove = getdic2glove(worddic,
                                      dim=embdim,
                                      trainfrac=embtrainfrac)
    # remap column 0 (word ids) of every matrix into glove index space,
    # leaving the remaining (char) columns untouched
    traindata, validdata, testdata, memdata = \
        [np.concatenate([np.vectorize(d2g)(x[..., 0]).reshape(x.shape[:2] + (1,)), x[..., 1:]], axis=2)
         for x in [traindata, validdata, testdata, memdata]]

    print traindata.shape, testdata.shape
    #embed()

    numwords = max(worddic.values()) + 1  # don't use this, use glove
    numchars = max(chardic.values()) + 1
    numrels = max(entdic.values()) + 1

    # per-layer encoder dims; halved per direction when bidirectional
    # (note: Python 2 integer division when innerdim is an int)
    if bidir:
        encinnerdim = [innerdim / 2] * layers
    else:
        encinnerdim = [innerdim] * layers

    # question encoder: char+glove word representation -> stacked RNN -> vec
    wordemb = WordEncoderPlusGlove(numchars=numchars,
                                   encdim=encdim,
                                   embdim=embdim,
                                   maskid=-1,
                                   embtrainfrac=embtrainfrac)
    rnn, lastdim = SimpleSeq2Vec.makernu(embdim + encdim,
                                         encinnerdim,
                                         bidir=bidir)
    enc = Seq2Vec(wordemb, rnn, maskid=-1)

    if mem:
        # memory branch: encode relation surface forms and classify by
        # memory addressing instead of a plain linear layer
        memembdim = embdim
        memencdim = encdim
        # NOTE(review): innerdim is rebound to a *list* here and later
        # passed as memdim to MemVec2Idx -- confirm that is intended.
        if membidir:
            innerdim = [innerdim / 2] * memlayers
        else:
            innerdim = [innerdim] * memlayers
        if not sharedwordenc:
            memwordemb = WordEncoderPlusGlove(numchars=numchars,
                                              encdim=encdim,
                                              embdim=embdim,
                                              maskid=-1,
                                              embtrainfrac=embtrainfrac)
        else:
            # tie the memory word encoder with the question word encoder
            memwordemb = wordemb
        memrnn, memlastdim = SimpleSeq2Vec.makernu(memembdim + memencdim,
                                                   innerdim,
                                                   bidir=membidir)
        memenc = Seq2Vec(memwordemb, memrnn, maskid=-1)
        # map the memaddr string to its addressing-block class
        if memaddr is None or memaddr == "dot":
            memaddr = DotMemAddr
        elif memaddr == "lin":
            memaddr = LinearGateMemAddr
        dec = MemVec2Idx(memenc,
                         memdata,
                         memdim=innerdim,
                         memaddr=memaddr,
                         memattdim=memattdim)
    else:
        dec = SimpleVec2Idx(indim=innerdim, outdim=numrels)

    m = Seq2Idx(enc, dec)

    # adagrad + L2 + gradient-norm clipping; keep the best model seen on
    # the validation set
    m = m.train([traindata], traingold).adagrad(lr=lr).l2(wreg).grad_total_norm(1.0).cross_entropy()\
        .validate_on([validdata], validgold).accuracy().cross_entropy().takebest()\
        .train(numbats=numbats, epochs=epochs)

    pred = m.predict(testdata)
    print pred.shape
    evalres = evaluate(np.argmax(pred, axis=1), testgold)
    print str(evalres) + "%"