def shared_dataset(data_xy, borrow=True):
    """ Function that loads the dataset into shared variables

    The reason we store our dataset in shared variables is to allow
    Theano to copy it into the GPU memory (when code is run on GPU).
    Since copying data into the GPU is slow, copying a minibatch every
    time it is needed (the default behaviour if the data is not in a
    shared variable) would lead to a large decrease in performance.
    """
    data_x, data_y = data_xy
    shared_x = sparse.shared(data_x.astype(theano.config.floatX),
                             borrow=borrow)
    shared_y = theano.shared(np.asarray(data_y, dtype=theano.config.floatX),
                             borrow=borrow)

    # One-hot encoded labels as {-1, 1}.
    n_classes = len(np.unique(data_y))  # assumes every class occurs in data_y
    y1 = -1 * np.ones((data_y.shape[0], n_classes))
    y1[np.arange(data_y.shape[0]), data_y] = 1
    shared_y1 = theano.shared(np.asarray(y1, dtype=theano.config.floatX),
                              borrow=borrow)

    # When storing data on the GPU it has to be stored as floats, so we
    # store the labels as ``floatX`` as well (``shared_y`` does exactly
    # that). During computation, however, we need them as ints (the labels
    # are used as indices, which makes no sense for floats), so instead of
    # returning ``shared_y`` directly we cast it to int. This little hack
    # lets us get around the issue.
    return shared_x, T.cast(shared_y, 'int32'), T.cast(shared_y1, 'int32')
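
# --- Added usage sketch (not part of the original snippet) ---
# Hypothetical example of calling shared_dataset on a tiny SciPy sparse
# design matrix. It assumes the module-level names the function relies on
# (np, theano, T, and a `sparse` module whose `shared` accepts SciPy sparse
# matrices, e.g. theano.sparse) are available as imported below.
import numpy as np
import scipy.sparse
import theano
import theano.tensor as T
from theano import sparse  # assumption: this provides sparse.shared

toy_x = scipy.sparse.csr_matrix(
    np.random.rand(6, 4).astype(theano.config.floatX))
toy_y = np.array([0, 1, 2, 0, 1, 2])
toy_sx, toy_sy, toy_sy1 = shared_dataset((toy_x, toy_y))

# T.cast returns symbolic expressions, so .eval() forces their values.
print(toy_sy.eval())   # int32 class labels
print(toy_sy1.eval())  # {-1, 1} one-hot targets, shape (6, 3)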
Example #2
def _build_mask(self):
    big_m = np.zeros((self.v_dim, self.v_dim), dtype=theano.config.floatX)
    k = 0
    for i in xrange(len(self.v_ranges)):
        for j in xrange(len(self.v_ranges[i])):
            big_m[k, self.v_ranges[i][j]] = 1
            k += 1
    # self.big_mask = theano.shared(big_m, name='big_mask')
    # Sparse mask
    self.big_mask = sparse.shared(sp.csc_matrix(big_m), name='big_mask')
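
# Illustrative, self-contained sketch (not part of the original class):
# build the same kind of 0/1 mask from a hypothetical v_ranges and apply it
# with theano.sparse.dot, which multiplies a sparse matrix by a dense one.
import numpy as np
import scipy.sparse
import theano
import theano.tensor as T
from theano import sparse as th_sparse

v_ranges = [[0, 1], [2], [3, 4, 5]]   # hypothetical column groups
v_dim = 6
mask = np.zeros((v_dim, v_dim), dtype=theano.config.floatX)
k = 0
for group in v_ranges:
    for col in group:
        mask[k, col] = 1
        k += 1

# theano.shared dispatches to a sparse shared variable for SciPy matrices.
big_mask = theano.shared(scipy.sparse.csc_matrix(mask), name='big_mask')
v = T.matrix('v')
f = theano.function([v], th_sparse.dot(big_mask, v))
print(f(np.eye(v_dim, dtype=theano.config.floatX)))  # mask applied to identity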
Example #3
def run(jobman, debug=False):
    expstart = time.time()
    hp = jobman.state

    if not os.path.exists('files/'): os.mkdir('files/')

    # Symbolic variables
    s_bow = T.matrix()
    s_idx = T.iscalar()
    s_tf = T.scalar()
    s_posit = T.matrix()  #theano.sparse.csr_matrix()
    s_negat = T.matrix()  #theano.sparse.csr_matrix()

    sentences = cPickle.load(
        open('/scratch/rifaisal/data/guten/guten_subset_idx.pkl'))

    senna = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl'))
    gsubset = cPickle.load(
        open('/scratch/rifaisal/data/guten/guten_vocab_subset.pkl')).flatten(
        ).tolist()
    hashtab = dict(zip(gsubset, range(len(gsubset))))

    tfidf_data = numpy.load('/scratch/rifaisal/data/guten/guten_tfidf.npy'
                            ).item().tocsr().astype('float32')

    #tfidf = cPickle.load(open('/scratch/rifaisal/repos/senna/gutentokenizer.pkl'))

    senna = numpy.array(senna)[gsubset].tolist()
    s_valid = theano.sparse.csr_matrix()

    validsentence = sentences[10000:10010]

    nsent = len(sentences)
    nsenna = len(senna)

    # Layers

    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act=identity)

    H = ae(i_size=hp['embedsize'] * hp['wsize'],
           h_size=hp['hsize'],
           e_act=T.tanh)
    L = logistic(i_size=hp['hsize'], h_size=1, act=identity)
    S = logistic(i_size=hp['embedsize'], h_size=nsenna, act=T.nnet.softmax)

    valid_embedding = sparse.supervised.logistic(i_size=nsenna,
                                                 h_size=hp['embedsize'],
                                                 act=identity)
    valid_embedding.params['weights'] = sp.shared(
        value=scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(
            borrow=True)))
    valid_embedding.params['bias'] = embedding.params['e_bias']

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape(
        (1, hp['embedsize'] * hp['wsize']))
    negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape(
        (hp['nneg'], hp['embedsize'] * hp['wsize']))
    valid_embed = sp.dot(s_valid, valid_embedding.params['weights']).reshape(
        (nsenna, hp['embedsize'] * hp['wsize']))

    posit_score = L.encode(H.encode(posit_embed))
    negat_score = L.encode(H.encode(negat_embed))
    valid_score = L.encode(H.encode(valid_embed))

    C = (negat_score - posit_score.flatten() + hp['margin'])

    s_bow_pred = S.encode(embedding.encode(s_bow))

    pred = s_tf * nllsoft(s_bow_pred, s_idx)

    CC = (rect(C)).mean() + hp['lambda'] * pred

    opt = theano.function(
        [s_posit, s_negat, s_bow, s_idx, s_tf], [(rect(C)).mean(), pred],
        updates=dict(
            S.update(CC, lr) + L.update(CC, lr) + H.update(CC, lr) +
            embedding.update_norm(CC, lr)))

    #validfct = theano.function([s_valid],valid_score)

    def saveexp():
        save(embedding, fname + 'embedding.pkl')
        save(H, fname + 'hidden.pkl')
        save(L, fname + 'logistic.pkl')

    delta = hp['wsize'] / 2
    rest = hp['wsize'] % 2

    freq_idx = cPickle.load(
        open('/scratch/rifaisal/data/guten/gutten_sorted_vocab.pkl'))[:1000]
    freq_idx = [hashtab[idx] for idx in freq_idx]

    fname = ''

    for e in range(hp['epoch']):
        c = []
        r = []
        count = 1
        for i in range(nsent):
            rsent = numpy.random.randint(nsent - 1)
            nword = len(sentences[rsent])
            if nword < hp['wsize'] + 2:
                continue

            pidx = numpy.random.randint(low=delta, high=nword - delta)
            pchunk = sentences[rsent][pidx - delta:pidx + delta + rest]
            nchunk = []
            st = sentences[rsent][pidx - delta:pidx]
            en = sentences[rsent][pidx + 1:pidx + delta + rest]
            rndidx = numpy.random.randint(nsenna, size=(hp['nneg'], ))
            nchunk = []
            for j in range(hp['nneg']):
                nchunk += en + [rndidx[j]] + st

            assert len(nchunk) == len(pchunk) * hp['nneg']
            tfidf_chunk = tfidf_data[rsent:rsent + 1].toarray()
            #pdb.set_trace()
            tfidf_value = tfidf_chunk[0, sentences[rsent][pidx]]
            tfidf_chunk[0, sentences[rsent][pidx]] = 0.
            tfidx = sentences[rsent][
                pidx]  # numpy.zeros(tfidf_chunk.shape).astype('float32')
            #tfidx[0,sentences[rsent][pidx]] = 1.
            p, n, b, iidx, tfval = (idx2mat(pchunk,
                                            nsenna), idx2mat(nchunk, nsenna),
                                    tfidf_chunk, tfidx, tfidf_value)
            count += tfval != 0
            l, g = opt(p, n, b, iidx, tfval)
            c.append(l)
            r.append(g)
            """
            if (time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (i+1)%(20*hp['freq']) == 0 and debug==False:
                valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))
                mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize'])
                hp['mrk'] = mrk
                jobman.save()
                saveexp()
                print 'Random Valid Mean rank',mrk
            """

            if (i + 1) % hp['freq'] == 0 or debug:
                hp['score'] = numpy.array(c).sum() / (numpy.array(c) > 0).sum()
                hp['pred'] = numpy.array(r).sum() / float(count)
                hp['e'] = e
                hp['i'] = i
                print ''
                print e, i, 'NN Score:', hp['score'], 'Reconstruction:', hp[
                    'pred']

                if not debug:
                    ne = knn(
                        freq_idx,
                        embedding.params['e_weights'].get_value(borrow=True))
                    open('files/' + fname + 'nearest.txt',
                         'w').write(display(ne, senna))
                    saveexp()
                sys.stdout.flush()
                jobman.save()

    saveexp()
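
# Hypothetical stand-alone sketch of the pairwise ranking objective that the
# run() function above optimizes: the positive window's score should exceed
# each corrupted window's score by at least `margin`; rect(C) is the hinge.
import numpy as np
import theano
import theano.tensor as T

pos_score = T.vector('pos_score')    # shape (1,): score of the true window
neg_score = T.vector('neg_scores')   # shape (nneg,): corrupted-window scores
margin = 1.0

hinge = T.maximum(0, neg_score - pos_score[0] + margin)
loss = hinge.mean()

rank_loss = theano.function([pos_score, neg_score], loss)
print(rank_loss(np.array([2.0], dtype=theano.config.floatX),
                np.array([0.5, 2.5, 1.9], dtype=theano.config.floatX)))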
Example #4
def run(jobman,debug = False):
    expstart = time.time()
    hp = jobman.state

    if not os.path.exists('files/'): os.mkdir('files/')

    # Symbolic variables
    s_bow = T.matrix()
    s_posit = T.matrix()#theano.sparse.csr_matrix()
    s_negat = T.matrix()#theano.sparse.csr_matrix()

    sentences = cPickle.load(open('/scratch/rifaisal/data/guten/guten_subset_idx.pkl'))

    senna = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl'))
    gsubset = cPickle.load(open('/scratch/rifaisal/data/guten/guten_vocab_subset.pkl')).flatten().tolist()
    hashtab = dict( zip( gsubset, range( len( gsubset))))    

    senna = numpy.array(senna)[gsubset].tolist()
    s_valid = theano.sparse.csr_matrix()

    validsentence = sentences[-10:]
    sentences = sentences[:-10]




    nsent = len(sentences)
    nsenna = len(senna)

    # Layers
    
    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act = T.nnet.sigmoid)
    H = ae(i_size = hp['embedsize']*hp['wsize'], h_size=hp['hsize'], e_act = rect, d_act = hardtanh)
    L = logistic(i_size = hp['hsize'],  h_size = 1)

    valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act = T.nnet.sigmoid)
    valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))
    valid_embedding.params['bias'] = embedding.params['e_bias']

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    posit_embed = embedding.encode(s_posit).reshape((1,hp['embedsize']*hp['wsize']))
    negat_embed = embedding.encode(s_negat).reshape((hp['nneg'],hp['embedsize']*hp['wsize']))
    valid_embed = valid_embedding.encode(s_valid).reshape((nsenna,hp['embedsize']*hp['wsize']))

    posit_score = L.encode(H.encode(posit_embed))
    negat_score = L.encode(H.encode(negat_embed))
    valid_score = L.encode(H.encode(valid_embed))

    C = (negat_score - posit_score.flatten() + hp['margin'])

    rec = embedding.reconstruct(s_bow, loss='ce')
    CC = (rect(C)).mean() + hp['lambda'] * rec

    opt = theano.function([s_posit, s_negat, s_bow], 
                          [C.mean(),rec], 
                          updates = dict( L.update(CC,lr) + H.update(CC,lr) + embedding.update(CC,lr)) )

    validfct = theano.function([s_valid],valid_score)

    def saveexp():
        save(embedding,fname+'embedding.pkl')
        save(H,fname+'hidden.pkl')
        save(L,fname+'logistic.pkl')
        print 'Saved successfully'

    delta = hp['wsize']/2
    rest = hp['wsize']%2

    freq_idx = cPickle.load(open('/scratch/rifaisal/data/guten/gutten_sorted_vocab.pkl'))[:1000]
    freq_idx =  [ hashtab[idx] for idx in freq_idx ]

    fname = ''
    
    for e in range(hp['epoch']):
        c = []
        r = []
        for i in range(nsent):
            rsent = numpy.random.randint(nsent-1)
            nword = len(sentences[rsent])
            if nword < hp['wsize'] + 2:
                continue

            pidx = numpy.random.randint(low = delta, high = nword-delta)
            pchunk = sentences[rsent][pidx-delta:pidx+delta+rest]
            nchunk = []
            st = sentences[rsent][pidx-delta:pidx]
            en = sentences[rsent][pidx+1:pidx+delta+rest]
            rndidx = numpy.random.randint(nsenna, size = (hp['nneg'],))
            nchunk = []
            for j in range(hp['nneg']):
                nchunk += en + [rndidx[j]] + st


            assert len(nchunk) == len(pchunk)*hp['nneg']

            p, n, b = (idx2mat(pchunk,nsenna), idx2mat(nchunk,nsenna), idx2vec(sentences[rsent],nsenna))

            l,g = opt(p,n,b)
            c.append(l)
            r.append(g)
            
            if (time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (i+1)%(50*hp['freq']) == 0:
                valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))
                mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize'])
                hp['mrk'] = mrk
                hp['e'] = e
                hp['i'] = i
                jobman.save()
                saveexp()
                print 'Random Valid Mean rank',mrk

            if i%hp['freq'] == 0:
                hp['score'] = numpy.array(c).mean()
                hp['rec'] = numpy.array(r).mean()
                print e,i,'NN Score:', hp['score'], 'Reconstruction:', hp['rec']

                ne = knn(freq_idx,embedding.params['e_weights'].get_value(borrow=True))
                open('files/'+fname+'nearest.txt','w').write(display(ne,senna))

                saveexp()
                sys.stdout.flush()
                jobman.save()
                
    saveexp()
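
# Hypothetical stand-alone sketch of the negative-chunk construction used in
# the training loop above: keep the context of the positive window and swap
# the centre word for a random vocabulary index, once per negative sample.
# (The loop above builds each corrupted window as en + [rndidx[j]] + st; the
# st + [rndidx[j]] + en ordering below is the variant used further down this
# page and is easier to read.)
import numpy

sentence = [5, 9, 3, 7, 2, 8, 4]           # word indices of one sentence
wsize, nneg, nsenna = 5, 3, 50
delta, rest = wsize // 2, wsize % 2

pidx = 3                                    # centre position
pchunk = sentence[pidx - delta:pidx + delta + rest]
st = sentence[pidx - delta:pidx]
en = sentence[pidx + 1:pidx + delta + rest]

rndidx = numpy.random.randint(nsenna, size=(nneg,))
nchunk = []
for j in range(nneg):
    nchunk += st + [rndidx[j]] + en         # one corrupted copy of the window

assert len(nchunk) == len(pchunk) * nneg
print(pchunk, nchunk)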
Example #5
def run(jobman, debug=False):
    expstart = time.time()
    hp = jobman.state

    if not os.path.exists('files/'): os.mkdir('files/')

    # Symbolic variables
    s_posit = T.matrix()
    s_negat = T.matrix()
    idx_start = T.lscalar()
    idx_stop = T.lscalar()
    s_valid = theano.sparse.csr_matrix()

    w2i = cPickle.load(
        open(
            '/mnt/scratch/bengio/bengio_group/data/gutenberg/merged_word2idx.pkl'
        ))
    i2w = dict((v, k) for k, v in w2i.iteritems())
    i2w[0] = 'UNK'
    senna = [i2w[i] for i in range(len(i2w.keys()))]

    nsenna = len(senna)

    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act=identity)
    H = ae(i_size=hp['embedsize'] * hp['wsize'],
           h_size=hp['hsize'],
           e_act=T.tanh)
    L = logistic(i_size=hp['hsize'], h_size=1, act=identity)

    del H.params['d_bias']
    del embedding.params['d_bias']
    del embedding.params['e_bias']
    minsize = hp['minsize']
    maxsize = hp['maxsize']

    dsize = maxsize - minsize + 1

    H.params['e_bias'] = theano.shared(numpy.array(numpy.zeros(
        (dsize, hp['hsize'])),
                                                   dtype=theano.config.floatX),
                                       name='e_bias')

    path = hp['loadpath']

    if path:
        load(embedding, path + '/embedding.pkl')
        #load(H,path+'/hidden.pkl')
        #load(L,path+'/logistic.pkl')
        hp['embedsize'] = embedding.params['e_weights'].get_value(
            borrow=True).shape[1]
        #hp['hsize'] = H.params['e_weights'].get_value(borrow=True).shape[1]
        jobman.save()

    H.params['e_bias'] = theano.shared(numpy.array(numpy.zeros(
        (dsize, hp['hsize'])),
                                                   dtype=theano.config.floatX),
                                       name='e_bias')
    valid_embedding = sparse.supervised.logistic(i_size=nsenna,
                                                 h_size=hp['embedsize'],
                                                 act=identity)
    valid_embedding.params['weights'] = sp.shared(
        value=scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(
            borrow=True)))

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape(
        (1, hp['embedsize'] * hp['wsize']))
    negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape(
        (hp['nneg'], hp['embedsize'] * hp['wsize']))
    valid_embed = sp.dot(s_valid, valid_embedding.params['weights']).reshape(
        (nsenna, hp['embedsize'] * hp['wsize']))

    posit_embed_left = T.concatenate([
        posit_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']],
        T.zeros_like(posit_embed[:, idx_stop * hp['embedsize']:])
    ],
                                     axis=1)

    negat_embed_left = T.concatenate([
        negat_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']],
        T.zeros_like(negat_embed[:, idx_stop * hp['embedsize']:])
    ],
                                     axis=1)

    posit_embed_right = T.concatenate([
        T.zeros_like(posit_embed[:, :idx_start * hp['embedsize']]),
        posit_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']]
    ],
                                      axis=1)

    negat_embed_right = T.concatenate([
        T.zeros_like(negat_embed[:, :idx_start * hp['embedsize']]),
        negat_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']]
    ],
                                      axis=1)

    posit_embed = T.concatenate([
        T.zeros_like(posit_embed[:, :idx_start * hp['embedsize']]),
        posit_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']],
        T.zeros_like(posit_embed[:, idx_stop * hp['embedsize']:])
    ],
                                axis=1)

    negat_embed = T.concatenate([
        T.zeros_like(negat_embed[:, :idx_start * hp['embedsize']]),
        negat_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']],
        T.zeros_like(negat_embed[:, idx_stop * hp['embedsize']:])
    ],
                                axis=1)

    #posit_embed = ifelse(T.eq(idx_start, 0), posit_embed_left, posit_embed)
    #posit_embed = ifelse(T.eq(idx_stop, hp['maxsize']), posit_embed_right, posit_embed)

    #negat_embed = ifelse(T.eq(idx_start, 0), negat_embed_left, negat_embed)
    #negat_embed = ifelse(T.eq(idx_stop, hp['maxsize']), negat_embed_right, negat_embed)

    Hposit = T.tanh(
        T.dot(posit_embed, H.params['e_weights']) +
        H.params['e_bias'][idx_stop - idx_start - minsize, :])
    Hnegat = T.tanh(
        T.dot(negat_embed, H.params['e_weights']) +
        H.params['e_bias'][idx_stop - idx_start - minsize, :])
    posit_score = L.encode(Hposit)
    negat_score = L.encode(Hnegat)
    valid_score = L.encode(H.encode(valid_embed))

    C = (negat_score - posit_score.flatten() + hp['margin'])

    CC = (rect(C)).mean()

    opt = theano.function([s_posit, s_negat, idx_start, idx_stop],
                          (rect(C)).mean(),
                          updates=dict(
                              L.update(CC, lr) + H.update(CC, lr) +
                              embedding.update_norm(CC, lr)))

    validfct = theano.function([s_valid], valid_score)

    def saveexp():
        save(embedding, fname + 'embedding.pkl')
        save(H, fname + 'hidden.pkl')
        save(L, fname + 'logistic.pkl')

    delta = hp['wsize'] / 2
    rest = hp['wsize'] % 2

    freq_idx = cPickle.load(
        open('/mnt/scratch/bengio/bengio_group/data/gutenberg/sorted_vocab.pkl'
             ))[:2000]
    fname = ''
    validsentence = [
    ]  # cPickle.load(open('/scratch/rifaisal/data/wiki_april_2010/valid_debug.pkl'))
    tseenwords = not debug
    for e in range(hp['epoch']):
        hp['split'] = numpy.random.randint(45)
        sentences = cPickle.load(
            open(
                '/mnt/scratch/bengio/bengio_group/data/gutenberg/ints_50000/split'
                + str(hp['split']) + '.pkl'))
        nsent = len(sentences)
        bigc = []
        bigr = []

        seen_words = 0
        for i, s in enumerate(sentences):
            nword = len(s)
            seen_words += nword
            tseenwords += nword

            if nword < hp['maxsize'] + 2:
                continue
            rndsize = numpy.random.randint(low=hp['minsize'] + 1,
                                           high=hp['maxsize'] - 1)
            idxsta = numpy.random.randint(low=1, high=hp['maxsize'] - rndsize)
            idxsto = idxsta + rndsize

            print 'r', rndsize, 'b', idxsta, 'e', idxsto, 'shape', H.params[
                'e_bias'].get_value().shape

            c = []
            r = []
            if debug:
                print ' *** Processing document', i, 'with', nword,
                sys.stdout.flush()
            for j in range(delta, nword - delta):
                nd = rndsize / 2
                rd = rndsize % 2
                pchunk = s[j - delta:j + delta + rest]
                nchunk = []

                rndidx = numpy.random.randint(nsenna, size=(hp['nneg'], ))
                nchunk = []
                for kk in range(hp['nneg']):
                    tmpchunk = copy.copy(pchunk)
                    tmpchunk[idxsta + nd] = rndidx[kk]
                    nchunk += tmpchunk
                assert len(nchunk) == len(pchunk) * hp['nneg']
                p, n = (idx2mat(pchunk, nsenna), idx2mat(nchunk, nsenna))
                l = opt(p, n, idxsta, idxsto)
                c.append(l)

                if debug:
                    print '.',
                    break

            if debug:
                print ''

            bigc += [numpy.array(c).sum()]

            if 0:  #(time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (tseenwords)>(10*hp['freq']):
                tseenwords = 0
                valid_embedding.params['weights'] = sp.shared(
                    value=scipy.sparse.csr_matrix(
                        embedding.params['e_weights'].get_value(borrow=True)))
                mrk = evaluation.error(validsentence, validfct, nsenna,
                                       hp['wsize'])
                hp['mrk'] = mrk
                jobman.save()
                saveexp()
                print 'Random Valid Mean rank', mrk

            if seen_words > hp['freq'] or debug:
                seen_words = 0
                hp['score'] = numpy.array(bigc).mean()
                hp['e'] = e
                hp['i'] = i
                print ''
                print e, i, 'NN Score:', hp['score']

                if not debug:
                    ne = knn(
                        freq_idx,
                        embedding.params['e_weights'].get_value(borrow=True))
                    open('files/' + fname + 'nearest.txt',
                         'w').write(display(ne, senna))
                    saveexp()
                sys.stdout.flush()
                jobman.save()

    saveexp()
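
# Hypothetical sketch of the size-conditioned bias used above: H keeps one
# bias row per admissible window size, and the row is selected symbolically
# with idx_stop - idx_start - minsize.
import numpy
import theano
import theano.tensor as T

minsize, maxsize, hsize = 2, 5, 4
dsize = maxsize - minsize + 1
e_bias = theano.shared(
    numpy.zeros((dsize, hsize), dtype=theano.config.floatX), name='e_bias')

idx_start = T.lscalar('idx_start')
idx_stop = T.lscalar('idx_stop')
bias_row = e_bias[idx_stop - idx_start - minsize, :]

pick_bias = theano.function([idx_start, idx_stop], bias_row)
print(pick_bias(1, 4))   # window of size 3 selects bias row index 1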
Example #6
def run(jobman,debug = False):
    expstart = time.time()
    hp = jobman.state

    if not os.path.exists('files/'): os.mkdir('files/')

    # Symbolic variables
    s_posit = T.matrix()
    s_negat = T.matrix()
    idx_start = T.lscalar()
    idx_stop = T.lscalar()
    s_valid = theano.sparse.csr_matrix()



    w2i = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/merged_word2idx.pkl'))
    i2w = dict( (v,k) for k,v in w2i.iteritems() )
    i2w[0] = 'UNK'
    senna = [ i2w[i] for i in range(len(i2w.keys())) ]


    nsenna = len(senna)
    
    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act = identity)
    H = ae(i_size = hp['embedsize']*hp['wsize'], h_size=hp['hsize'], e_act = T.tanh)
    L = logistic(i_size = hp['hsize'], h_size = 1, act = identity)

    del H.params['d_bias']
    del embedding.params['d_bias']
    del embedding.params['e_bias']
    minsize = hp['minsize']
    maxsize = hp['maxsize']

    dsize = maxsize - minsize +1

    H.params['e_bias'] = theano.shared( numpy.array(numpy.zeros((dsize,hp['hsize'])),dtype=theano.config.floatX),name='e_bias')

    path = hp['loadpath']
 
    if path:
        load(embedding,path+'/embedding.pkl')
        #load(H,path+'/hidden.pkl')
        #load(L,path+'/logistic.pkl')
        hp['embedsize'] = embedding.params['e_weights'].get_value(borrow=True).shape[1]
        #hp['hsize'] = H.params['e_weights'].get_value(borrow=True).shape[1]
        jobman.save()

    H.params['e_bias'] = theano.shared( numpy.array(numpy.zeros((dsize,hp['hsize'])),dtype=theano.config.floatX),name='e_bias')
    valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act = identity)
    valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))


    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape((1,hp['embedsize']*hp['wsize']))
    negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape((hp['nneg'],hp['embedsize']*hp['wsize']))
    valid_embed = sp.dot(s_valid,valid_embedding.params['weights']).reshape((nsenna,hp['embedsize']*hp['wsize']))

    posit_embed_left = T.concatenate([posit_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']],
                                  T.zeros_like(posit_embed[:,idx_stop*hp['embedsize']:]) ],axis=1)

    negat_embed_left = T.concatenate([negat_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']],
                                   T.zeros_like(negat_embed[:,idx_stop*hp['embedsize']:]) ],axis=1)

    posit_embed_right = T.concatenate([ T.zeros_like(posit_embed[:,:idx_start*hp['embedsize']]),
                                  posit_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']]],axis=1)

    negat_embed_right = T.concatenate([ T.zeros_like(negat_embed[:,:idx_start*hp['embedsize']]),
                                   negat_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']]],axis=1)



    posit_embed = T.concatenate([ T.zeros_like(posit_embed[:,:idx_start*hp['embedsize']]),
                                  posit_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']],
                                  T.zeros_like(posit_embed[:,idx_stop*hp['embedsize']:]) ],axis=1)

    negat_embed = T.concatenate([ T.zeros_like(negat_embed[:,:idx_start*hp['embedsize']]),
                                   negat_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']],
                                   T.zeros_like(negat_embed[:,idx_stop*hp['embedsize']:]) ],axis=1)

    
    #posit_embed = ifelse(T.eq(idx_start, 0), posit_embed_left, posit_embed)
    #posit_embed = ifelse(T.eq(idx_stop, hp['maxsize']), posit_embed_right, posit_embed)

    #negat_embed = ifelse(T.eq(idx_start, 0), negat_embed_left, negat_embed)
    #negat_embed = ifelse(T.eq(idx_stop, hp['maxsize']), negat_embed_right, negat_embed)

    Hposit = T.tanh(T.dot(posit_embed,H.params['e_weights']) + H.params['e_bias'][idx_stop-idx_start-minsize,:])
    Hnegat = T.tanh(T.dot(negat_embed,H.params['e_weights']) + H.params['e_bias'][idx_stop-idx_start-minsize,:])
    posit_score = L.encode(Hposit)
    negat_score = L.encode(Hnegat)
    valid_score = L.encode(H.encode(valid_embed))

    C = (negat_score - posit_score.flatten() + hp['margin'])

    CC = (rect(C)).mean()

    opt = theano.function([s_posit, s_negat, idx_start, idx_stop],
                          (rect(C)).mean(),
                          updates = dict( L.update(CC,lr) + H.update(CC,lr) + embedding.update_norm(CC,lr)) )

    validfct = theano.function([s_valid],valid_score)

    def saveexp():
        save(embedding,fname+'embedding.pkl')
        save(H,fname+'hidden.pkl')
        save(L,fname+'logistic.pkl')

    delta = hp['wsize']/2
    rest = hp['wsize']%2

    freq_idx = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/sorted_vocab.pkl'))[:2000]
    fname = ''
    validsentence = []# cPickle.load(open('/scratch/rifaisal/data/wiki_april_2010/valid_debug.pkl'))
    tseenwords = not debug
    for e in range(hp['epoch']):
        hp['split'] = numpy.random.randint(45)
        sentences = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/ints_50000/split'+str(hp['split'])+'.pkl'))
        nsent = len(sentences)
        bigc = []
        bigr = []

        seen_words = 0
        for i,s in enumerate(sentences):
            nword = len(s)
            seen_words += nword
            tseenwords += nword

            if nword < hp['maxsize'] + 2:
                continue
            rndsize = numpy.random.randint(low=hp['minsize']+1,high=hp['maxsize']-1)
            idxsta = numpy.random.randint(low=1, high=hp['maxsize']-rndsize)
            idxsto = idxsta+rndsize

            print 'r',rndsize,'b',idxsta,'e',idxsto,'shape',H.params['e_bias'].get_value().shape

            c =[]
            r =[]
            if debug:
                print ' *** Processing document',i,'with',nword,
                sys.stdout.flush()
            for j in range(delta,nword-delta):
                nd = rndsize/2
                rd = rndsize%2
                pchunk = s[j-delta:j+delta+rest]
                nchunk = []
                
                rndidx = numpy.random.randint(nsenna, size = (hp['nneg'],))
                nchunk = []
                for kk in range(hp['nneg']):
                    tmpchunk = copy.copy(pchunk)
                    tmpchunk[idxsta+nd] = rndidx[kk]
                    nchunk += tmpchunk
                assert len(nchunk) == len(pchunk)*hp['nneg']
                p, n  = (idx2mat(pchunk,nsenna), idx2mat(nchunk,nsenna))
                l = opt(p,n, idxsta, idxsto)
                c.append(l)

                if debug:
                    print '.',
                    break


            if debug:
                print ''

            bigc += [numpy.array(c).sum()]

            if 0:#(time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (tseenwords)>(10*hp['freq']):
                tseenwords = 0
                valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))
                mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize'])
                hp['mrk'] = mrk
                jobman.save()
                saveexp()
                print 'Random Valid Mean rank',mrk


            if seen_words > hp['freq'] or debug:
                seen_words = 0
                hp['score'] = numpy.array(bigc).mean() 
                hp['e'] = e
                hp['i'] = i
                print ''
                print e,i,'NN Score:', hp['score']

                if not debug:
                    ne = knn(freq_idx,embedding.params['e_weights'].get_value(borrow=True))
                    open('files/'+fname+'nearest.txt','w').write(display(ne,senna))
                    saveexp()
                sys.stdout.flush()
                jobman.save()
                
    saveexp()
Example #7
    test_model_mat = ohe.transform(test_data["RESOURCE"].reshape((test_data.shape[0], 1)))

    train_model_mat = train_model_mat.astype(theano.config.floatX)
    test_model_mat = test_model_mat.astype(theano.config.floatX)

    n_train = 22000
    n_valid = 5000
    n_test = 5769

    train_i = np.zeros(n_train)
    valid_i = np.zeros(n_valid) + 1
    test_i = np.zeros(n_test) + 2

    perm = np.random.permutation(np.hstack([train_i, valid_i, test_i]))

    train_set_x = sparse.shared(train_model_mat[np.where(perm == 0)[0]])
    train_set_y = shared(train_data.ACTION[perm == 0].astype("int32"))
    valid_set_x = sparse.shared(train_model_mat[np.where(perm == 1)[0]])
    valid_set_y = shared(train_data.ACTION[perm == 1].astype("int32"))
    test_set_x = sparse.shared(train_model_mat[np.where(perm == 2)[0]])
    test_set_y = shared(train_data.ACTION[perm == 2].astype("int32"))

    datasets = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), (test_set_x, test_set_y)]

    rbm = test_rbm(datasets)

    # dbn = train_dbn(datasets, batch_size = 10, pretraining_epochs = 100, training_epochs = 1000)

    # pred_set_x = sparse.shared(test_model_mat)

    # pred_proba, _ = dbn.build_prediction_functions(pred_set_x, batch_size = 100)
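
# Hypothetical stand-alone sketch of the permutation split used above: tag
# each row with 0 (train), 1 (valid) or 2 (test), shuffle the tags, then
# select rows by tag. Shapes here are toy values, not the real dataset.
import numpy as np

n_train, n_valid, n_test = 6, 2, 2
perm = np.random.permutation(
    np.hstack([np.zeros(n_train), np.zeros(n_valid) + 1, np.zeros(n_test) + 2]))

X = np.arange(10 * 3).reshape(10, 3)           # 10 rows of fake features
train_rows = X[np.where(perm == 0)[0]]
valid_rows = X[np.where(perm == 1)[0]]
test_rows = X[np.where(perm == 2)[0]]
print(train_rows.shape, valid_rows.shape, test_rows.shape)  # (6, 3) (2, 3) (2, 3)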
Example #8
def run(jobman,debug = False):
    expstart = time.time()
    hp = jobman.state

    if not os.path.exists('files/'): os.mkdir('files/')

    # Symbolic variables
    s_bow = T.matrix()
    s_idx = T.iscalar()
    s_tf = T.scalar()
    s_posit = T.matrix()#theano.sparse.csr_matrix()
    s_negat = T.matrix()#theano.sparse.csr_matrix()

    sentences = cPickle.load(open('/scratch/rifaisal/data/guten/guten_subset_idx.pkl'))

    senna = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl'))
    gsubset = cPickle.load(open('/scratch/rifaisal/data/guten/guten_vocab_subset.pkl')).flatten().tolist()
    hashtab = dict( zip( gsubset, range( len( gsubset))))    

    tfidf_data = numpy.load('/scratch/rifaisal/data/guten/guten_tfidf.npy').item().tocsr().astype('float32')

    #tfidf = cPickle.load(open('/scratch/rifaisal/repos/senna/gutentokenizer.pkl'))

    senna = numpy.array(senna)[gsubset].tolist()
    s_valid = theano.sparse.csr_matrix()

    validsentence = sentences[10000:10010]


    nsent = len(sentences)
    nsenna = len(senna)

    # Layers
    
    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act = identity)

    H = ae(i_size = hp['embedsize']*hp['wsize'], h_size=hp['hsize'], e_act = T.tanh)
    L = logistic(i_size = hp['hsize'], h_size = 1, act = identity)
    S = logistic(i_size = hp['embedsize'], h_size = nsenna, act= T.nnet.softmax)


    valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act = identity)
    valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))
    valid_embedding.params['bias'] = embedding.params['e_bias']

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape((1,hp['embedsize']*hp['wsize']))
    negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape((hp['nneg'],hp['embedsize']*hp['wsize']))
    valid_embed = sp.dot(s_valid,valid_embedding.params['weights']).reshape((nsenna,hp['embedsize']*hp['wsize']))

    posit_score = L.encode(H.encode(posit_embed))
    negat_score = L.encode(H.encode(negat_embed))
    valid_score = L.encode(H.encode(valid_embed))

    C = (negat_score - posit_score.flatten() + hp['margin'])

    s_bow_pred = S.encode(embedding.encode(s_bow))


    pred = s_tf * nllsoft(s_bow_pred,s_idx)
    
    CC = (rect(C)).mean() + hp['lambda'] * pred

    opt = theano.function([s_posit, s_negat, s_bow, s_idx, s_tf], 
                          [(rect(C)).mean(),pred], 
                          updates = dict( S.update(CC,lr) + L.update(CC,lr) + H.update(CC,lr) + embedding.update_norm(CC,lr)) )

    #validfct = theano.function([s_valid],valid_score)

    def saveexp():
        save(embedding,fname+'embedding.pkl')
        save(H,fname+'hidden.pkl')
        save(L,fname+'logistic.pkl')

    delta = hp['wsize']/2
    rest = hp['wsize']%2

    freq_idx = cPickle.load(open('/scratch/rifaisal/data/guten/gutten_sorted_vocab.pkl'))[:1000]
    freq_idx =  [ hashtab[idx] for idx in freq_idx ]

    fname = ''
    
    for e in range(hp['epoch']):
        c = []
        r = []
        count = 1
        for i in range(nsent):
            rsent = numpy.random.randint(nsent-1)
            nword = len(sentences[rsent])
            if nword < hp['wsize'] + 2:
                continue

            pidx = numpy.random.randint(low = delta, high = nword-delta)
            pchunk = sentences[rsent][pidx-delta:pidx+delta+rest]
            nchunk = []
            st = sentences[rsent][pidx-delta:pidx]
            en = sentences[rsent][pidx+1:pidx+delta+rest]
            rndidx = numpy.random.randint(nsenna, size = (hp['nneg'],))
            nchunk = []
            for j in range(hp['nneg']):
                nchunk += en + [rndidx[j]] + st


            assert len(nchunk) == len(pchunk)*hp['nneg']
            tfidf_chunk = tfidf_data[rsent:rsent+1].toarray()
            #pdb.set_trace()
            tfidf_value = tfidf_chunk[0,sentences[rsent][pidx]]
            tfidf_chunk[0,sentences[rsent][pidx]] = 0.
            tfidx = sentences[rsent][pidx] # numpy.zeros(tfidf_chunk.shape).astype('float32')
            #tfidx[0,sentences[rsent][pidx]] = 1.
            p, n, b, iidx, tfval = (idx2mat(pchunk,nsenna), idx2mat(nchunk,nsenna), tfidf_chunk, tfidx, tfidf_value )
            count += tfval!=0
            l,g = opt(p,n,b, iidx, tfval)
            c.append(l)
            r.append(g)

            """
            if (time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (i+1)%(20*hp['freq']) == 0 and debug==False:
                valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))
                mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize'])
                hp['mrk'] = mrk
                jobman.save()
                saveexp()
                print 'Random Valid Mean rank',mrk
            """

            if (i+1)%hp['freq'] == 0 or debug:
                hp['score'] = numpy.array(c).sum() / (numpy.array(c)>0).sum()
                hp['pred'] = numpy.array(r).sum()/float(count)
                hp['e'] = e
                hp['i'] = i
                print ''
                print e,i,'NN Score:', hp['score'], 'Reconstruction:', hp['pred']

                if not debug:
                    ne = knn(freq_idx,embedding.params['e_weights'].get_value(borrow=True))
                    open('files/'+fname+'nearest.txt','w').write(display(ne,senna))
                    saveexp()
                sys.stdout.flush()
                jobman.save()
                
    saveexp()
Example #9
def run(jobman,debug = False):
    expstart = time.time()
    hp = jobman.state

    if not os.path.exists('files/'): os.mkdir('files/')

    # Symbolic variables
    s_posit = T.matrix()
    s_negat = T.matrix()
    s_valid = theano.sparse.csr_matrix()

    #vocab = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl'))
    #senna = cPickle.load(open('/scratch/rifaisal/data/wiki_april_2010/WestburyLab.wikicorp.201004_vocab30k.pkl'))
    w2i = cPickle.load(open('/scratch/rifaisal/data/gutenberg_aistats/merged_word2idx.pkl'))
    i2w = dict( (v,k) for k,v in w2i.iteritems() )
    i2w[0] = 'UNK'
    senna = [ i2w[i] for i in range(len(i2w.keys())) ]

    nsenna = len(senna)
    
    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act = identity)
    H = ae(i_size = hp['embedsize']*hp['wsize'], h_size=hp['hsize'], e_act = T.tanh)
    L = logistic(i_size = hp['hsize'], h_size = 1, act = identity)
 
    path = hp['loadpath']
 
    if path:
        load(embedding,path+'/embedding.pkl')
        load(H,path+'/hidden.pkl')
        load(L,path+'/logistic.pkl')
        hp['embedsize'] = embedding.params['e_weights'].get_value(borrow=True).shape[1]
        hp['hsize'] = H.params['e_weights'].get_value(borrow=True).shape[1]
        jobman.save()

    valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act = identity)
    valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))
    valid_embedding.params['bias'] = embedding.params['e_bias']

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape((1,hp['embedsize']*hp['wsize']))
    negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape((hp['nneg'],hp['embedsize']*hp['wsize']))
    valid_embed = sp.dot(s_valid,valid_embedding.params['weights']).reshape((nsenna,hp['embedsize']*hp['wsize']))

    posit_score = L.encode(H.encode(posit_embed))
    negat_score = L.encode(H.encode(negat_embed))
    valid_score = L.encode(H.encode(valid_embed))

    C = (negat_score - posit_score.flatten() + hp['margin'])

    CC = (rect(C)).mean()

    opt = theano.function([s_posit, s_negat],
                          (rect(C)).mean(),
                          updates = dict( L.update(CC,lr) + H.update(CC,lr) + embedding.update_norm(CC,lr)) )

    #validfct = theano.function([s_valid],valid_score)

    def saveexp():
        save(embedding,fname+'embedding.pkl')
        save(H,fname+'hidden.pkl')
        save(L,fname+'logistic.pkl')


    delta = hp['wsize']/2
    rest = hp['wsize']%2
    #freq_idx = range(29000,30000)
    freq_idx = cPickle.load(open('/scratch/rifaisal/data/gutenberg_aistats/sorted_vocab.pkl'))[:2000]
    fname = ''
    #validsentence = cPickle.load(open('/scratch/rifaisal/data/gutenberg_aistats/valid.pkl'))
    tseenwords = not debug
    for e in range(hp['epoch']):
        hp['split'] = numpy.random.randint(45)
        sentences = cPickle.load(open('/scratch/rifaisal/data/gutenberg_aistats/split'+str(hp['split'])+'.pkl'))
        nsent = len(sentences)
        bigc = []
        bigr = []

        seen_words = 0
        for i,s in enumerate(sentences):
            nword = len(s)
            seen_words += nword
            tseenwords += nword

            if nword < hp['wsize'] + 2:
                continue
            c =[]
            r =[]
            if debug:
                print ' *** Processing document',i,'with',nword,
                sys.stdout.flush()
            for j in range(delta,nword-delta):
                pchunk = s[j-delta:j+delta+rest]
                nchunk = []
                st = s[j-delta:j]
                en = s[j+1:j+delta+rest]
                rndidx = numpy.random.randint(nsenna, size = (hp['nneg'],))
                nchunk = []
                for kk in range(hp['nneg']):
                    nchunk += st + [rndidx[kk]] + en

                assert len(nchunk) == len(pchunk)*hp['nneg']
                p, n  = (idx2mat(pchunk,nsenna), idx2mat(nchunk,nsenna))
                l = opt(p,n)
                c.append(l)

                if debug:
                    print '.',
                    break


            if debug:
                print ''

            bigc += [numpy.array(c).sum()]

            if 0:#(time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (tseenwords)>(10*hp['freq']):
                tseenwords = 0
                valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))
                mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize'])
                hp['mrk'] = mrk
                jobman.save()
                saveexp()
                print 'Random Valid Mean rank',mrk


            if seen_words > hp['freq'] or debug:
                seen_words = 0
                hp['score'] = numpy.array(bigc).mean() 
                hp['e'] = e
                hp['i'] = i
                print ''
                print e,i,'NN Score:', hp['score']

                if not debug:
                    ne = knn(freq_idx,embedding.params['e_weights'].get_value(borrow=True))
                    open('files/'+fname+'nearest.txt','w').write(display(ne,senna))
                    saveexp()
                sys.stdout.flush()
                jobman.save()
                
    saveexp()