Example #1
def train(dim_word=100,  # word vector dimensionality
          ctx_dim=512,  # context vector dimensionality
          dim=1000,  # the number of LSTM units
          attn_type='stochastic',  # [see section 4 from paper]
          n_layers_att=1,  # number of layers used to compute the attention weights
          n_layers_out=1,  # number of layers used to compute logit
          n_layers_lstm=1,  # number of lstm layers
          n_layers_init=1,  # number of layers to initialize LSTM at time 0
          lstm_encoder=False,  # if True, run bidirectional LSTM on input units
          prev2out=False,  # Feed previous word into logit
          ctx2out=False,  # Feed attention weighted ctx into logit
          alpha_entropy_c=0.002,  # hard attn param
          RL_sumCost=True,  # hard attn param
          semi_sampling_p=0.5,  # hard attn param
          temperature=1.,  # hard attn param
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,  # weight decay coeff
          alpha_c=0.,  # doubly stochastic coeff
          lrate=0.01,  # used only for SGD
          selector=False,  # selector (see paper)
          n_words=10000,  # vocab size
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size = 16,
          valid_batch_size = 16,
          saveto='model.npz',  # relative path of saved model file
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          data_path='./data',  # path to find data
          dataset='flickr8k',
          dictionary=None,  # word dictionary
          use_dropout=False,  # setting this true turns on dropout at various points
          use_dropout_lstm=False,  # dropout on lstm gates
          reload_=False,
          save_per_epoch=False): # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print "Reloading options"
        with open('%s.pkl'%saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print  model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(dataset)
    train, valid, test, worddict = load_data(path=data_path)
    if dataset == 'coco':
        valid, _ = valid # the second one contains all the validation data

    # index 0 and 1 always code for the end of sentence and unknown token
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)
    if reload_ and os.path.exists(saveto):
        print "Reloading model"
        params = load_params(saveto, params)

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opt_outs - optional outputs (e.g. selector)
    trng, use_noise, \
          inps, alphas, alphas_sample,\
          cost, \
          opt_outs = \
          build_model(tparams, model_options)


    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over
    # words and also the new "initial state/memory" (see the corresponding equations in the paper)
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps, -cost, profile=False,
                                        updates=opt_outs['attn_updates']
                                        if model_options['attn_type']=='stochastic'
                                        else None, allow_input_downcast=True)

    # Define the cost function + Regularization
    cost = cost.mean()
    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean()
        cost += alpha_reg

    hard_attn_updates = []
    # Backprop!
    if model_options['attn_type'] == 'deterministic':
        grads = tensor.grad(cost, wrt=itemlist(tparams))
    else:
        # shared variables for hard attention
        baseline_time = theano.shared(numpy.float32(0.), name='baseline_time')
        opt_outs['baseline_time'] = baseline_time
        alpha_entropy_c = theano.shared(numpy.float32(alpha_entropy_c), name='alpha_entropy_c')
        alpha_entropy_reg = alpha_entropy_c * (alphas*tensor.log(alphas)).mean()
        # [see Section 4.1: Stochastic "Hard" Attention for derivation of this learning rule]
        if model_options['RL_sumCost']:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                                disconnected_inputs='raise',
                                known_grads={alphas:(baseline_time-opt_outs['masked_cost'].mean(0))[None,:,None]/10.*
                                            (-alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        else:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                            disconnected_inputs='raise',
                            known_grads={alphas:opt_outs['masked_cost'][:,:,None]/10.*
                            (alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        # [equation on bottom left of page 5]
        hard_attn_updates += [(baseline_time, baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())]
        # updates from scan
        hard_attn_updates += opt_outs['attn_updates']
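    # The known_grads override above acts as a REINFORCE-style estimator for the
    # sampled attention locations: roughly (reward - baseline) * dlog p(s|alpha),
    # with the moving average baseline_time as a variance-reducing baseline, an
    # entropy term weighted by alpha_entropy_c, and the /10. factor as a fixed
    # scaling on the reward term.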

    # to get the cost after regularization or the gradients, use this
    # f_cost = theano.function([x, mask, ctx], cost, profile=False)
    # f_grad = theano.function([x, mask, ctx], grads, profile=False)

    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, hard_attn_updates)
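    # eval(optimizer) resolves the option string (e.g. 'rmsprop') to an optimizer
    # function defined elsewhere in the codebase; whatever it resolves to is
    # expected to return the pair (f_grad_shared, f_update) used below.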

    print 'Optimization'

    # [See note in section 4.3 of paper]
    train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen)

    if valid:
        kf_valid = KFold(len(valid[0]), n_folds=len(valid[0])/valid_batch_size, shuffle=False)
    if test:
        kf_test = KFold(len(test[0]), n_folds=len(test[0])/valid_batch_size, shuffle=False)

    # history_errs is a bare-bones training log that holds the validation and test error
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = numpy.load(saveto)['history_errs'].tolist()
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)

            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx = prepare_data(caps,
                                        train[1],
                                        worddict,
                                        maxlen=maxlen,
                                        n_words=n_words)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero samples under length ', maxlen
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx)
            f_update(lrate)
            ud_duration = time.time() - ud_start # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN or Inf detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration

            # Checkpoint
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl'%saveto, 'wb'))
                print 'Done'

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, sampleFreq) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj], model_options,
                                               trng=trng, k=5, maxlen=30, stochastic=False)
                    # Decode the sample from encoding back to words
                    print 'Truth ',jj,': ',
                    for vv in x_s[:,jj]:
                        if vv == 0:
                            break
                        if vv in word_idict:
                            print word_idict[vv],
                        else:
                            print 'UNK',
                    print
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk,') ', jj, ': ',
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in word_idict:
                                print word_idict[vv],
                            else:
                                print 'UNK',
                    print

            # Log validation loss + checkpoint the model with the best validation log likelihood
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                train_err = 0
                valid_err = 0
                test_err = 0

                if valid:
                    valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid).mean()
                if test:
                    test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test).mean()

                history_errs.append([valid_err, test_err])

                # the model with the best validation log likelihood is saved separately with a different name
                if uidx == 0 or valid_err <= numpy.array(history_errs)[:,0].min():
                    best_p = unzip(tparams)
                    print 'Saving model with best validation ll'
                    params = copy.copy(best_p)
                    params = unzip(tparams)
                    numpy.savez(saveto+'_bestll', history_errs=history_errs, **params)
                    bad_counter = 0

                # abort training if perplexity has been increasing for too long
                if eidx > patience and len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience,0].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

        print 'Seen %d samples' % n_samples

        if estop:
            break

        if save_per_epoch:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1), history_errs=history_errs, **unzip(tparams))

    # use the best nll parameters for final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    train_err = 0
    valid_err = 0
    test_err = 0
    if valid:
        valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid)
    if test:
        test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
                valid_err=valid_err, test_err=test_err, history_errs=history_errs,
                **params)

    return train_err, valid_err, test_err
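

# A minimal driver sketch for the train() entry point above. The argument
# values here are illustrative assumptions, not taken from the source; any
# keyword accepted by train() could be overridden in the same way.
if __name__ == '__main__':
    train_err, valid_err, test_err = train(attn_type='stochastic',
                                           dataset='flickr8k',
                                           data_path='./data',
                                           batch_size=16,
                                           max_epochs=10,
                                           saveto='my_model.npz')
    print 'Final NLL  train:', train_err, ' valid:', valid_err, ' test:', test_err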
Example #2
def main(model,
         saveto,
         k=5,
         normalize=False,
         zero_pad=False,
         n_process=5,
         datasets='dev,test',
         sampling=False,
         pkl_name=None):
    # load the saved model options
    if pkl_name is None:
        pkl_name = model
    with open('%s.pkl' % pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    _, valid, test, worddict = load_data(
        path='./data',
        load_train=False,
        load_dev=True if 'dev' in datasets else False,
        load_test=True if 'test' in datasets else False)

    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # create processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(target=gen_model,
                                  args=(queue, rqueue, midx, model, options, k,
                                        normalize, word_idict, sampling))
        processes[midx].start()

    # index -> words
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict[w])
            capsw.append(' '.join(ww))
        return capsw

    # unsparsify, reshape, and queue
    def _send_jobs(contexts):
        for idx, ctx in enumerate(contexts):
            cc = ctx.todense().reshape([14 * 14, 512])
            if zero_pad:
                cc0 = numpy.zeros(
                    (cc.shape[0] + 1, cc.shape[1])).astype('float32')
                cc0[:-1, :] = cc
            else:
                cc0 = cc
            queue.put((idx, cc0))

    # retrieve caption from process
    def _retrieve_jobs(n_samples):
        caps = [None] * n_samples
        for idx in xrange(n_samples):
            resp = rqueue.get()
            caps[resp[0]] = resp[1]
            if numpy.mod(idx, 10) == 0:
                print 'Sample ', (idx + 1), '/', n_samples, ' Done'
        return caps

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'dev':
            print 'Development Set...',
            _send_jobs(valid[1])
            caps = _seqs2words(_retrieve_jobs(valid[1].shape[0]))
            # import pdb; pdb.set_trace()  # stray debugging breakpoint, disabled
            with open(saveto + '.dev.txt', 'w') as f:
                print >> f, '\n'.join(caps)
            print 'Done'
        if dd == 'test':
            print 'Test Set...',
            _send_jobs(test[1])
            caps = _seqs2words(_retrieve_jobs(test[1].shape[0]))
            with open(saveto + '.test.txt', 'w') as f:
                print >> f, '\n'.join(caps)
            print 'Done'
    # end processes
    for midx in xrange(n_process):
        queue.put(None)
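

# The worker target gen_model is defined elsewhere in the repository; a rough,
# hypothetical sketch of the queue protocol it is expected to follow (pull
# (idx, ctx) jobs until a None sentinel arrives, push (idx, word_ids) back so
# the parent can reorder results). beam_search_caption is a placeholder name,
# not a real function in the codebase:
def _worker_sketch(queue, rqueue, midx, model, options, k, normalize, word_idict, sampling):
    while True:
        job = queue.get()
        if job is None:          # sentinel sent by the parent process
            break
        idx, ctx = job
        word_ids = beam_search_caption(ctx, model, options, k, normalize)  # placeholder
        rqueue.put((idx, word_ids))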
Example #3
def main(model,
         saveto,
         k=5,
         normalize=False,
         zero_pad=False,
         datasets='dev,test',
         sampling=False,
         pkl_name=None):
    # load the saved model options
    if pkl_name is None:
        pkl_name = model
    with open('%s.pkl' % pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    train, valid, test, worddict = load_data(
        path='./data/coco/',
        load_train=True if 'train' in datasets else False,
        load_dev=True if 'dev' in datasets else False,
        load_test=True if 'test' in datasets else False)

    # import pdb; pdb.set_trace()
    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # index -> words
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict[w])
            capsw.append(' '.join(ww))
        return capsw

    # generate a caption for every example in the given split
    def _process_examples(contexts):
        caps = [None] * contexts.shape[0]
        for idx, ctx in enumerate(contexts):
            cc = ctx.todense().reshape([14 * 14, 512])
            if zero_pad:
                cc0 = numpy.zeros(
                    (cc.shape[0] + 1, cc.shape[1])).astype('float32')
                cc0[:-1, :] = cc
            else:
                cc0 = cc
            resp = gen_model(idx, cc0, model, options, k, normalize,
                             word_idict, sampling)
            caps[resp[0]] = resp[1]
            print 'Sample ', (idx + 1), '/', contexts.shape[0], ' Done'
            print resp[1]
        return caps

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'train':
            print 'Training Set...',
            caps = _seqs2words(_process_examples(train[1]))
            # import pdb; pdb.set_trace()
            with open(saveto + '.train.txt', 'w') as f:
                print >> f, '\n'.join(caps)
            print 'Done'
        if dd == 'dev':
            print 'Development Set...',
            caps = _seqs2words(_process_examples(valid[1]))
            # import pdb; pdb.set_trace()
            with open(saveto + '.dev.txt', 'w') as f:
                print >> f, '\n'.join(caps)
            print 'Done'
        if dd == 'test':
            print 'Test Set...',
            caps = _seqs2words(_process_examples(test[1]))
            with open(saveto + '.test.txt', 'w') as f:
                print >> f, '\n'.join(caps)
            print 'Done'
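

# For reference, a standalone illustration of the reshape / zero_pad step used
# in _process_examples above: each 14x14x512 convolutional feature map becomes
# a (196, 512) annotation matrix, and zero_pad appends one extra all-zero row.
# The random array below is a stand-in for a real (sparse) feature row.
import numpy
feat = numpy.random.rand(14 * 14, 512).astype('float32')
cc0 = numpy.zeros((feat.shape[0] + 1, feat.shape[1]), dtype='float32')
cc0[:-1, :] = feat
print cc0.shape   # (197, 512)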
Example #4
def main(pkl_names,
         models,
         split,
         k=4,
         normalize=False,
         debug=False,
         changes=None):
    # load the saved model options
    f_init, f_next = [], []
    for pkl_name, model in zip(pkl_names, models):
        options = read_pkl(pkl_name)
        if changes is not None:
            for change in changes:
                options[change.split('=')[0]] = change.split('=')[1]
        # initialize the two functions
        f1, f2 = gen_model(model, options)
        f_init.append(f1)
        f_next.append(f2)

    # fetch data, skip ones we aren't using to save time
    load_data, _ = get_dataset(options['dataset'])
    kwargs = {
        'path': osp.join(options['prefix'], options['dataset']),
        'load_%s' % split: True,
        'options': options
    }
    eval_data, worddict = load_data(**kwargs)
    imgid = collapse([elem[-1] for elem in eval_data[0]])
    word_idict = {vv: kk for kk, vv in worddict.iteritems()}

    # write results to json format
    caps = process_examples(f_init,
                            f_next,
                            imgid,
                            eval_data[1],
                            eval_data[2],
                            word_idict,
                            options,
                            k,
                            normalize,
                            debug=debug)

    # create the output folder if it does not exist
    if len(pkl_names) > 1:
        folder = osp.join('../output',
                          '%s_ensemble_%s' % (options['dataset'], split))
    else:
        folder = osp.join('../output',
                          '%s_%s' % (osp.splitext(pkl_names[0])[0], split))

    # if the folder already exists and this is the test split, create a numbered mirror folder
    if not osp.exists(folder):
        os.mkdir(folder)
    elif osp.exists(folder) and split == 'test':
        for i in range(2, 5):
            if not osp.exists('%s.%d' % (folder, i)):
                folder = '%s.%d' % (folder, i)
                os.mkdir(folder)
                break

    # write json to the file
    with open(osp.join(folder, 'captions_val2014_results.json'), 'w') as f:
        json.dump(caps, f)

    if split in ('val', 'test'):
        # Evaluate using the official api
        coco_caption_folder = osp.join('../', 'coco-caption')
        assert osp.exists(coco_caption_folder)
        sys.path.append(coco_caption_folder)
        from cocoEvaluation import cocoEvaluation
        evaluator = cocoEvaluation(options['dataset'])
        evaluator.evaluate(folder)
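

# A small illustration of the 'key=value' override mechanism used for the
# changes argument above; note that overridden values remain strings, exactly
# as in the loop over changes (the dict below is a made-up stand-in for the
# pickled options):
example_options = {'dataset': 'coco', 'batch_size': 16}
for change in ['batch_size=8', 'dataset=flickr30k']:
    example_options[change.split('=')[0]] = change.split('=')[1]
# example_options['batch_size'] is now the string '8'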
Example #5
def main(model, saveto, k=1, normalize=False, zero_pad=False, datasets='dev,test', data_path='./', sampling=False, pkl_name=None):
    # load the saved model options
    if pkl_name is None:
        pkl_name = model
    with open('%s.pkl'% pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    _, valid, test, worddict = load_data(load_train=False, load_dev=True if 'dev' in datasets else False,
                                             load_test=True if 'test' in datasets else False, path=data_path)

    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # build sampler
    trng = RandomStreams(1234)
    # this is zero to indicate we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng, sampling=sampling)

    # index -> words
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict[w])
        return ' '.join(ww)

    # unsparsify, reshape, and generate a caption sample
    def _send_job(context):
        cc = context.todense().reshape([14*14,512])
        if zero_pad:
            cc0 = numpy.zeros((cc.shape[0]+1, cc.shape[1])).astype('float32')
            cc0[:-1,:] = cc
        else:
            cc0 = cc
        return create_sample(tparams, f_init, f_next, cc0, options, trng, k, normalize)

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'dev':
            bar = Bar('Development Set...', max=len(valid[1]))
            caps = []
            for i in range(len(valid[1])):
                sample = _send_job(valid[1][i])
                cap = _seqs2words(sample)
                caps.append(cap)
                with open(saveto+'_status.json', 'w') as f:
                    json.dump({'current': i, 'total': len(valid[1])}, f)
                bar.next()
            bar.finish()
            with open(saveto, 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
        if dd == 'test':
            bar = Bar('Test Set...', max=len(test[1]))
            caps = []
            for i in range(len(test[1])):
                sample = _send_job(test[1][i])
                cap = _seqs2words(sample)
                caps.append(cap)
                with open(saveto+'_status.json', 'w') as f:
                    json.dump({'current': i, 'total': len(test[1])}, f)
                bar.next()
            bar.finish()
            with open(saveto, 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
Example #6
def main(model, saveto, k=5, normalize=False, zero_pad=False, n_process=5, datasets='dev,test', sampling=False, pkl_name=None):
    # load the saved model options
    if pkl_name is None:
        pkl_name = model
    with open('%s.pkl'% pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    _, valid, test, worddict = load_data(load_train=False, load_dev=True if 'dev' in datasets else False,
                                             load_test=True if 'test' in datasets else False)

    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'
    # ipdb.set_trace()  # stray debugging breakpoint, disabled

    # create processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(target=gen_model, 
                                  args=(queue,rqueue,midx,model,options,k,normalize,word_idict, sampling))
        processes[midx].start()

    # index -> words
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict[w])
            capsw.append(' '.join(ww))
        return capsw

    # unsparsify, reshape, and queue
    def _send_jobs(contexts):
        for idx, ctx in enumerate(contexts):
            cc = ctx.todense().reshape([14*14,512])
            if zero_pad:
                cc0 = numpy.zeros((cc.shape[0]+1, cc.shape[1])).astype('float32')
                cc0[:-1,:] = cc
            else:
                cc0 = cc
            queue.put((idx, cc0))

    # retrieve caption from process
    def _retrieve_jobs(n_samples):
        caps = [None] * n_samples
        for idx in xrange(n_samples):
            resp = rqueue.get()
            caps[resp[0]] = resp[1]
            if numpy.mod(idx, 10) == 0:
                print 'Sample ', (idx+1), '/', n_samples, ' Done'
        return caps

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'dev':
            print 'Development Set...',
            _send_jobs(valid[1])
            vvv = valid[1].toarray()
            caps = _seqs2words(_retrieve_jobs(len(vvv)))
            print caps
            with open(saveto+'.dev.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
        if dd == 'test':
            print 'Test Set...',
            _send_jobs(test[1])
            vvv = test[1].toarray()
            caps = _seqs2words(_retrieve_jobs(len(vvv)))
            print caps
            with open(saveto+'.test.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
    # end processes
    for midx in xrange(n_process):
        queue.put(None)
Example #7
def train(dim_word=100,  # word vector dimensionality
          ctx_dim=512,  # context vector dimensionality
          dim=1000,  # the number of LSTM units
          attn_type='deterministic',  # [see section 4 from paper]
          n_layers_att=1,  # number of layers used to compute the attention weights
          n_layers_out=1,  # number of layers used to compute logit
          n_layers_lstm=1,  # number of lstm layers
          n_layers_init=1,  # number of layers to initialize LSTM at time 0
          lstm_encoder=False,  # if True, run bidirectional LSTM on input units
          prev2out=False,  # Feed previous word into logit
          ctx2out=False,  # Feed attention weighted ctx into logit
          alpha_entropy_c=0.002,  # hard attn param
          RL_sumCost=False,  # hard attn param
          semi_sampling_p=0.5,  # hard attn param
          temperature=1.,  # hard attn param
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,  # weight decay coeff
          alpha_c=0.,  # doubly stochastic coeff
          lrate=0.01,  # used only for SGD
          selector=False,  # selector (see paper)
          n_words=10000,  # vocab size
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size = 16,
          valid_batch_size = 2,  # changed from 16
          saveto='model.npz',  # relative path of saved model file
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=5,  # generate some samples after every sampleFreq updates
          data_path='./data',  # path to find data
          dataset='flickr30k',
          dictionary=None,  # word dictionary
          use_dropout=False,  # setting this true turns on dropout at various points
          use_dropout_lstm=False,  # dropout on lstm gates
          reload_=False,
          save_per_epoch=False): # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print "Reloading options"
        with open('%s.pkl'%saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print  model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(dataset)
    train, valid, test, worddict = load_data(path=data_path)

    # index 0 and 1 always code for the end of sentence and unknown token
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)
    if reload_ and os.path.exists(saveto):
        print "Reloading model"
        params = load_params(saveto, params)

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opt_outs - optional outputs (e.g. selector)
    trng, use_noise, \
          inps, alphas, alphas_sample,\
          cost, \
          opt_outs = \
          build_model(tparams, model_options)


    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over
    # words and also the new "initial state/memory" (see the corresponding equations in the paper)
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps, -cost, profile=False,
                                        updates=opt_outs['attn_updates']
                                        if model_options['attn_type']=='stochastic'
                                        else None, allow_input_downcast=True)

    # Define the cost function + Regularization
    cost = cost.mean()
    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean()
        cost += alpha_reg

    hard_attn_updates = []
    # Backprop!
    if model_options['attn_type'] == 'deterministic':
        grads = tensor.grad(cost, wrt=itemlist(tparams))
    else:
        # shared variables for hard attention
        baseline_time = theano.shared(numpy.float32(0.), name='baseline_time')
        opt_outs['baseline_time'] = baseline_time
        alpha_entropy_c = theano.shared(numpy.float32(alpha_entropy_c), name='alpha_entropy_c')
        alpha_entropy_reg = alpha_entropy_c * (alphas*tensor.log(alphas)).mean()
        # [see Section 4.1: Stochastic "Hard" Attention for derivation of this learning rule]
        if model_options['RL_sumCost']:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                                disconnected_inputs='raise',
                                known_grads={alphas:(baseline_time-opt_outs['masked_cost'].mean(0))[None,:,None]/10.*
                                            (-alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        else:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                            disconnected_inputs='raise',
                            known_grads={alphas:opt_outs['masked_cost'][:,:,None]/10.*
                            (alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        # [equation on bottom left of page 5]
        hard_attn_updates += [(baseline_time, baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())]
        # updates from scan
        hard_attn_updates += opt_outs['attn_updates']

    # to get the cost after regularization or the gradients, use this
    # f_cost = theano.function([x, mask, ctx], cost, profile=False)
    # f_grad = theano.function([x, mask, ctx], grads, profile=False)

    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, hard_attn_updates)

    print 'Optimization'

    # [See note in section 4.3 of paper]
    train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen)

    if valid:
        kf_valid = KFold(len(valid[0]), n_folds=len(valid[0])/valid_batch_size, shuffle=False)
    if test:
        kf_test = KFold(len(test[0]), n_folds=len(test[0])/valid_batch_size, shuffle=False)

    # history_errs is a bare-bones training log that holds the validation and test error
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = numpy.load(saveto)['history_errs'].tolist()
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)

            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx = prepare_data(caps,
                                        train[1],
                                        worddict,
                                        maxlen=maxlen,
                                        n_words=n_words)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero samples under length ', maxlen
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx)
            f_update(lrate)
            ud_duration = time.time() - ud_start # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN or Inf detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration

            # Checkpoint
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl'%saveto, 'wb'))
                print 'Done'

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, sampleFreq) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj], model_options,
                                               trng=trng, k=5, maxlen=30, stochastic=False)
                    # Decode the sample from encoding back to words
                    print 'Truth ',jj,': ',
                    for vv in x_s[:,jj]:
                        if vv == 0:
                            break
                        if vv in word_idict:
                            print word_idict[vv],
                        else:
                            print 'UNK',
                    print
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk,') ', jj, ': ',
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in word_idict:
                                print word_idict[vv],
                            else:
                                print 'UNK',
                    print

            # Log validation loss + checkpoint the model with the best validation log likelihood
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                train_err = 0
                valid_err = 0
                test_err = 0

                if valid:
                    valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid).mean()
                if test:
                    test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test).mean()

                history_errs.append([valid_err, test_err])

                # the model with the best validation log likelihood is saved separately with a different name
                if uidx == 0 or valid_err <= numpy.array(history_errs)[:,0].min():
                    best_p = unzip(tparams)
                    print 'Saving model with best validation ll'
                    params = copy.copy(best_p)
                    params = unzip(tparams)
                    numpy.savez(saveto+'_bestll', history_errs=history_errs, **params)
                    bad_counter = 0

                # abort training if perplexity has been increasing for too long
                if eidx > patience and len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience,0].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

        print 'Seen %d samples' % n_samples

        if estop:
            break

        if save_per_epoch:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1), history_errs=history_errs, **unzip(tparams))

    # use the best nll parameters for final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    train_err = 0
    valid_err = 0
    test_err = 0
    if valid:
        valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid)
    if test:
        test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
                valid_err=valid_err, test_err=test_err, history_errs=history_errs,
                **params)

    return train_err, valid_err, test_err
Example #8
def main(model, saveto, k=5, normalize=False, zero_pad=False, datasets='dev,test', sampling=False, pkl_name=None):
    # load the saved model options
    if pkl_name is None:
        pkl_name = model
    with open('%s.pkl'% pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    train, valid, test, worddict = load_data(path='./data/flickr8k/', load_train=True if 'train' in datasets else False,
                                             load_dev=True if 'dev' in datasets else False,
                                             load_test=True if 'test' in datasets else False)
    
    # import pdb; pdb.set_trace()  # stray debugging breakpoint, disabled
    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'


    # index -> words
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict[w])
            capsw.append(' '.join(ww))
        return capsw

    # generate a caption for every example in the given split
    def _process_examples(contexts):
        caps = [None] * contexts.shape[0]
        for idx, ctx in enumerate(contexts):
            cc = ctx.todense().reshape([14*14,512])
            if zero_pad:
                cc0 = numpy.zeros((cc.shape[0]+1, cc.shape[1])).astype('float32')
                cc0[:-1,:] = cc
            else:
                cc0 = cc
            resp = gen_model(idx, cc0, model, options, k, normalize, word_idict, sampling)
            caps[resp[0]] = resp[1]
            print 'Sample ', (idx+1), '/', contexts.shape[0], ' Done'
            print resp[1]
        return caps

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'train':
            print 'Training Set...',
            caps = _seqs2words(_process_examples(train[1]))
            # import pdb; pdb.set_trace()  # stray debugging breakpoint, disabled
            with open(saveto+'.train.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
        if dd == 'dev':
            print 'Development Set...',
            caps = _seqs2words(_process_examples(valid[1]))
            # import pdb; pdb.set_trace()  # stray debugging breakpoint, disabled
            with open(saveto+'.dev.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
        if dd == 'test':
            print 'Test Set...',
            caps = _seqs2words(_process_examples(test[1]))
            with open(saveto+'.test.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
Example #9
def main(model, saveto, k=5, normalize=False, zero_pad=False, datasets="dev,test", sampling=False, pkl_name=None):
    # load the saved model options
    if pkl_name is None:
        pkl_name = model
    with open("%s.pkl" % pkl_name, "rb") as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options["dataset"])
    train, valid, test, worddict = load_data(
        path="./data/coco/",
        load_train=True if "train" in datasets else False,
        load_dev=True if "dev" in datasets else False,
        load_test=True if "test" in datasets else False,
    )

    # import pdb; pdb.set_trace()
    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = "<eos>"
    word_idict[1] = "UNK"

    # index -> words
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict[w])
            capsw.append(" ".join(ww))
        return capsw

    # generate a caption for every example in the given split
    def _process_examples(contexts):
        caps = [None] * contexts.shape[0]
        for idx, ctx in enumerate(contexts):
            cc = ctx.todense().reshape([14 * 14, 512])
            if zero_pad:
                cc0 = numpy.zeros((cc.shape[0] + 1, cc.shape[1])).astype("float32")
                cc0[:-1, :] = cc
            else:
                cc0 = cc
            resp = gen_model(idx, cc0, model, options, k, normalize, word_idict, sampling)
            caps[resp[0]] = resp[1]
            print "Sample ", (idx + 1), "/", contexts.shape[0], " Done"
            print resp[1]
        return caps

    ds = datasets.strip().split(",")

    # send all the features for the various datasets
    for dd in ds:
        if dd == "train":
            print "Training Set...",
            caps = _seqs2words(_process_examples(train[1]))
            # import pdb; pdb.set_trace()
            with open(saveto + ".train.txt", "w") as f:
                print >> f, "\n".join(caps)
            print "Done"
        if dd == "dev":
            print "Development Set...",
            caps = _seqs2words(_process_examples(valid[1]))
            # import pdb; pdb.set_trace()
            with open(saveto + ".dev.txt", "w") as f:
                print >> f, "\n".join(caps)
            print "Done"
        if dd == "test":
            print "Test Set...",
            caps = _seqs2words(_process_examples(test[1]))
            with open(saveto + ".test.txt", "w") as f:
                print >> f, "\n".join(caps)
            print "Done"
Example #10
def main(model, saveto, k=5, normalize=False, zero_pad=False, n_process=5, datasets='train,dev,test', sampling=False, pkl_name=None, cate_name = None, out_name = None):

    lines = open(cate_name,'r').read().splitlines()
    ref_images = []
    weights = []
    for line in lines:
        s = line.split(',')
        ref_images.append(s[0])
        weights.append(int(s[1]))

    # load the saved model options
    if pkl_name is None:
        pkl_name = model[0]

    with open('%s.pkl'% pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    train, valid, test, worddict = load_data(load_train=True if 'train' in datasets else False, load_dev=True if 'dev' in datasets else False,
                                             load_test=True if 'test' in datasets else False)

    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # create processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(target=gen_model, 
                                  args=(queue,rqueue,midx,model,options,k,normalize,word_idict, sampling))
        processes[midx].start()

    # index -> words
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict[w])
            capsw.append(' '.join(ww))
        return capsw

    # unsparsify, reshape, and queue
    def _send_jobs(contexts):
        for idx, ctx in enumerate(contexts):
            cc = ctx.todense().reshape([14*14,512])
            if zero_pad:
                cc0 = numpy.zeros((cc.shape[0]+1, cc.shape[1])).astype('float32')
                cc0[:-1,:] = cc
            else:
                cc0 = cc
            queue.put((idx, cc0))
        return

    # retrieve caption from process
    def _retrieve_jobs(n_samples):
        caps = [None] * n_samples
        scores = [None] * n_samples
        for idx in xrange(n_samples):
            resp = rqueue.get()
            caps[resp[0]] = resp[1]
            scores[resp[0]] = resp[2]
            if numpy.mod(idx, 10) == 0:
                print 'Sample ', (idx+1), '/', n_samples, ' Done'
        return caps, scores

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'dev':
            print 'Development Set...',
            _send_jobs(valid[1])
            print 'Finished sending DEV'
            caps,scores = _retrieve_jobs(valid[1].shape[0])
            caps = _seqs2words(caps)
            print 'Finished Generating DEV'
            with open(saveto+'.dev.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            with open(saveto+'.dev.scores.txt', 'w') as f:
                for score in scores:
                    print >>f, str(score)+'\n'
            with open(saveto+'.dev.info.txt', 'w') as f:
                for idx in range(len(scores)):
                    print >>f, caps[idx] +'\n'+ ref_images[idx] +'\n'+ str(scores[idx]) +'\n'


            # sents = []
            # for sen in valid[0]:
            #     while len(sents) < sen[1]+1:
            #         sents.append([])
            #     sents[sen[1]].append(sen[0].strip())
            # sents2 = zip(*sents)
            # for idd in range(5):
            #     with open(saveto+'gold'+str(idd)+'.dev.txt', 'w') as f:
            #         print >>f, '\n'.join(sents2[idd])

            print 'Done'
        if dd == 'test':
            print 'Test Set...',
            _send_jobs(test[1])
            print 'Finished sending TEST'
            caps,scores = _retrieve_jobs(test[1].shape[0])
            caps = _seqs2words(caps)
            print 'Finished Generating TEST'
            with open(saveto+'.test.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            with open(saveto+'.test.scores.txt', 'w') as f:
                for score in scores:
                    print >>f, str(score)+'\n'
            with open(saveto+'.test.info.txt', 'w') as f:
                for idx in range(len(scores)):
                    print >>f, caps[idx] +'\n'+ ref_images[idx] +'\n'+ str(scores[idx]) +'\n'

                

            # sents = []
            # for sen in test[0]:
            #     while len(sents) < sen[1]+1:
            #         sents.append([])
            #     sents[sen[1]].append(sen[0].strip())
            # sents2 = zip(*sents)
            # for idd in range(5):
            #     with open(saveto+'gold'+str(idd)+'.test.txt', 'w') as f:
            #         print >>f, '\n'.join(sents2[idd])

            print 'Done'
        if dd == 'train':
            print 'Train Set...',
            _send_jobs(train[1])
            print 'Finished sending TRAIN'

            caps, scores = _retrieve_jobs(train[1].shape[0])
            caps = _seqs2words(caps)

            # all_caps,all_scores = _retrieve_jobs(train[1].shape[0])
            # all_caps = _seqs2words(all_caps)
            # caps = []
            # scores = []
            # index = 0
            # for i in xrange(len(weights)):
            #     if weights[i] == 0:
            #         scores.append(0)
            #     else:
            #         caps.append(all_caps[index])
            #         scores.append(all_scores[index])
                
            #     index += weights[i]

            print 'Finished Generating TRAIN'
            with open(saveto+'.train.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            # with open(saveto+'.train.scores.txt', 'w') as f:
            #     for score in scores:
            #     	print >>f, str(score)+'\n'

            threshold = 1.0
            avgScore = sum(scores) / float(len(scores))
            totalWeight = float(sum(weights))
            loss = 0.

            with open(out_name, 'w') as f:
                for i in range(len(scores)):
                    if scores[i] > threshold:
                        loss += float(weights[i]) / totalWeight
                    #modelWeight += float(weights[i])/totalWeight / scores[i] 

                    if scores[i] > 1.2*avgScore and weights[i] <= 10:
                        weights[i] = weights[i]+1
                    if scores[i] < 0.5*avgScore and weights[i] > 0:
                        weights[i] = weights[i]-1
                    print >>f, ref_images[i]+','+str(weights[i])
            

            modelWeight = 0.5 * numpy.log(1/loss - 1)
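            # note: 0.5 * log(1/loss - 1) == 0.5 * log((1 - loss) / loss), i.e. an
            # AdaBoost-style model weight with `loss` as the weighted error rate
            # (undefined when loss is exactly 0 or 1, as in the original expression)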


            with open(cate_name[:-4]+'.info.txt', 'w') as f:
                print >>f, 'ModelWeight:'+str(modelWeight)
                for idx in range(len(scores)):
                    print >>f, caps[idx] +'\n'+ ref_images[idx] +'\n'+ str(scores[idx]) +'\n'
            # sents = []
            # for sen in test[0]:
            #     while len(sents) < sen[1]+1:
            #         sents.append([])
            #     sents[sen[1]].append(sen[0].strip())
            # sents2 = zip(*sents)
            # for idd in range(5):
            #     with open(saveto+'gold'+str(idd)+'.test.txt', 'w') as f:
            #         print >>f, '\n'.join(sents2[idd])

            print 'Done'
    # end processes
    for midx in xrange(n_process):
        queue.put(None)
    return 
Example #11
def main(model,
         saveto,
         k=5,
         normalize=False,
         zero_pad=False,
         n_process=5,
         datasets='train,dev,test',
         sampling=False,
         pkl_name=None,
         cate_name=None,
         out_name=None):

    lines = open(cate_name, 'r').read().splitlines()
    ref_images = []
    weights = []
    for line in lines:
        s = line.split(',')
        ref_images.append(s[0])
        weights.append(int(s[1]))

    # load the saved model options
    if pkl_name is None:
        pkl_name = model[0]

    with open('%s.pkl' % pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    train, valid, test, worddict = load_data(
        load_train=True if 'train' in datasets else False,
        load_dev=True if 'dev' in datasets else False,
        load_test=True if 'test' in datasets else False)

    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # create processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(target=gen_model,
                                  args=(queue, rqueue, midx, model, options, k,
                                        normalize, word_idict, sampling))
        processes[midx].start()

    # index -> words
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict[w])
            capsw.append(' '.join(ww))
        return capsw
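    # For illustration (indices are hypothetical): with word_idict mapping
    # 4 -> 'a', 9 -> 'dog' and 0 -> '<eos>',
    #     _seqs2words([[4, 9, 0, 7]]) == ['a dog']
    # since decoding stops at the first 0 (<eos>) in each sequence.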

    # unsparsify, reshape, and queue
    def _send_jobs(contexts):
        for idx, ctx in enumerate(contexts):
            cc = ctx.todense().reshape([14 * 14, 512])
            if zero_pad:
                cc0 = numpy.zeros(
                    (cc.shape[0] + 1, cc.shape[1])).astype('float32')
                cc0[:-1, :] = cc
            else:
                cc0 = cc
            queue.put((idx, cc0))
        return
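    # The reshape assumes each stored context is a sparse matrix holding a
    # 14x14 grid of 512-d convolutional annotation vectors (hence the
    # 196 x 512 dense array); when zero_pad is set, one extra all-zero
    # annotation row is appended.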

    # retrieve caption from process
    def _retrieve_jobs(n_samples):
        caps = [None] * n_samples
        scores = [None] * n_samples
        for idx in xrange(n_samples):
            resp = rqueue.get()
            caps[resp[0]] = resp[1]
            scores[resp[0]] = resp[2]
            if numpy.mod(idx, 10) == 0:
                print 'Sample ', (idx + 1), '/', n_samples, ' Done'
        return caps, scores
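    # Workers may finish out of order; resp[0] carries the original index,
    # so caps/scores end up aligned with the order the contexts were sent.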

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'dev':
            print 'Development Set...',
            _send_jobs(valid[1])
            print 'Finished sending DEV'
            caps, scores = _retrieve_jobs(valid[1].shape[0])
            caps = _seqs2words(caps)
            print 'Finished Generating DEV'
            with open(saveto + '.dev.txt', 'w') as f:
                print >> f, '\n'.join(caps)
            with open(saveto + '.dev.scores.txt', 'w') as f:
                for score in scores:
                    print >> f, str(score) + '\n'
            with open(saveto + '.dev.info.txt', 'w') as f:
                for idx in range(len(scores)):
                    print >> f, caps[idx] + '\n' + ref_images[
                        idx] + '\n' + str(scores[idx]) + '\n'

            # sents = []
            # for sen in valid[0]:
            #     while len(sents) < sen[1]+1:
            #         sents.append([])
            #     sents[sen[1]].append(sen[0].strip())
            # sents2 = zip(*sents)
            # for idd in range(5):
            #     with open(saveto+'gold'+str(idd)+'.dev.txt', 'w') as f:
            #         print >>f, '\n'.join(sents2[idd])

            print 'Done'
        if dd == 'test':
            print 'Test Set...',
            _send_jobs(test[1])
            print 'Finished sending TEST'
            caps, scores = _retrieve_jobs(test[1].shape[0])
            caps = _seqs2words(caps)
            print 'Finished Generating TEST'
            with open(saveto + '.test.txt', 'w') as f:
                print >> f, '\n'.join(caps)
            with open(saveto + '.test.scores.txt', 'w') as f:
                for score in scores:
                    print >> f, str(score) + '\n'
            with open(saveto + '.test.info.txt', 'w') as f:
                for idx in range(len(scores)):
                    print >> f, caps[idx] + '\n' + ref_images[
                        idx] + '\n' + str(scores[idx]) + '\n'

            # sents = []
            # for sen in test[0]:
            #     while len(sents) < sen[1]+1:
            #         sents.append([])
            #     sents[sen[1]].append(sen[0].strip())
            # sents2 = zip(*sents)
            # for idd in range(5):
            #     with open(saveto+'gold'+str(idd)+'.test.txt', 'w') as f:
            #         print >>f, '\n'.join(sents2[idd])

            print 'Done'
        if dd == 'train':
            print 'Train Set...',
            _send_jobs(train[1])
            print 'Finished sending TRAIN'

            caps, scores = _retrieve_jobs(train[1].shape[0])
            caps = _seqs2words(caps)

            # all_caps,all_scores = _retrieve_jobs(train[1].shape[0])
            # all_caps = _seqs2words(all_caps)
            # caps = []
            # scores = []
            # index = 0
            # for i in xrange(len(weights)):
            #     if weights[i] == 0:
            #         scores.append(0)
            #     else:
            #         caps.append(all_caps[index])
            #         scores.append(all_scores[index])

            #     index += weights[i]

            print 'Finished Generating TRAIN'
            with open(saveto + '.train.txt', 'w') as f:
                print >> f, '\n'.join(caps)
            # with open(saveto+'.train.scores.txt', 'w') as f:
            #     for score in scores:
            #     	print >>f, str(score)+'\n'

            threshold = 1.0
            avgScore = sum(scores) / float(len(scores))
            totalWeight = float(sum(weights))
            loss = 0

            with open(out_name, 'w') as f:
                for i in range(len(scores)):
                    if scores[i] > threshold:
                        loss += float(weights[i]) / totalWeight
                    #modelWeight += float(weights[i])/totalWeight / scores[i]

                    if scores[i] > 1.2 * avgScore and weights[i] <= 10:
                        weights[i] = weights[i] + 1
                    if scores[i] < 0.5 * avgScore and weights[i] > 0:
                        weights[i] = weights[i] - 1
                    print >> f, ref_images[i] + ',' + str(weights[i])

            modelWeight = 0.5 * numpy.log(1 / loss - 1)

            with open(cate_name[:-4] + '.info.txt', 'w') as f:
                print >> f, 'ModelWeight:' + str(modelWeight)
                for idx in range(len(scores)):
                    print >> f, caps[idx] + '\n' + ref_images[
                        idx] + '\n' + str(scores[idx]) + '\n'
            # sents = []
            # for sen in test[0]:
            #     while len(sents) < sen[1]+1:
            #         sents.append([])
            #     sents[sen[1]].append(sen[0].strip())
            # sents2 = zip(*sents)
            # for idd in range(5):
            #     with open(saveto+'gold'+str(idd)+'.test.txt', 'w') as f:
            #         print >>f, '\n'.join(sents2[idd])

            print 'Done'
    # end processes
    for midx in xrange(n_process):
        queue.put(None)
    return
def train(
    dim_word=300,  # word vector dimensionality
    ctx_dim=300,  # context vector dimensionality
    semantic_dim=300,
    dim=1000,  # the number of LSTM units
    cnn_dim=4096,  # CNN feature dimension
    n_layers_att=1,  # number of layers used to compute the attention weights
    n_layers_out=1,  # number of layers used to compute logit
    n_layers_lstm=1,  # number of lstm layers
    n_layers_init=1,  # number of layers to initialize LSTM at time 0
    lstm_encoder=True,  # if True, run bidirectional LSTM on input units
    prev2out=False,  # Feed previous word into logit
    ctx2out=False,  # Feed attention weighted ctx into logit
    cutoff=10,
    patience=5,
    max_epochs=30,
    dispFreq=500,
    decay_c=0.,  # weight decay coeff
    alpha_c=0.,  # doubly stochastic coeff
    lrate=1e-4,  # used only for SGD
    selector=False,  # selector (see paper)
    maxlen=30,  # maximum length of the description
    optimizer='rmsprop',
    pretrained='',
    batch_size=256,
    saveto='model',  # relative path of saved model file
    saveFreq=1000,  # save the parameters after every saveFreq updates
    sampleFreq=100,  # generate some samples after every sampleFreq updates
    embedding='../Data/GloVe/vocab_glove.pkl',
    cnn_type='vgg',
    prefix='../Data',  # path to find data
    dataset='coco',
    criterion='Bleu_4',
    switch_test_val=False,
    use_cnninit=True,
    use_dropout=True,  # setting this true turns on dropout at various points
    use_dropout_lstm=False,  # dropout on lstm gates
    save_per_epoch=False):  # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if os.path.exists('%s.pkl' % saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(model_options['dataset'])

    # Load data from data path
    if 'switch_test_val' in model_options and model_options['switch_test_val']:
        train, valid, worddict = load_data(path=osp.join(
            model_options['prefix'], model_options['dataset']),
                                           options=model_options,
                                           load_train=True,
                                           load_test=True)
    else:
        train, valid, worddict = load_data(path=osp.join(
            model_options['prefix'], model_options['dataset']),
                                           options=model_options,
                                           load_train=True,
                                           load_val=True)

    # Automatically set the validation frequency to roughly once per epoch
    validFreq = len(train[0]) / model_options['batch_size']
    print "Validation frequency is %d" % validFreq

    word_idict = {vv: kk for kk, vv in worddict.iteritems()}
    model_options['n_words'] = len(worddict)

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)
    # Initialize the embedding matrix with pre-trained GloVe vectors
    if 'VCemb' in params:
        params['VCemb'] = read_pkl(
            model_options['embedding']).astype('float32')

    # If a checkpoint from the same experiment already exists, prefer it over the pretrained weights
    if os.path.exists('%s.npz' % saveto):
        print "Reloading model"
        params = load_params('%s.npz' % saveto, params)
    elif pretrained != '':
        params = load_params(pretrained, params,
                             False)  # Only pretrain the Language model

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opts_out - optional outputs (e.g selector)
    trng, use_noise, \
          inps, alphas,\
          cost, \
          opt_outs = \
          build_model(tparams, model_options)

    # Load evaluator to calculate bleu score
    evaluator = cocoEvaluation(model_options['dataset'])

    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the
    # distribution over words and also the updated hidden state/memory for the
    # next step (see the recurrence equations in the paper)
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)
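    # Rough picture of how gen_sample consumes this pair (argument names are
    # illustrative, not the exact signatures): f_init is called once per
    # image to get the initial state and the projected context, then f_next
    # is called step by step to score candidate next words, keeping the k
    # best partial captions until <eos> or maxlen is reached.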

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps,
                                  -cost,
                                  profile=False,
                                  updates=None,
                                  allow_input_downcast=True)

    # Define the cost function + Regularization
    cost = cost.mean()
    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay
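        # i.e. cost <- cost + decay_c * sum_k ||theta_k||^2 over all
        # model parameters.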

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = sum([
            alpha_c * ((1. - alpha.sum(0))**2).sum(0).mean()
            for alpha in alphas
        ])
        cost += alpha_reg
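        # The term above is the doubly stochastic attention penalty
        #     alpha_c * sum_i (1 - sum_t alpha_{t,i})^2
        # (aggregated over the minibatch), which pushes the attention weights
        # at each location i to sum to roughly 1 across time steps t.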

    # Backprop!
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    # to get the cost after regularization or the gradients, use these expressions

    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams,
                                                               grads, inps,
                                                               cost)

    print 'Optimization'
    train_iter = HomogeneousData(train,
                                 batch_size=batch_size,
                                 maxlen=model_options['maxlen'])

    # history_bleu is a bare-bones training log; reload it if a previous checkpoint exists
    history_bleu = []
    if os.path.exists('%s.npz' % saveto):
        history_bleu = numpy.load('%s.npz' % saveto)['history_bleu'].tolist()
    start_epochs = len(history_bleu)
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size
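    # A frequency of -1 therefore means "once per (approximate) epoch",
    # i.e. every len(train[0]) / batch_size updates.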

    uidx = 0
    estop = False
    for eidx in xrange(start_epochs, model_options['max_epochs']):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)

            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx, cnn_feats = prepare_data(caps, train[1], train[2],
                                                   worddict, model_options)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero sample under length ', model_options[
                    'maxlen']
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx, cnn_feats)

            print "Epoch %d, Updates: %d, Cost is: %f" % (eidx, uidx, cost)

            f_update(model_options['lrate'])
            ud_duration = time.time() - ud_start  # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, model_options['sampleFreq']) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score, alphas = gen_sample(
                        f_init,
                        f_next,
                        ctx_s[jj],
                        cnn_feats[jj],
                        model_options,
                        trng=trng,
                        maxlen=model_options['maxlen'])
                    # Decode the sample from encoding back to words
                    print 'Truth ', jj, ': ',
                    print seqs2words(x_s[:, jj], word_idict)
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        print seqs2words(ss, word_idict)

            # Run validation, log the score, and checkpoint the model with the best validation score
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)

                # Do evaluation on validation set
                imgid = collapse([elem[-1] for elem in valid[0]])
                caps = process_examples([f_init], [f_next], imgid, valid[1],
                                        valid[2], word_idict, model_options)
                folder = osp.join('../output', '%s_%s' % (saveto, 'val'))
                if not osp.exists(folder):
                    os.mkdir(folder)
                with open(osp.join(folder, 'captions_val2014_results.json'),
                          'w') as f:
                    json.dump(caps, f)
                eva_result = evaluator.evaluate(folder, False)
                if model_options['criterion'] == 'combine':
                    history_bleu.append(eva_result['Bleu_4'] +
                                        eva_result['CIDEr'])
                else:
                    history_bleu.append(eva_result[model_options['criterion']])

                # the model with the best validation score is saved separately under a different name
                if uidx == 0 or history_bleu[-1] == max(history_bleu):
                    best_p = unzip(tparams)
                    print 'Saving model with best validation ll'
                    params = copy.copy(best_p)
                    numpy.savez(saveto + '_bestll',
                                history_bleu=history_bleu,
                                **params)
                    bad_counter = 0

                # abort training if the validation score has stopped improving for too long
                if len(history_bleu) > model_options[
                        'patience'] and history_bleu[-1] <= max(
                            history_bleu[:-model_options['patience']]):
                    bad_counter += 1
                    if bad_counter > model_options['patience']:
                        print 'Early Stop!'
                        estop = True
                        break

                print ' BLEU-4 score ', history_bleu[-1]

            # Checkpoint
            if numpy.mod(uidx, model_options['saveFreq']) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_bleu=history_bleu, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

        print 'Seen %d samples' % n_samples

        if estop:
            break

        if model_options['save_per_epoch']:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1),
                        history_bleu=history_bleu,
                        **unzip(tparams))

    # use the parameters from the best validation checkpoint for the final save (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)
    params = copy.copy(best_p) if best_p is not None else unzip(tparams)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_bleu=history_bleu,
                **params)