Example #1
    def get_data(self, set_chosen):
        if set_chosen == 'train':
            return dataIterator(self.datasets[0],
                                self.datasets[1],
                                self.worddicts,
                                batch_size=self.batch_size,
                                batch_Imagesize=self.batch_Imagesize,
                                maxlen=self.maxlen,
                                maxImagesize=self.maxImagesize)

        else:
            return dataIterator(self.valid_datasets[0],
                                self.valid_datasets[1],
                                self.worddicts,
                                batch_size=self.batch_size,
                                batch_Imagesize=self.batch_Imagesize,
                                maxlen=self.maxlen,
                                maxImagesize=self.maxImagesize)
Example #2
def train(dataset='../data/traindataRnn',#path to train data
          dictionary='../data/traindataRnnlm.pickle',#path to rnnlm dictionary
          batch_size=50,
          max_epochs=15,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          dim_word=100,  # word vector dimensionality
          dim=1000,
          save_path='/path/to/save/model'):

    charDict = {}
    with open(dictionary, 'rb') as f:
        charDict = pkl.load(f)
    charDict_r = {}
    for kk, vv in charDict.items():
        charDict_r[vv] = kk
    vocab_size = len(charDict)

    traindata = dataIterator(dataset, dictionary, batch_size)

    #sentencelen = 7
    model = Sequential()
    #model.add(Embedding(vocab_size, dim_word, input_length=sentencelen))
    model.add(Embedding(vocab_size, dim_word, mask_zero=True))
    model.add(LSTM(output_dim=dim, return_sequences=True, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(LSTM(output_dim=dim, return_sequences=True, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    update = 0
    for epochid in range(max_epochs):
        for x in traindata:
            x, y = prepare_traindata(x, vocab_size)
            train_loss = model.train_on_batch(x, y)
            update += 1
            if update % dispFreq == 0:
                print("Epoch:\t%d\tUpdate:\t%d\tloss:\t%s" % (epochid, update, train_loss))
            if update >= finish_after:
                break
        print("save model!")
        save_name = save_path + "rnnlm_epoch%d.h5" % epochid
        model.save(save_name)
        if update >= finish_after:
            break
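
The loop above depends on a `prepare_traindata` helper that is not shown in this example. Below is a minimal sketch of what such a helper could look like, assuming each batch yielded by `traindata` is a list of integer-encoded character sequences; this is an illustration of the expected interface, not the project's actual implementation.

import numpy as np
from keras.preprocessing.sequence import pad_sequences

def prepare_traindata(seqs, vocab_size):
    # hypothetical helper: inputs are all tokens but the last one, targets are
    # the same sequences shifted by one position (next-character prediction)
    x = pad_sequences([s[:-1] for s in seqs], padding='post')      # (batch, timesteps)
    y_ids = pad_sequences([s[1:] for s in seqs], padding='post')   # (batch, timesteps)
    # one-hot encode the targets for the TimeDistributed softmax and
    # categorical_crossentropy loss: (batch, timesteps, vocab_size)
    y = np.zeros(x.shape + (vocab_size,), dtype='float32')
    for i, row in enumerate(y_ids):
        for t, token in enumerate(row):
            y[i, t, token] = 1.0
    return x, y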
Example #3
    def load_data(self):
        datasets = ['./data/offline-train.pkl', './data/train_caption.txt']
        dictionaries = ['./data/dictionary.txt']

        worddicts = load_dict(dictionaries[0])
        worddicts_r = [None] * len(worddicts)

        for kk, vv in worddicts.items():
            worddicts_r[vv] = kk

        self.train, self.train_uid_list = dataIterator(
            datasets[0],
            datasets[1],
            worddicts,
            batch_size=self.batch_size,
            batch_Imagesize=self.batch_Imagesize,
            maxlen=self.maxlen,
            maxImagesize=self.maxImagesize)
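
The `load_dict` helper used here (and in the later examples) is not shown. Assuming the dictionary file is a plain-text lexicon with one "token index" pair per line, a minimal sketch could look like the following; the file format is an assumption, not the project's confirmed layout.

def load_dict(dict_file):
    # hypothetical reader: one "token index" pair per line -> {token: int index}
    worddict = {}
    with open(dict_file) as fp:
        for line in fp:
            token, idx = line.split()
            worddict[token] = int(idx)
    return worddict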
Example #4
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          encoder='gru',
          decoder='gru_cond',
          patience=10,  # early stopping patience
          max_epochs=100,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 regularization penalty
          alpha_c=0.,  # alignment regularization
          clip_c=-1.,  # gradient clipping threshold
          lrate=0.01,  # learning rate
          n_words=10000,  # vocabulary size
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',
          validFreq=1000,
          saveFreq=1000,   # save the parameters after every saveFreq updates
          sampleFreq=100,   # generate some samples after every sampleFreq
          datasets='../data/traindataRnn',
          valid_datasets='../data/vailiddataRnn',
          dictionaries='../data/traindataRnn.pickle',
          numofs = 1,
          use_dropout=False,
          reload_=False,
          overwrite=False):

    # Model options
    model_options = locals().copy()

    # load dictionaries and invert them
    charDict = {}
    with open(dictionaries, 'rb') as f:
        charDict = pkl.load(f)
    charDict_r = {}
    for kk, vv in charDict.items():
        charDict_r[vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        print('Reloading model options')
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print('Loading data')
    train = dataIterator(datasets, dictionaries, batch_size)
    valid = dataIterator(valid_datasets, dictionaries, valid_batch_size)

    print('Building model')
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        print('Reloading model parameters')
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print('Building sampler')
    f_init, f_next = build_sampler(tparams, model_options, trng, use_noise)

    # before any regularizer
    print('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)
    print('Done')

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print('Building f_cost...')
    f_cost = theano.function(inps, cost, profile=profile)
    print('Done')

    print('Computing gradient...')
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print('Done')

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c**2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print('Building optimizers...')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print('Done')

    print('Optimization')

    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    for eidx in range(max_epochs):
        n_samples = 0

        for x in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x, numofs = numofs)

            if x is None:
                print('Minibatch with zero sample')
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print('NaN detected')
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print('Epoch %d Update %d Cost %f' % (eidx, uidx, cost))


            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in range(numpy.minimum(5, x.shape[1])):
                    sample, score = gen_sample(tparams, f_init, f_next,
                                               x[:, jj][:, None],
                                               model_options,
                                               maxlen=7,
                                               argmax=False)
                    print 'Source %d: ' % jj,
                    for vv in x[:, jj]:
                        if vv in charDict_r:
                            print charDict_r[vv],
                        else:
                            print 'UNK',
                    print
                    print 'Truth %d: ' % jj,
                    for vv in y[:, jj]:
                        if vv in charDict_r:
                            print charDict_r[vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample %d: ' % jj,
                    ss = sample
                    for vv in ss:
                        if vv in charDict_r:
                            print charDict_r[vv],
                        else:
                            print 'UNK',
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid, numofs = numofs)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print('Early Stop!')
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print('Valid %f' % valid_err)

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print('Saving the best model...')
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print('Done')

                # save with uidx
                if not overwrite:
                    print('Saving the model at iteration {}...'.format(uidx))
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx, history_errs=history_errs,
                                uidx=uidx, **unzip(tparams))
                    print('Done')

            # finish after this many updates
            if uidx >= finish_after:
                print('Finishing after %d iterations!' % uidx)
                estop = True
                break

        print('Seen %d samples' % n_samples)

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data,
                           model_options, valid).mean()

    print('Valid %f' % valid_err)

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs,
                uidx=uidx,
                **params)

    return valid_err
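
In the function above, `eval(optimizer)` simply looks up a function named after the chosen optimizer (e.g. `rmsprop`) in the current scope; every such function takes `(lr, tparams, grads, inps, cost)` and returns two compiled Theano functions, one that computes the cost while storing the gradients and one that applies the parameter update. A minimal SGD-style sketch of that contract follows; it illustrates the interface only and is not the project's rmsprop implementation.

import theano

def sgd(lr, tparams, grads, inps, cost):
    # shared variables that carry the gradients from f_grad_shared to f_update
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.items()]
    gsup = list(zip(gshared, grads))
    # f_grad_shared: evaluate the cost and stash the gradients
    f_grad_shared = theano.function(inps, cost, updates=gsup)
    # f_update: take one plain gradient step with learning rate lr
    pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]
    f_update = theano.function([lr], [], updates=pup)
    return f_grad_shared, f_update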
Example #5
# flag to remember when to change the learning rate
flag = 0
# exprate
exprate = 0

# worddicts
worddicts = load_dict(dictionaries[0])
worddicts_r = [None] * len(worddicts)
for kk, vv in worddicts.items():
    worddicts_r[vv] = kk

#load train data and test data
train, train_label = dataIterator(datasets[0],
                                  datasets[1],
                                  worddicts,
                                  batch_size=1,
                                  batch_Imagesize=batch_Imagesize,
                                  maxlen=maxlen,
                                  maxImagesize=maxImagesize)
len_train = len(train)

test, test_label = dataIterator(valid_datasets[0],
                                valid_datasets[1],
                                worddicts,
                                batch_size=1,
                                batch_Imagesize=batch_Imagesize,
                                maxlen=maxlen,
                                maxImagesize=maxImagesize)
len_test = len(test)

Example #6
File: nmt.py Project: xqustc/TAP
def train(
        dim_word=100,  # word vector dimensionality
        dim_enc=1000,
        dim_dec=1000,
        down_sample=0,
        dim_attention=500,
        dim_coverage=5,
        kernel_coverage=121,
        encoder='gru',
        decoder='gru_cond',
        patience=4,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        dispFreq=100,
        decay_c=0.,  # L2 regularization penalty
        alpha_c=0.,  # alignment regularization
        clip_c=-1.,  # gradient clipping threshold
        lrate=1e-8,  # learning rate
        dim_target=62,  # target vocabulary size
        dim_feature=123,  # dimensionality of the input feature vectors
        maxlen=100,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq
        datasets=['feature.pkl', 'label.txt'],
        valid_datasets=['feature_valid.pkl', 'label_valid.txt'],
        dictionaries=['lexicon.txt'],
        valid_output=['decode.txt'],
        valid_result=['result.txt'],
        use_dropout=False,
        reload_=False):

    # Model options
    model_options = locals().copy()

    # load dictionaries and invert them

    worddicts = load_dict(dictionaries[0])
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train, train_uid_list = dataIterator(datasets[0],
                                         datasets[1],
                                         worddicts,
                                         batch_size=batch_size,
                                         maxlen=maxlen)
    valid, valid_uid_list = dataIterator(valid_datasets[0],
                                         valid_datasets[1],
                                         worddicts,
                                         batch_size=batch_size,
                                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    # print model parameters
    print "Model params:\n{0}".format(
        pprint.pformat(sorted([p for p in params])))
    # end

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train)
    if saveFreq == -1:
        saveFreq = len(train)
    if sampleFreq == -1:
        sampleFreq = len(train)

    uidx = 0
    estop = False
    halfLrFlag = 0
    bad_counter = 0
    ud_s = 0
    ud_epoch = 0
    cost_s = 0.
    for eidx in xrange(max_epochs):
        n_samples = 0

        random.shuffle(train)  # shuffle data
        ud_epoch_start = time.time()

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            ud_start = time.time()

            x, x_mask, y, y_mask = prepare_data(model_options,
                                                x,
                                                y,
                                                maxlen=maxlen)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)
            cost_s += cost

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start
            ud_s += ud

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                ud_s /= 60.
                cost_s /= dispFreq
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost_s, 'UD ', ud_s, 'epson ', lrate, 'bad_counter', bad_counter
                ud_s = 0
                cost_s = 0.

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                fpp_sample = open(valid_output[0], 'w')
                valid_count_idx = 0
                # FIXME: random selection?
                for x, y in valid:
                    for xx in x:
                        xx_pad = numpy.zeros((xx.shape[0] + 1, xx.shape[1]),
                                             dtype='float32')
                        xx_pad[:xx.shape[0], :] = xx
                        stochastic = False
                        sample, score = gen_sample(tparams,
                                                   f_init,
                                                   f_next,
                                                   xx_pad[:, None, :],
                                                   model_options,
                                                   trng=trng,
                                                   k=10,
                                                   maxlen=1000,
                                                   stochastic=stochastic,
                                                   argmax=False)

                        if stochastic:
                            ss = sample
                        else:
                            score = score / numpy.array(
                                [len(s) for s in sample])
                            ss = sample[score.argmin()]

                        fpp_sample.write(valid_uid_list[valid_count_idx])
                        valid_count_idx = valid_count_idx + 1
                        for vv in ss:
                            if vv == 0:  # <eol>
                                break
                            fpp_sample.write(' ' + worddicts_r[vv])
                        fpp_sample.write('\n')
                fpp_sample.close()
                print 'valid set decode done'
                ud_epoch = (time.time() - ud_epoch_start) / 60.
                print 'cost time ... ', ud_epoch

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err_cost = valid_errs.mean()

                # compute wer
                os.system('python compute-wer.py ' + valid_output[0] + ' ' +
                          valid_datasets[1] + ' ' + valid_result[0])
                fpp = open(valid_result[0])
                stuff = fpp.readlines()
                fpp.close()
                m = re.search('WER (.*)\n', stuff[0])
                valid_per = 100. * float(m.group(1))
                m = re.search('ExpRate (.*)\n', stuff[1])
                valid_sacc = 100. * float(m.group(1))
                #valid_err=0.7*valid_per-0.3*valid_sacc
                valid_err = valid_per
                history_errs.append(valid_err)

                # first validation pass, or a new best model so far
                if uidx / validFreq == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                # if len(history_errs) > patience and valid_err >= \
                #         numpy.array(history_errs)[:-patience].min():
                #     bad_counter += 1
                #     if bad_counter > patience:
                #         print 'Early Stop!'
                #         estop = True
                #         break
                if uidx / validFreq != 0 and valid_err > numpy.array(
                        history_errs).min():
                    bad_counter += 1
                    if bad_counter > patience:
                        if halfLrFlag == 1:
                            print 'Early Stop!'
                            estop = True
                            break
                        else:
                            print 'Lr decay and retrain!'
                            bad_counter = 0
                            lrate /= 10
                            params = best_p
                            halfLrFlag += 1

                if numpy.isnan(valid_err):
                    #ipdb.set_trace()
                    print 'valid_err nan'

                print 'Valid WER: %.2f%%, ExpRate: %.2f%%, Cost: %f' % (
                    valid_per, valid_sacc, valid_err_cost)

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data, model_options,
                           valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err
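
The sampling block above keeps the beam hypothesis with the lowest length-normalised cost (`score.argmin()` after dividing each score by its hypothesis length). A small self-contained illustration of that selection, with made-up numbers:

import numpy

sample = [[5, 9, 0], [5, 9, 12, 3, 0]]   # two beam hypotheses (token id lists)
score = numpy.array([4.2, 5.5])          # illustrative total negative log-probabilities
score = score / numpy.array([len(s) for s in sample])   # per-token cost: [1.4, 1.1]
best = sample[score.argmin()]            # the longer hypothesis wins here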
Example #7
def main(model, bn_model, dictionary_target, fea, latex, saveto, output, k=5):

    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    valid, valid_uid_list = dataIterator(fea,
                                         latex,
                                         worddicts,
                                         batch_size=1,
                                         batch_Imagesize=500000,
                                         maxlen=500,
                                         maxImagesize=500000)

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)
    bn_params = init_bn_params(options)
    # load model parameters and set theano shared variables
    params = load_params(model, params)
    bn_params = load_params(bn_model, bn_params)
    tparams = init_tparams(params)
    bn_tparams = init_tparams(bn_params)
    f_init, f_next = build_sampler(tparams, bn_tparams, options, trng,
                                   use_noise)

    use_noise.set_value(0.)

    fpp_sample = open(saveto, 'w')
    valid_count_idx = 0
    # FIXME: random selection?
    print 'Decoding ... '
    for x, y in valid:
        for xx in x:
            print '%d : %s' % (valid_count_idx + 1,
                               valid_uid_list[valid_count_idx])
            xx_pad = numpy.zeros(
                (xx.shape[0], xx.shape[1], xx.shape[2]),
                dtype='float32')  # input_channels * height * width
            xx_pad[:, :, :] = xx / 255.
            stochastic = False
            sample, score = gen_sample(f_init,
                                       f_next,
                                       xx_pad[None, :, :, :],
                                       options,
                                       trng=trng,
                                       k=10,  # note: beam width is hard-coded; main's k argument is unused here
                                       maxlen=1000,
                                       stochastic=stochastic,
                                       argmax=False)

            if stochastic:
                ss = sample
            else:
                score = score / numpy.array([len(s) for s in sample])
                ss = sample[score.argmin()]

            fpp_sample.write(valid_uid_list[valid_count_idx])
            valid_count_idx = valid_count_idx + 1
            for vv in ss:
                if vv == 0:  # <eol>
                    break
                fpp_sample.write(' ' + worddicts_r[vv])
            fpp_sample.write('\n')
    fpp_sample.close()
    print 'test set decode done'

    os.system('python compute-wer.py ' + saveto + ' ' + latex + ' ' + output)
    fpp = open(output)  # %WER 31.63
    stuff = fpp.readlines()
    fpp.close()
    m = re.search('WER (.*)\n', stuff[0])
    valid_per = 100. * float(m.group(1))
    m = re.search('ExpRate (.*)\n', stuff[1])
    valid_sacc = 100. * float(m.group(1))

    print 'Valid WER: %.2f%%, ExpRate: %.2f%%' % (valid_per, valid_sacc)
Example #8
def main(model,
         dictionary_target,
         source_fea,
         source_latex,
         saveto,
         wer_file,
         k=5):

    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    valid, valid_uid_list = dataIterator(source_fea,
                                         source_latex,
                                         worddicts,
                                         batch_size=1,
                                         maxlen=2000)

    trng = RandomStreams(1234)

    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)
    f_init, f_next = build_sampler(tparams, options, trng)

    fpp_sample = open(saveto, 'w')
    valid_count_idx = 0

    print 'Decoding...'
    ud_epoch = 0
    ud_epoch_start = time.time()
    for x, y in valid:
        for xx in x:
            print '%d : %s' % (valid_count_idx + 1,
                               valid_uid_list[valid_count_idx])
            xx_pad = numpy.zeros((xx.shape[0] + 1, xx.shape[1]),
                                 dtype='float32')
            xx_pad[:xx.shape[0], :] = xx
            stochastic = False
            sample, score = gen_sample(f_init,
                                       f_next,
                                       xx_pad[:, None, :],
                                       options,
                                       trng=trng,
                                       k=k,
                                       maxlen=1000,
                                       stochastic=stochastic,
                                       argmax=False)

            if stochastic:
                ss = sample
            else:
                score = score / numpy.array([len(s) for s in sample])
                ss = sample[score.argmin()]

            fpp_sample.write(valid_uid_list[valid_count_idx])
            valid_count_idx = valid_count_idx + 1
            for vv in ss:
                if vv == 0:  # <eol>
                    break
                fpp_sample.write(' ' + worddicts_r[vv])
            fpp_sample.write('\n')
    fpp_sample.close()
    ud_epoch = (time.time() - ud_epoch_start) / 60.
    print 'test set decode done, cost time ...', ud_epoch
    os.system('python compute-wer.py ' + saveto + ' ' + source_latex + ' ' +
              wer_file)
    fpp = open(wer_file)
    stuff = fpp.readlines()
    fpp.close()
    m = re.search('WER (.*)\n', stuff[0])
    valid_per = 100. * float(m.group(1))
    m = re.search('ExpRate (.*)\n', stuff[1])
    valid_sacc = 100. * float(m.group(1))

    print 'Valid WER: %.2f%%, ExpRate: %.2f%%' % (valid_per, valid_sacc)
Example #9
def getData(
        file='ad_action_state',
        batch_size=32,
        shuffle_each_epoch=True,
    ):
    df = pd.read_csv(file)

    df['adgroup_id'] = df['adgroup_id'].astype(str)  # treat adgroup ids as strings

    # build dict
    for _, row in df.iterrows():
        adgroup_id = row['adgroup_id']
        if adgroup_id not in adgroup_dict:
            adgroup_dict[adgroup_id] = len(adgroup_dict)

        effect_info = row['effect_data'].split(';')
        for s in effect_info:
            _, s = s.split(':')
            if _ not in tmp_days_set:
                tmp_days_set.add(_)
            s = s.split(',')
            for t in s:
                t, _ = t.split('=')
                if t not in effect_dict:
                    effect_dict[t] = len(effect_dict)
            
        pos_info = row['pos_ratio'].split(';')
        for s in pos_info:
            _, s = s.split(':')
            s = s.split(',')
            for t in s:
                t, _ = t.split('=')
                if t not in pos_dict:
                    pos_dict[t] = len(pos_dict)
                    
        direct_info = row['direct_type_price'].split(';')
        for s in direct_info:
            _, s = s.split(':')
            s = s.split(',')
            for t in s:
                t, _ = t.split('=')
                if t not in direct_dict:
                    direct_dict[t] = len(direct_dict)
                    
        ad_feature = row['ad_feature'].split(';')
        for s in ad_feature:
            _, s = s.split(':')
            s = s.split(',')
            for t in s:
                name, num = t.split('=')
                if name == 'member_id':
                    if num not in member_dict:
                        member_dict[num] = len(member_dict)
                elif name == 'campaign_id':
                    if num not in campaign_dict:
                        campaign_dict[num] = len(campaign_dict)
                elif name == 'adgroup_id':
                    if num not in adgroup_dict:
                        adgroup_dict[num] = len(adgroup_dict)
                elif name == 'item_id':
                    if num not in item_dict:
                        item_dict[num] = len(item_dict)
                elif name == 'cate_id':
                    if num not in cate_dict:
                        cate_dict[num] = len(cate_dict)
                elif name == 'commodity_id':
                    if num not in commodity_dict:
                        commodity_dict[num] = len(commodity_dict)
                elif name == 'node_id':
                    if num not in node_dict:
                        node_dict[num] = len(node_dict)

    tmp_days_list = list(tmp_days_set)
    tmp_days_list.sort()
    for i, x in enumerate(tmp_days_list):
        days_dict[x] = i

    print(days_dict)    

    train_set = []
    test_set = []

    # parse data
    for _, row in df.iterrows():
        data = []
        
        # label
        data.append(row['label'])
        data.append(0)
        
        # ad feature
        adgroup_id = get_id(adgroup_dict, row['adgroup_id'])
        data.append(adgroup_id)
        
        ad_feature = (row['ad_feature'].split(';')[0]).split(':')[1]
        ad_feature = ad_feature.split(',')
        for x in ad_feature:
            name, entry = x.split('=')
            fid = 0
            if name == 'member_id':
                fid = get_id(member_dict, entry)
                data.append(fid)
            elif name == 'campaign_id':
                fid = get_id(campaign_dict, entry)
                data.append(fid)
            elif name == 'item_id':
                fid = get_id(item_dict, entry)
                data.append(fid)
            elif name == 'item_price':
                item_price = float(entry) / 100.0
                data.append(item_price)
            elif name == 'cate_id':
                fid = get_id(cate_dict, entry)
                data.append(fid)
            elif name == 'commodity_id':
                fid = get_id(commodity_dict, entry)
                data.append(fid)
            elif name == 'node_id':
                fid = get_id(node_dict, entry)
                data.append(fid)
        
        days_num = len(days_dict)
        
        # effect data
        effect_list = [[0.0] * len(effect_dict) for _ in range(days_num)]
        effect_data = row['effect_data'].split(';')
        mmax = np.array([0.0 for i in range(len(effect_dict))])
        for x in effect_data:
            day, entry = x.split(':')
            if day not in days_dict:
                continue
            day = days_dict[day]
            entry = entry.split(',')
            for o, y in enumerate(entry):
                name, num = y.split('=')
                num = float(num)
                name = get_id(effect_dict, name)
                mmax[o] = max(mmax[o], num)
                effect_list[day][name] = num

        tot_cost = 0
        for x in effect_list:
            tot_cost += x[2]
        data[1] = tot_cost

        # normalize each day's effect vector to [-1, 1] using the per-field maxima
        for o, x in enumerate(effect_list):
            effect_arr = np.array(effect_list[o])
            effect_arr = 2 * (effect_arr / np.maximum(mmax, 1e-10) - 0.5)
            effect_list[o] = effect_arr.tolist()
        data.append(effect_list)
        
        # pos_ratio
        pos_list = [[0.0] * len(pos_dict) for _ in range(days_num)]
        pos_data = row['pos_ratio'].split(';')
        for x in pos_data:
            day, entry = x.split(':')
            if day not in days_dict:
                continue
            day = days_dict[day]
            entry = entry.split(',')
            for y in entry:
                name, num = y.split('=')
                num = float(num)
                name = get_id(pos_dict, name)
                pos_list[day][name] = num
        data.append(pos_list)
        
        # direct info
        direct_list = [[0.0] * len(direct_dict) for _ in range(days_num)]
        direct_mask = [[0.0] * len(direct_dict) for _ in range(days_num)]
        direct_data = row['direct_type_price'].split(';')
        for x in direct_data:
            day, entry = x.split(':')
            if day not in days_dict:
                continue
            day = days_dict[day]
            entry = entry.split(',')
            for y in entry:
                name, num = y.split('=')
                num = float(num)
                name = get_id(direct_dict, name)
                direct_list[day][name] = num
                direct_mask[day][name] = 1.0
        data.append(direct_list)
        data.append(direct_mask)
        
        # actions info
        direct_type_list = [[] for _ in range(days_num)]
        direct_val_list = [[] for _ in range(days_num)]
        pos_type_list = [[] for _ in range(days_num)]
        pos_val_list = [[] for _ in range(days_num)]
        actions_data = row['actions'].split(';')
        for x in actions_data:
            day, entry = x.split(':')
            if day not in days_dict:
                continue
            day = days_dict[day]
            entry = entry.split(',')
            for y in entry:
                if len(y) == 0:
                    continue
                a, b = y.split('-', 1)
                if a == '修改定向':  # "modify targeting" action
                    a, b = b.split('->')
                    aa = a.split('-')[0] # direct type
                    bb = a.split('-')[-1]
                    cc = b.split('-')[0]
                    bb = float(bb) / 100.0 # old price
                    cc = float(cc) / 100.0 # new price
                    if len(direct_type_list[day]) < 100:
                        direct_type_list[day].append(get_id(direct_dict, aa))
                        #direct_val_list[day].append(1)
                        direct_val_list[day].append(cc - bb)
                if a == '新增定向':  # "add targeting" action
                    b = b.split('-')
                    aa = b[0]
                    bb = b[-2]
                    bb = float(bb) / 100.0
                    if len(direct_type_list[day]) < 100:
                        direct_type_list[day].append(len(direct_dict) + get_id(direct_dict, aa))
                        #direct_val_list[day].append(1)
                        direct_val_list[day].append(bb)
                if a == '移除定向':  # "remove targeting" action
                    b = b.split('-')
                    aa = b[0]
                    bb = b[-2]
                    bb = float(bb) / 100.0
                    if len(direct_type_list[day]) < 100:
                        direct_type_list[day].append(len(direct_dict) + len(direct_dict) + get_id(direct_dict, aa))
                        #direct_val_list[day].append(1)
                        direct_val_list[day].append(bb)
                if a == '新增资源位':  # "add ad slot" action
                    b = b.split('-')
                    aa = b[0]
                    bb = b[2]
                    bb = float(bb) / 100.0
                    if len(pos_type_list[day]) < 100:
                        if aa == '23':
                            pos_type_list[day].append(0)
                            #pos_val_list[day].append(1)
                            pos_val_list[day].append(bb)
                        if aa == '24':
                            pos_type_list[day].append(1)
                            #pos_val_list[day].append(1)
                            pos_val_list[day].append(bb)
                        if aa == '25':
                            pos_type_list[day].append(2)
                            #pos_val_list[day].append(1)
                            pos_val_list[day].append(bb)
                if a == '修改资源位':  # "modify ad slot" action
                    a, b = b.split('->')
                    aa = a.split('-')[0]
                    bb = a.split('-')[-1]
                    cc = b.split('-')[0]
                    bb = float(bb) / 100.0
                    cc = float(cc) / 100.0
                    if len(pos_type_list[day]) < 100:
                        if aa == '23':
                            pos_type_list[day].append(3)
                            #pos_val_list[day].append(1)
                            pos_val_list[day].append(cc - bb)
                        if aa == '24':
                            pos_type_list[day].append(4)
                            #pos_val_list[day].append(1)
                            pos_val_list[day].append(cc - bb)
                        if aa == '25':
                            pos_type_list[day].append(5)
                            #pos_val_list[day].append(1)
                            pos_val_list[day].append(cc - bb)
                if a == '移除资源位':  # "remove ad slot" action
                    b = b.split('-')
                    aa = b[0]
                    bb = b[2]
                    bb = float(bb) / 100.0
                    if len(pos_type_list[day]) < 100:
                        if aa == '23':
                            pos_type_list[day].append(6)
                            #pos_val_list[day].append(1)
                            pos_val_list[day].append(bb)
                        if aa == '24':
                            pos_type_list[day].append(7)
                            #pos_val_list[day].append(1)
                            pos_val_list[day].append(bb)
                        if aa == '25':
                            pos_type_list[day].append(8)
                            #pos_val_list[day].append(1)
                            pos_val_list[day].append(bb)
        data.append(direct_type_list)
        data.append(direct_val_list)
        data.append(pos_type_list)
        data.append(pos_val_list)
        
        data_type = row['version']
        if data_type == 'train':
            train_set.append(data)
        elif data_type == 'test':
            test_set.append(data)
    print("train num: %d." % len(train_set))
    print("test num: %d." % len(test_set))
    train_data = dataIterator(train_set, batch_size=batch_size, shuffle_each_epoch=shuffle_each_epoch)
    test_data = dataIterator(test_set, batch_size=batch_size, shuffle_each_epoch=False)
    return train_data, test_data
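
getData relies on a `get_id` helper and on module-level dictionaries (adgroup_dict, effect_dict, pos_dict, ...) that are not shown in this example. A minimal sketch of what `get_id` is assumed to do, namely look up a key and fall back to a default id for unseen values; this is a guess at the interface, not the original implementation.

def get_id(feature_dict, key):
    # hypothetical: return the integer id of key, or 0 if the key is unknown
    return feature_dict.get(key, 0)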
Example #10
print('total chars', len(worddicts))
worddicts_r = [None] * len(worddicts)
for kk, vv in worddicts.items():
    worddicts_r[vv] = kk

reworddicts = load_dict(dictionaries[1])
print('total relations', len(reworddicts))
reworddicts_r = [None] * len(reworddicts)
for kk, vv in reworddicts.items():
    reworddicts_r[vv] = kk

train, train_uid_list = dataIterator(datasets[0],
                                     datasets[1],
                                     datasets[2],
                                     worddicts,
                                     reworddicts,
                                     batch_size=batch_size,
                                     batch_Imagesize=batch_Imagesize,
                                     maxlen=maxlen,
                                     maxImagesize=maxImagesize)
valid, valid_uid_list = dataIterator(valid_datasets[0],
                                     valid_datasets[1],
                                     valid_datasets[2],
                                     worddicts,
                                     reworddicts,
                                     batch_size=valid_batch_size,
                                     batch_Imagesize=valid_batch_Imagesize,
                                     maxlen=maxlen,
                                     maxImagesize=maxImagesize)
# display
uidx = 0  # count batch