Example #1
def read_target(in_dir, cache=None):
    fr_path = os.path.join(in_dir, 'giga-fren.release2.fixed.fr')
    target_vocab = ['<eos>', '<unk>'] + europal.count_words(fr_path)
    target_data = europal.make_dataset(fr_path, target_vocab)

    return target_vocab, target_data
Example #2
def read_source(in_dir, cache=None):
    en_path = os.path.join(in_dir, 'giga-fren.release2.fixed.en')
    source_vocab = ['<eos>', '<unk>'] + europal.count_words(en_path)
    source_data = europal.make_dataset(en_path, source_vocab)

    return source_vocab, source_data
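A minimal usage sketch for the two readers above (not part of the original example), assuming the giga-fren corpus files are present in in_dir and that the europal helper module shipped alongside seq2seq.py is importable:

import os
import europal  # helper module from the same example directory; needed by the readers above

in_dir = './wmt_enfr'  # hypothetical directory holding the giga-fren.release2.fixed.* files
source_vocab, source_data = read_source(in_dir)
target_vocab, target_data = read_target(in_dir)
print('source vocab: %d words, %d sentences' % (len(source_vocab), len(source_data)))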
Example #3
File: seq2seq.py Project: asi1024/chainer
def read_target(in_dir, cache=None):
    fr_path = os.path.join(in_dir, 'giga-fren.release2.fixed.fr')
    target_vocab = ['<eos>', '<unk>'] + europal.count_words(fr_path)
    target_data = europal.make_dataset(fr_path, target_vocab)

    return target_vocab, target_data
Example #4
File: seq2seq.py Project: asi1024/chainer
def main():
    parser = argparse.ArgumentParser(
        description='Chainer example: convolutional seq2seq')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=32,
                        help='Number of sentence pairs in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=512,
                        help='Number of units')
    parser.add_argument('--layer',
                        '-l',
                        type=int,
                        default=15,
                        help='Number of layers')
    parser.add_argument('--input',
                        '-i',
                        type=str,
                        default='./',
                        help='Input directory')
    parser.add_argument('--source',
                        '-s',
                        type=str,
                        default='europarl-v7.fr-en.en',
                        help='Filename of train data for source language')
    parser.add_argument('--target',
                        '-t',
                        type=str,
                        default='europarl-v7.fr-en.fr',
                        help='Filename of train data for target language')
    parser.add_argument('--source-valid',
                        '-svalid',
                        type=str,
                        default='dev/newstest2013.en',
                        help='Filename of validation data for source language')
    parser.add_argument('--target-valid',
                        '-tvalid',
                        type=str,
                        default='dev/newstest2013.fr',
                        help='Filename of validation data for target language')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--source-vocab',
                        type=int,
                        default=40000,
                        help='Vocabulary size of source language')
    parser.add_argument('--target-vocab',
                        type=int,
                        default=40000,
                        help='Vocabulary size of target language')
    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=4))

    # Load the training corpora and build the source/target vocabularies
    en_path = os.path.join(args.input, args.source)
    source_vocab = ['<eos>', '<unk>'] + \
        europal.count_words(en_path, args.source_vocab)
    source_data = europal.make_dataset(en_path, source_vocab)
    fr_path = os.path.join(args.input, args.target)
    target_vocab = ['<eos>', '<unk>'] + \
        europal.count_words(fr_path, args.target_vocab)
    target_data = europal.make_dataset(fr_path, target_vocab)
    assert len(source_data) == len(target_data)
    print('Original training data size: %d' % len(source_data))
    train_data = [(s, t) for s, t in six.moves.zip(source_data, target_data)
                  if 0 < len(s) < 50 and 0 < len(t) < 50]
    print('Filtered training data size: %d' % len(train_data))

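    # Build the validation set with the vocabularies learned from the training corpora.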
    en_path = os.path.join(args.input, args.source_valid)
    source_data = europal.make_dataset(en_path, source_vocab)
    fr_path = os.path.join(args.input, args.target_valid)
    target_data = europal.make_dataset(fr_path, target_vocab)
    assert len(source_data) == len(target_data)
    test_data = [(s, t) for s, t in six.moves.zip(source_data, target_data)
                 if 0 < len(s) and 0 < len(t)]

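    # Map words to integer ids, and build the inverse id-to-word tables for decoding.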
    source_ids = {word: index for index, word in enumerate(source_vocab)}
    target_ids = {word: index for index, word in enumerate(target_vocab)}

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    # Define Model
    model = net.Seq2seq(args.layer, len(source_ids), len(target_ids),
                        args.unit)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    # Setup Optimizer
    optimizer = chainer.optimizers.NesterovAG(lr=0.25, momentum=0.99)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(0.1))

    # Setup Trainer
    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test_data,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)
    iter_per_epoch = len(train_data) // args.batchsize
    print('Number of iter/epoch =', iter_per_epoch)

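    # seq2seq_pad_concat_convert (defined or imported elsewhere in this example) is assumed
    # to pad the variable-length source/target sequences of a minibatch into fixed-size arrays.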
    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       converter=seq2seq_pad_concat_convert,
                                       device=args.gpu)

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # If you want to change the logging interval, change this number
    log_trigger = (min(1000, iter_per_epoch // 2), 'iteration')

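    # Round an iteration-based trigger down to a multiple of the logging interval,
    # falling back to the original interval when the rounded value would be zero.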
    def floor_step(trigger):
        floored = trigger[0] - trigger[0] % log_trigger[0]
        if floored <= 0:
            floored = trigger[0]
        return (floored, trigger[1])

    # Validation every half epoch
    eval_trigger = floor_step((iter_per_epoch // 2, 'iteration'))
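    # FailMinValueTrigger is a custom trigger defined or imported elsewhere in this example;
    # judging from its use below, it fires when 'val/main/perp' fails to reach a new minimum.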
    fail_trigger = FailMinValueTrigger('val/main/perp', eval_trigger)
    record_trigger = training.triggers.MinValueTrigger('val/main/perp',
                                                       eval_trigger)

    evaluator = extensions.Evaluator(test_iter,
                                     model,
                                     converter=seq2seq_pad_concat_convert,
                                     device=args.gpu)
    evaluator.default_name = 'val'
    trainer.extend(evaluator, trigger=eval_trigger)
    # Decay the learning rate (down to 1e-4) only when the validation
    # perplexity fails to improve.
    trainer.extend(extensions.ExponentialShift('lr', 0.1, target=1e-4),
                   trigger=fail_trigger)
    trainer.extend(extensions.observe_lr(), trigger=eval_trigger)
    # Save a snapshot of the model only when it achieves the best
    # validation score so far.
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}.npz'),
                   trigger=record_trigger)

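    # Translate a single source sentence with the current model and print the source,
    # the model output, and the reference translation. Unknown words map to id 1 ('<unk>').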
    def translate_one(source, target):
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array([source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

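    # Extension that periodically translates two fixed sentences and one randomly chosen
    # validation pair so that translation quality can be monitored during training.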
    @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one('Who are we ?', 'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words[i] for i in source])
        target = ' '.join([target_words[i] for i in target])
        translate_one(source, target)

    # Generation test
    trainer.extend(translate, trigger=(min(200, iter_per_epoch), 'iteration'))
    # Calculate BLEU every half epoch
    trainer.extend(CalculateBleu(model,
                                 test_data,
                                 'val/main/bleu',
                                 device=args.gpu,
                                 batch=args.batchsize // 4),
                   trigger=floor_step((iter_per_epoch // 2, 'iteration')))

    # Log
    trainer.extend(extensions.LogReport(trigger=log_trigger),
                   trigger=log_trigger)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'val/main/loss', 'main/perp',
        'val/main/perp', 'main/acc', 'val/main/acc', 'val/main/bleu', 'lr',
        'elapsed_time'
    ]),
                   trigger=log_trigger)

    print('start training')
    trainer.run()
Example #5
File: seq2seq.py Project: asi1024/chainer
def read_source(in_dir, cache=None):
    en_path = os.path.join(in_dir, 'giga-fren.release2.fixed.en')
    source_vocab = ['<eos>', '<unk>'] + europal.count_words(en_path)
    source_data = europal.make_dataset(en_path, source_vocab)

    return source_vocab, source_data