Example No. 1
# NOTE: the project-local helpers used below (preprocess, net,
# seq2seq_pad_concat_convert, VaswaniRule, CalculateBleu) come from the
# surrounding project and are not shown here; the standard imports are:
import argparse
import json
import os

import numpy
import six

import chainer
from chainer import training
from chainer.training import extensions


def main():
    parser = argparse.ArgumentParser(
        description='Chainer example: Transformer seq2seq')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=48,
                        help='Number of sentence pairs in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=512,
                        help='Number of units')
    parser.add_argument('--layer',
                        '-l',
                        type=int,
                        default=6,
                        help='Number of layers')
    parser.add_argument('--head',
                        type=int,
                        default=8,
                        help='Number of heads in attention mechanism')
    parser.add_argument('--dropout',
                        '-d',
                        type=float,
                        default=0.1,
                        help='Dropout rate')
    parser.add_argument('--input',
                        '-i',
                        type=str,
                        default='./',
                        help='Input directory')
    parser.add_argument('--source',
                        '-s',
                        type=str,
                        default='europarl-v7.fr-en.en',
                        help='Filename of train data for source language')
    parser.add_argument('--target',
                        '-t',
                        type=str,
                        default='europarl-v7.fr-en.fr',
                        help='Filename of train data for target language')
    parser.add_argument('--source-valid',
                        '-svalid',
                        type=str,
                        default='dev/newstest2013.en',
                        help='Filename of validation data for source language')
    parser.add_argument('--target-valid',
                        '-tvalid',
                        type=str,
                        default='dev/newstest2013.fr',
                        help='Filename of validation data for target language')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--source-vocab',
                        type=int,
                        default=40000,
                        help='Vocabulary size of source language')
    parser.add_argument('--target-vocab',
                        type=int,
                        default=40000,
                        help='Vocabulary size of target language')
    parser.add_argument('--no-bleu',
                        '-no-bleu',
                        action='store_true',
                        help='Skip BLEU calculation')
    parser.add_argument('--use-label-smoothing',
                        action='store_true',
                        help='Use label smoothing for cross entropy')
    parser.add_argument('--embed-position',
                        action='store_true',
                        help='Use position embedding rather than sinusoid')
    parser.add_argument('--use-fixed-lr',
                        action='store_true',
                        help='Use fixed learning rate rather than the ' +
                        'annealing proposed in the paper')
    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=4))

    # Check file
    en_path = os.path.join(args.input, args.source)
    source_vocab = ['<eos>', '<unk>', '<bos>'] + \
        preprocess.count_words(en_path, args.source_vocab)
    source_data = preprocess.make_dataset(en_path, source_vocab)
    fr_path = os.path.join(args.input, args.target)
    target_vocab = ['<eos>', '<unk>', '<bos>'] + \
        preprocess.count_words(fr_path, args.target_vocab)
    target_data = preprocess.make_dataset(fr_path, target_vocab)
    assert len(source_data) == len(target_data)
    print('Original training data size: %d' % len(source_data))
    train_data = [(s, t) for s, t in six.moves.zip(source_data, target_data)
                  if 0 < len(s) < 50 and 0 < len(t) < 50]
    print('Filtered training data size: %d' % len(train_data))

    en_path = os.path.join(args.input, args.source_valid)
    source_data = preprocess.make_dataset(en_path, source_vocab)
    fr_path = os.path.join(args.input, args.target_valid)
    target_data = preprocess.make_dataset(fr_path, target_vocab)
    assert len(source_data) == len(target_data)
    test_data = [(s, t) for s, t in six.moves.zip(source_data, target_data)
                 if 0 < len(s) and 0 < len(t)]

    source_ids = {word: index for index, word in enumerate(source_vocab)}
    target_ids = {word: index for index, word in enumerate(target_vocab)}

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    # Define Model
    model = net.Transformer(args.layer,
                            min(len(source_ids), len(source_words)),
                            min(len(target_ids), len(target_words)),
                            args.unit,
                            h=args.head,
                            dropout=args.dropout,
                            max_length=500,
                            use_label_smoothing=args.use_label_smoothing,
                            embed_position=args.embed_position)
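    # For reference, a minimal sketch (an assumption, not the code in `net`)
    # of the sinusoidal position encoding that --embed-position replaces
    # (Section 3.5 of "Attention Is All You Need"):
    def sinusoid_position_encoding_sketch(max_length, d_model):
        positions = numpy.arange(max_length)[:, None]   # (T, 1)
        dims = numpy.arange(d_model)[None, :]           # (1, D)
        angles = positions / numpy.power(10000.0, (2 * (dims // 2)) / d_model)
        # even dimensions get sin, odd dimensions get cos
        return numpy.where(dims % 2 == 0,
                           numpy.sin(angles),
                           numpy.cos(angles)).astype(numpy.float32)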
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    # Setup Optimizer
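    # The Adam hyperparameters below (beta2=0.98, eps=1e-9) follow the
    # Transformer paper; alpha is overwritten each iteration by the
    # learning-rate schedule further down unless --use-fixed-lr is given.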
    optimizer = chainer.optimizers.Adam(alpha=5e-5,
                                        beta1=0.9,
                                        beta2=0.98,
                                        eps=1e-9)
    optimizer.setup(model)

    # Setup Trainer
    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test_data,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)
    iter_per_epoch = len(train_data) // args.batchsize
    print('Number of iter/epoch =', iter_per_epoch)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       converter=seq2seq_pad_concat_convert,
                                       device=args.gpu)
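    # seq2seq_pad_concat_convert comes from the surrounding project; a rough
    # sketch (an assumption, not the actual implementation) of what such a
    # converter typically does: pad the variable-length source/target id
    # sequences of a mini-batch to a common length and move them to the device.
    def pad_concat_convert_sketch(batch, device=None, pad_id=-1):
        sources, targets = zip(*batch)

        def pad(seqs):
            max_len = max(len(s) for s in seqs)
            out = numpy.full((len(seqs), max_len), pad_id, dtype=numpy.int32)
            for i, s in enumerate(seqs):
                out[i, :len(s)] = s
            return out

        return (chainer.dataset.to_device(device, pad(sources)),
                chainer.dataset.to_device(device, pad(targets)))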

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # If you want to change the logging interval, change this number.
    log_trigger = (min(200, iter_per_epoch // 2), 'iteration')

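    # Round an iteration-based trigger down to a multiple of the logging
    # interval so that evaluation falls on iterations that are also logged.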
    def floor_step(trigger):
        floored = trigger[0] - trigger[0] % log_trigger[0]
        if floored <= 0:
            floored = trigger[0]
        return (floored, trigger[1])

    # Validation every half epoch
    eval_trigger = floor_step((iter_per_epoch // 2, 'iteration'))
    record_trigger = training.triggers.MinValueTrigger('val/main/perp',
                                                       eval_trigger)

    evaluator = extensions.Evaluator(test_iter,
                                     model,
                                     converter=seq2seq_pad_concat_convert,
                                     device=args.gpu)
    evaluator.default_name = 'val'
    trainer.extend(evaluator, trigger=eval_trigger)

    # Use Vaswani's learning-rate rule (Eq. 3 in the paper).
    # However, the hyperparameters in the paper seem to work well
    # only with a large batch size.
    # If you run a more modest setup (e.g. batch size 48 on 1 GPU),
    # you may have to change the hyperparameters.
    # I consistently scaled the learning rate by 0.5
    # ("scale" is always multiplied into the learning rate).
    # A sketch of the schedule is shown after the extension below.

    # If you use a shallow network (<= 2 layers),
    # you may not need to change the paper's setting.
    if not args.use_fixed_lr:
        trainer.extend(
            # VaswaniRule('alpha', d=args.unit, warmup_steps=4000, scale=1.),
            # VaswaniRule('alpha', d=args.unit, warmup_steps=32000, scale=1.),
            # VaswaniRule('alpha', d=args.unit, warmup_steps=4000, scale=0.5),
            # VaswaniRule('alpha', d=args.unit, warmup_steps=16000, scale=1.),
            VaswaniRule('alpha', d=args.unit, warmup_steps=64000, scale=1.),
            trigger=(1, 'iteration'))
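    # A minimal sketch (an assumption, not the VaswaniRule shipped with this
    # example) of the schedule described above, i.e. Eq. 3 of the paper:
    #     alpha = scale * d**-0.5 * min(step**-0.5, step * warmup**-1.5)
    def vaswani_alpha_sketch(step, d=args.unit, warmup_steps=64000, scale=1.0):
        step = max(step, 1)
        return scale * d ** -0.5 * min(step ** -0.5,
                                       step * warmup_steps ** -1.5)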
    observe_alpha = extensions.observe_value(
        'alpha', lambda trainer: trainer.updater.get_optimizer('main').alpha)
    trainer.extend(observe_alpha, trigger=(1, 'iteration'))

    # Only if a model gets best validation score,
    # save (overwrite) the model
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    def translate_one(source, target):
        words = preprocess.split_sentence(source)
        print('# source : ' + ' '.join(words))
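        # Unknown words fall back to index 1, i.e. '<unk>' in the vocabulary
        # lists built above.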
        x = model.xp.array([source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x], beam=5)[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

    @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one('Who are we ?', 'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words[i] for i in source])
        target = ' '.join([target_words[i] for i in target])
        translate_one(source, target)

    # Generation Test
    trainer.extend(translate, trigger=(min(200, iter_per_epoch), 'iteration'))

    # Calculate BLEU every half epoch
    if not args.no_bleu:
        trainer.extend(CalculateBleu(model,
                                     test_data,
                                     'val/main/bleu',
                                     device=args.gpu,
                                     batch=args.batchsize // 4),
                       trigger=floor_step((iter_per_epoch // 2, 'iteration')))

    # Log
    trainer.extend(extensions.LogReport(trigger=log_trigger),
                   trigger=log_trigger)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'val/main/loss', 'main/perp',
        'val/main/perp', 'main/acc', 'val/main/acc', 'val/main/bleu', 'alpha',
        'elapsed_time'
    ]),
                   trigger=log_trigger)

    print('start training')
    trainer.run()
Example No. 2
        # Loop over each filter height

        for i, filter_height in enumerate(self.filter_height_list):
            xcs[i] = F.relu(self[i](exs))
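            # Max-pool over every position a filter of this height can occupy
            # (assuming max_sentence_size is the padded input length), so each
            # feature map collapses to a single value.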
            chs[i] = F.max_pooling_2d(
                xcs[i], (self.max_sentence_size + 1 - filter_height))
        # Concatenate the convolution + pooling results
        h = F.concat(chs, axis=2)
        h = F.dropout(F.tanh(self[self.convolution_num + 0](h)))
        y = self[self.convolution_num + 1](h)
        return y


# Training
en_path = os.path.join("./", "train/body.txt")
source_vocab = ['<eos>', '<unk>', '<bos>'] + preprocess.count_words(
    en_path, 900)
source_data = preprocess.make_dataset(en_path, source_vocab)
source_ids = {word: index for index, word in enumerate(source_vocab)}
words = {i: w for w, i in source_ids.items()}
N = len(source_data)

words[len(words)] = "padding"

a = [0] * (1000)
b = [1] * (1000)
data_t = a + b

max_len = 0
for k in range(len(source_data)):
    if max_len < len(source_data[k]):
        max_len = len(source_data[k])
Example No. 3
def main():
    parser = argparse.ArgumentParser(
        description='Chainer example: Att_summary')
    parser.add_argument('--source',
                        '-s',
                        type=str,
                        default='test/body.txt',
                        help='source sentence list')
    parser.add_argument('--target',
                        '-t',
                        type=str,
                        default='test/sum.txt',
                        help='target sentence list')
    parser.add_argument('--source_valid',
                        type=str,
                        default='test/body.txt',
                        help='source sentence list for validation')
    parser.add_argument('--target_valid',
                        type=str,
                        default='test/sum.txt',
                        help='target sentence list for validation')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='number of sentence pairs in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=200,
                        help='number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='resume the training from snapshot')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=200,
                        help='number of units')
    parser.add_argument('--layer',
                        '-l',
                        type=int,
                        default=1,
                        help='number of layers')
    parser.add_argument('--log_interval',
                        type=int,
                        default=20,
                        help='number of iterations between log reports')
    parser.add_argument('--validation_interval',
                        type=int,
                        default=20,
                        help='number of iterations between evaluations of '
                        'the model on the validation dataset')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='directory to output the result')
    args = parser.parse_args()

    # Load pre-processed dataset
    print('[{}] Loading dataset... (this may take several minutes)'.format(
        datetime.datetime.now()))

    en_path = os.path.join("./", args.source)
    # The second argument is the vocabulary size
    source_vocab = ['<eos>', '<unk>', '<bos>'] + preprocess.count_words(
        en_path, 18000)
    source_data = preprocess.make_dataset(en_path, source_vocab)
    fr_path = os.path.join("./", args.target)
    target_vocab = ['<eos>', '<unk>', '<bos>'] + preprocess.count_words(
        fr_path, 18000)
    target_data = preprocess.make_dataset(fr_path, target_vocab)
    assert len(source_data) == len(target_data)
    print('Original training data size: %d' % len(source_data))
    train_data = [(s, t) for s, t in six.moves.zip(source_data, target_data)]
    print('Filtered training data size: %d' % len(train_data))

    source_ids = {word: index for index, word in enumerate(source_vocab)}
    target_ids = {word: index for index, word in enumerate(target_vocab)}
    # Invert the mapping: swap dict keys and values
    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    # Setup model
    stack = 1
    width = 3
    model = Att(args.layer, len(source_ids), len(target_ids), args.unit,
                args.batchsize, WEIGHT, stack, width)
    if args.gpu >= 0:
        chainer.backends.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)
    # Setup optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    # Setup iterator
    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    # Setup updater and trainer
    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       converter=seq2seq_pad_concat_convert,
                                       device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(extensions.LogReport(trigger=(1, 'epoch')))
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'elapsed_time'
    ]),
                   trigger=(1, 'epoch'))

    if args.source_valid and args.target_valid:
        en_path = os.path.join("./", args.source_valid)
        source_data = preprocess.make_dataset(en_path, source_vocab)
        fr_path = os.path.join("./", args.target_valid)
        target_data = preprocess.make_dataset(fr_path, target_vocab)
        assert len(source_data) == len(target_data)
        test_data = [(s, t)
                     for s, t in six.moves.zip(source_data, target_data)]
        test_source_unknown = calculate_unknown_ratio(
            [s for s, _ in test_data])
        test_target_unknown = calculate_unknown_ratio(
            [t for _, t in test_data])

        print('Validation data: %d' % len(test_data))
        print('Validation source unknown ratio: %.2f%%' %
              (test_source_unknown * 100))
        print('Validation target unknown ratio: %.2f%%' %
              (test_target_unknown * 100))

        @chainer.training.make_extension()
        def translate_one(trainer):
            # Case: generation from a training sentence (default is the 0th)

            a, b = map(list, zip(*train_data))
            source = a[21]
            target = b[21]
            result = model.translate(source)[0]
            """
            # Case: a randomly chosen test sentence
            source, target = test_data[numpy.random.choice(len(test_data))]
            result = model.translate([model.xp.array(source)])[0]
            """
            source_sentence = ' '.join([source_words[x] for x in source])
            target_sentence = ' '.join([target_words[y] for y in target])
            result_sentence = ' '.join([target_words[y] for y in result])
            print('# source : ' + source_sentence)
            print('# result : ' + result_sentence)
            print('# expect : ' + target_sentence)

            if WEIGHT:
                with open("weight/wei.txt", "a", encoding="utf-8") as f:
                    f.write("<body> <fos> " + str(source_sentence) + "\n")
                    f.write("<generation> <fos> " + str(result_sentence) +
                            "\n")

        def translate_all(trainer):
            a, b = map(list, zip(*test_data))
            for k in range(len(test_data)):
                source = a[k]
                result = model.translate(source)[0]
                result_sentence = ' '.join([target_words[y] for y in result])
                with open("summary/result.txt", "a", encoding="utf-8") as f:
                    f.write(str(result_sentence) + "\n")
            sys.exit(0)

        """
        if TRANS_ALL:
            trainer.extend(translate_all, trigger=(19, 'epoch'))
        """
        trainer.extend(translate_one,
                       trigger=(args.validation_interval, 'epoch'))
        if TRANS_ALL:
            trainer.extend(translate_all, trigger=(20, 'epoch'))
        test_iter = chainer.iterators.SerialIterator(test_data, args.batchsize,
                                                     False, False)
        trainer.extend(
            extensions.Evaluator(test_iter,
                                 model,
                                 converter=seq2seq_pad_concat_convert,
                                 device=args.gpu))
        trainer.extend(extensions.snapshot(), trigger=(10, 'epoch'))
        #trainer.extend(extensions.PlotReport(
        #    ['main/loss','validation/main/loss'] ,x_key='epoch', file_name='loss.png'))
    print('start training')
    """
    #save_model_load
    filename = "./result/snapshot_iter_779"
    serializers.load_npz(filename, trainer)   
    """
    trainer.run()
Example No. 4
def main():
    args = get_args()
    app = Flask(__name__)

    source_vocab = ['<eos>', '<unk>', '<bos>'] + \
        preprocess.count_words(args.source, args.source_vocab)
    source_data = preprocess.make_dataset(args.source, source_vocab)
    target_vocab = ['<eos>', '<unk>', '<bos>'] + \
        preprocess.count_words(args.target, args.target_vocab)
    target_data = preprocess.make_dataset(args.target, target_vocab)
    
    source_ids = {word: index for index, word in enumerate(source_vocab)}
    target_ids = {word: index for index, word in enumerate(target_vocab)}

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    # Run in test mode without building the computation graph. Note that
    # chainer.using_config / chainer.no_backprop_mode only take effect when
    # used as context managers, so set the global configuration instead.
    chainer.global_config.train = False
    chainer.global_config.enable_backprop = False
    model = net.Transformer(
        args.layer,
        min(len(source_ids), len(source_words)),
        min(len(target_ids), len(target_words)),
        args.unit,
        h=args.head,
        dropout=args.dropout,
        max_length=500,
        use_label_smoothing=args.use_label_smoothing,
        embed_position=args.embed_position)

    chainer.serializers.load_npz(args.model, model)

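    # '-Owakati' makes MeCab emit space-separated surface forms (wakati-gaki).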
    m = MeCab('-Owakati')

    def translate_one(source):
        words = preprocess.split_sentence(source)
        #print('# source : ' + ' '.join(words))
        x = model.xp.array(
            [source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x], beam=5)[0]
        words = [target_words[y] for y in ys]
        return ''.join(words)
    
    @app.route('/', methods=['GET', 'POST'])
    def post():
        title = '日経記事要約 by transformer'  # "Nikkei article summarization by transformer"
        if request.method == 'GET':
            message = '日経に関連する記事を入力してください'  # "Please enter a Nikkei-related article"
            return render_template('index.html',
                                   message=message, title=title)
        elif request.method == 'POST':
            body = request.form['body']
            body = m.parse(normalize(body))
            abst = translate_one(body)
            return render_template('index.html',
                                   body=body, title=title, abst=abst)
        else:
            return redirect(url_for('index'))
    
    app.debug = True
    app.run(host='localhost')
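
# A minimal client-side usage sketch (not part of the original example): with
# the server above running on Flask's default port 5000, the summarizer can be
# exercised over HTTP like this.
import requests

resp = requests.post('http://localhost:5000/',
                     data={'body': '<paste a Nikkei article here>'})
print(resp.status_code)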
Example No. 5
def main():
    parser = argparse.ArgumentParser(
        description='Chainer example: Transformer seq2seq')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=48,
                        help='Number of sentence pairs in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=512,
                        help='Number of units')
    parser.add_argument('--layer',
                        '-l',
                        type=int,
                        default=6,
                        help='Number of layers')
    parser.add_argument('--head',
                        type=int,
                        default=8,
                        help='Number of heads in attention mechanism')
    parser.add_argument('--dropout',
                        '-d',
                        type=float,
                        default=0.1,
                        help='Dropout rate')
    parser.add_argument('--model', type=str, required=True, help='trained model')
    parser.add_argument('--input',
                        '-i',
                        type=str,
                        default='./',
                        help='Input directory')
    parser.add_argument('--source',
                        '-s',
                        type=str,
                        default='europarl-v7.fr-en.en',
                        help='Filename of train data for source language')
    parser.add_argument('--target',
                        '-t',
                        type=str,
                        default='europarl-v7.fr-en.fr',
                        help='Filename of train data for target language')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--source-vocab',
                        type=int,
                        default=40000,
                        help='Vocabulary size of source language')
    parser.add_argument('--target-vocab',
                        type=int,
                        default=40000,
                        help='Vocabulary size of target language')
    parser.add_argument('--no-bleu',
                        '-no-bleu',
                        action='store_true',
                        help='Skip BLEU calculation')
    parser.add_argument('--use-label-smoothing',
                        action='store_true',
                        help='Use label smoothing for cross entropy')
    parser.add_argument('--embed-position',
                        action='store_true',
                        help='Use position embedding rather than sinusoid')
    parser.add_argument('--use-fixed-lr',
                        action='store_true',
                        help='Use fixed learning rate rather than the ' +
                        'annealing proposed in the paper')
    parser.add_argument('--disable-mecab',
                        '--dm',
                        action='store_true',
                        help='Disable MeCab tokenization')
    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=4))

    # Check file
    en_path = os.path.join(args.input, args.source)
    source_vocab = ['<eos>', '<unk>', '<bos>'] + \
        preprocess.count_words(en_path, args.source_vocab)
    source_data = preprocess.make_dataset(en_path, source_vocab)
    fr_path = os.path.join(args.input, args.target)
    target_vocab = ['<eos>', '<unk>', '<bos>'] + \
        preprocess.count_words(fr_path, args.target_vocab)
    # print('Original training data size: %d' % len(source_data))
    # print('Filtered training data size: %d' % len(train_data))

    source_ids = {word: index for index, word in enumerate(source_vocab)}
    target_ids = {word: index for index, word in enumerate(target_vocab)}

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    m = MeCab('-Owakati')

    # Define Model
    model = net.Transformer(args.layer,
                            min(len(source_ids), len(source_words)),
                            min(len(target_ids), len(target_words)),
                            args.unit,
                            h=args.head,
                            dropout=args.dropout,
                            max_length=500,
                            use_label_smoothing=args.use_label_smoothing,
                            embed_position=args.embed_position)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    chainer.serializers.load_npz(args.model, model)

    def translate_one(source, target):
        words = preprocess.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array([source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x], beam=5)[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

    def tokenize(source, target):
        if args.disable_mecab:
            return source, target
        return m.parse(source), m.parse(target)

    while True:
        source = input('source> ')
        target = input('target> ')
        source, target = tokenize(source, target)
        translate_one(source, target)