Example #1
def infer(args):
    states = {}
    score_manager = ScoreManager(args.model_path, 'evaluation_score')

    if args.cuda:
        logger.info('[PARAM] Enabling CUDA')
        assert torch.cuda.is_available()

    logger.info("[DATASET] Preparing dataset...")
    train_iter, val_iter, test_iter, fields = data_utils.load_dataset(args, is_eval=True)
    if 'src' in fields:
        src_vocab_size, tgt_vocab_size = len(fields['src'].vocab), len(fields['tgt'].vocab)
    else:
        tgt_vocab_size = len(fields['tgt'].vocab)
    logger.info("[VALIDATION]: #Batches=%d (#Cases:%d))" % (len(val_iter), len(val_iter.dataset)))
    logger.info("[TEST]: #Batches=%d (#Cases:%d))" % (len(test_iter), len(test_iter.dataset)))

    logger.info("[PARAM] Setting vocab sizes")
    if args.copy:
        vocab_offset = args.max_copy_token_num
    else:
        vocab_offset = 0

    if args.field_copy:
        vocab_offset += args.max_kw_pairs_num

    if 'src' in fields:
        args.__setattr__('src_vocab_size', src_vocab_size - vocab_offset)
        args.__setattr__('src_tag_vocab_size', len(fields['src_tag'].vocab))
    args.__setattr__('tgt_vocab_size', tgt_vocab_size - vocab_offset)
    args.__setattr__('tgt_vocab_size_with_offsets', tgt_vocab_size)
    if 'attribute_key' in fields:
        args.__setattr__('field_vocab_size', len(fields['attribute_key'].vocab))
        args.__setattr__('field_word_vocab_size', len(fields['attribute_word'].vocab))
        if args.field_tag_usage != 'none':
            args.__setattr__('field_pos_tag_vocab_size', len(fields['attribute_word_tag'].vocab))
    set_dynamic_vocabs(args, fields)

    generation_name = model_helper.name_a_generation(args, 'test')
    file_prefix = os.path.join(args.model_path, generation_name)

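    # With args.skip_infer set, decoding is skipped and only the standard evaluation below is
    # run (presumably over generations previously written under file_prefix).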
    if not args.skip_infer:
        logger.info("[MODEL] Preparing model...")
        seq2seq = InfoSeq2Seq.build_s2s_model(args, fields.get('src', None), fields['tgt'])
        model_helper.show_parameters(seq2seq)
        if args.cuda:
            seq2seq = seq2seq.to('cuda')
        logger.info(seq2seq)

        # Load model
        is_loaded = model_helper.try_restore_model(args.model_path, seq2seq, None, states, best_model=args.use_best_model)

        if not is_loaded:
            logger.info("[PARAM] Could not load a trained model!")
            return

        # Test Set
        if args.field_copy:
            raw_queries, raw_responses, raw_fields = data_utils.load_examples(args.test_data_path_prefix, field_words=True)
        else:
            raw_queries, raw_responses = data_utils.load_examples(args.test_data_path_prefix)
            raw_fields = [None] * len(raw_queries)
        output_queries, output_responses, output_scores, output_generations, test_loss, test_ppl_macro = \
            inference(args, seq2seq, test_iter, tgt_vocab_size, fields.get('src', None), fields['tgt'])
        model_helper.write_results(args, file_prefix, raw_queries, raw_responses, output_scores, output_generations,
                                   raw_fields=raw_fields)

    logger.info('[STD Evaluation] Evaluating the test generations...')
    res_dict = std_eval_with_args(args, file_prefix, 'test')
    score_manager.update_group(generation_name, res_dict)

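    # Loss/perplexity from inference() are only recorded for greedy decoding (beam_width == 0)
    # and only when inference was actually run.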
    if args.beam_width == 0 and not args.skip_infer:
        test_ppl = math.exp(test_loss)
        score_manager.update('infer_b%d_test_loss' % args.beam_width, test_loss)
        score_manager.update('infer_b%d_test_ppl' % args.beam_width, test_ppl)
        score_manager.update('infer_b%d_test_macro_ppl' % args.beam_width, test_ppl_macro)
Example #2
def train(args):
    states = {
        'val_loss': [],
        'val_ppl': [],
        'lr': args.lr,
        'epoch': 0,
        'best_epoch': 0,
        'best_val_loss': -1,
    }
    if args.cuda:
        logger.info('[PARAM] Enabling CUDA')
        assert torch.cuda.is_available()

    logger.info("[DATASET] Preparing dataset...")
    train_iter, val_iter, test_iter, fields = data_utils.load_dataset(args)
    if 'src' in fields:
        src_vocab_size, tgt_vocab_size = len(fields['src'].vocab), len(fields['tgt'].vocab)
    else:
        tgt_vocab_size = len(fields['tgt'].vocab)
    logger.info("[TRAIN]: #Batches=%d (#Cases:%d))" % (len(train_iter), len(train_iter.dataset)))
    logger.info("[VALIDATION]: #Batches=%d (#Cases:%d))" % (len(val_iter), len(val_iter.dataset)))
    logger.info("[TEST]: #Batches=%d (#Cases:%d))" % (len(test_iter), len(test_iter.dataset)))

    logger.info("[PARAM] Setting vocab sizes")
    if args.copy:
        vocab_offset = args.max_copy_token_num
    else:
        vocab_offset = 0

    if args.field_copy:
        vocab_offset += args.max_kw_pairs_num


    args.__setattr__('tgt_vocab_size', tgt_vocab_size - vocab_offset)
    if 'attribute_key' in fields:
        args.__setattr__('field_vocab_size', len(fields['attribute_key'].vocab))
    if 'src' in fields:
        args.__setattr__('src_vocab_size', src_vocab_size - vocab_offset)
        args.__setattr__('src_tag_vocab_size', len(fields['src_tag'].vocab))
    if 'attribute_word' in fields:
        args.__setattr__('field_word_vocab_size', len(fields['attribute_word'].vocab))
    if 'attribute_word' in fields and args.field_tag_usage != 'none':
        args.__setattr__('field_pos_tag_vocab_size', len(fields['attribute_word_tag'].vocab))
    args.__setattr__('tgt_vocab_size_with_offsets', tgt_vocab_size)
    set_dynamic_vocabs(args, fields)
    logger.info('[VOCAB] tgt_vocab_size_with_offsets = ' + str(tgt_vocab_size))


    logger.info("[MODEL] Preparing model...")
    seq2seq = InfoSeq2Seq.build_s2s_model(args, fields.get('src', None), fields['tgt'])

    if args.cuda:
        seq2seq.to('cuda')

    model_helper.show_parameters(seq2seq)
    if args.init_lr > 0:
        logger.info("[LR] Using init LR %f" % args.init_lr)
        optimizer = optim.Adam(seq2seq.parameters(), lr=args.init_lr)
        states['lr'] = args.init_lr
    else:
        optimizer = optim.Adam(seq2seq.parameters(), lr=args.lr)
        states['lr'] = args.lr
    logger.info(seq2seq)

    # Load model
    is_loaded = model_helper.try_restore_model(args.model_path, seq2seq, optimizer, states, best_model=False)
    if not is_loaded:
        logger.info("[PARAM] Using fresh params")
        model_helper.init_network(seq2seq, args.init)
        if args.init_word_vecs:
            assert args.pre_embed_dim == args.embed_size
            logger.info("[EMBED] Loading the pre-trained word_embeddings")
            # Use pre-trained embeddings
            assert args.embed_size == args.field_key_embed_size
            if 'attribute_key' in fields:
                data_utils.load_pretrain_embeddings(seq2seq.field_encoder.field_key_embed, fields['attribute_key'],
                                                    args.pre_embed_file, args.embed_size, char2word='avg',
                                                    suffix=args.dataset_version + 'attribute_key')
                if args.field_word_vocab_path != 'none':
                    data_utils.load_pretrain_embeddings(seq2seq.field_encoder.field_word_embedding,
                                                        fields['attribute_word'],
                                                        args.pre_embed_file, args.embed_size, char2word='avg',
                                                        suffix=args.dataset_version + 'attribute_word')
                if 'sub_attribute_key' in fields:
                    data_utils.load_pretrain_embeddings(seq2seq.field_encoder.sub_field_key_char_encoder.sub_embed,
                                                        fields.get('sub_attribute_key', None),
                                                        args.pre_embed_file, args.embed_size,
                                                        suffix=args.dataset_version + 'sub_attribute_key')
                if 'sub_attribute_word' in fields:
                    data_utils.load_pretrain_embeddings(seq2seq.field_encoder.sub_field_word_char_encoder.sub_embed,
                                                        fields.get('sub_attribute_word', None),
                                                        args.pre_embed_file, args.embed_size,
                                                        suffix=args.dataset_version + 'sub_attribute_word')
            if 'src' in fields:
                data_utils.load_pretrain_embeddings(seq2seq.src_embed, fields.get('src', None),
                                                    args.pre_embed_file, args.embed_size,
                                                    suffix=args.dataset_version+'src')
            if 'sub_src' in fields:
                data_utils.load_pretrain_embeddings(seq2seq.sub_src_embed, fields.get('sub_src', None),
                                                    args.pre_embed_file, args.embed_size,
                                                    suffix=args.dataset_version+'subsrc')
            if not args.share_embedding or 'src' not in fields:
                data_utils.load_pretrain_embeddings(seq2seq.dec_embed, fields['tgt'], args.pre_embed_file,
                                                    args.embed_size, suffix=args.dataset_version+'tgt')


    start_epoch = states['epoch']
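    # Epoch loop: when alignment dropout is used together with a copy mechanism, the dataset
    # is reloaded (re-aligned) at the start of every later epoch; training stops early once no
    # new best epoch has been seen for more than two epochs.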
    for epoch in range(start_epoch + 1, args.epochs + 1):
        if epoch != start_epoch + 1 and (args.align_dropout > 0.0 and (args.copy or args.field_copy)):
            logger.info("[Dataset] Reloading & Aligning the dataset")
            train_iter, val_iter, test_iter, fields = data_utils.load_dataset(args)
        if epoch - states['best_epoch'] > 2:
            logger.info('[STOP] Early Stopped !')
            break
        start_time = time.time()
        num_batches = len(train_iter)
        logger.info('[NEW EPOCH] %d/%d, num of batches : %d' % (epoch, args.epochs, num_batches))
        train_epoch(args, epoch, seq2seq, optimizer, train_iter,
                    tgt_vocab_size, fields.get('src', None), fields['tgt'])
        val_loss, val_ppl_macro = evaluate(args, seq2seq, val_iter, tgt_vocab_size, fields.get('src', None), fields['tgt'])
        val_ppl = math.exp(val_loss)
        test_loss, test_ppl_macro = evaluate(args, seq2seq, test_iter, tgt_vocab_size, fields.get('src', None), fields['tgt'])
        test_ppl = math.exp(test_loss)

        logger.info("[Epoch:%d] val_loss:%5.3f | val_pp:%5.2f/%5.2f | test_loss:%5.3f | test_pp:%5.2f/%5.2f"
                    % (epoch, val_loss, val_ppl, val_ppl_macro, test_loss, test_ppl, test_ppl_macro))
        time_diff = (time.time() - start_time) / 3600.0
        logger.info("[Epoch:%d] epoch time:%.2fH, est. remaining  time:%.2fH" %
                    (epoch, time_diff, time_diff * (args.epochs - epoch)))

        # Adjusting learning rate
        states['val_ppl'].append(val_ppl)
        states['val_loss'].append(val_loss)
        states['epoch'] = epoch
        if len(states['val_ppl']) >= 2:
            logger.info('[TRAINING] last->now valid ppl : %.3f->%.3f' % (states['val_ppl'][-2], states['val_ppl'][-1]))

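        # LR schedule: decay whenever validation ppl fails to improve; otherwise, if the
        # initial LR (init_lr) is still above args.lr, decay it toward args.lr once
        # init_lr_decay_epoch has been reached.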
        if len(states['val_ppl']) >= 2 and states['val_ppl'][-1] >= states['val_ppl'][-2]:
            logger.info('[TRAINING] Adjusting learning rate because validation ppl did not improve')
            new_lr = model_helper.adjust_learning_rate(optimizer, rate=args.lr_decay_rate)
            states['lr'] = new_lr
        else:
            if args.init_lr > 0 and epoch >= args.init_lr_decay_epoch - 1 and states['lr'] > args.lr:
                logger.info('[TRAINING] Decaying the initial learning rate (decay starts at epoch %d, next epoch is %d)' %
                            (args.init_lr_decay_epoch, epoch+1))
                new_lr = model_helper.adjust_learning_rate(optimizer, rate=args.lr_decay_rate, min_value=args.lr)
                states['lr'] = new_lr

        # Save the model if the validation loss is the best we've seen so far.
        if states['best_val_loss'] == -1 or val_ppl < states['best_val_loss']:
            logger.info('[CHECKPOINT] New best valid ppl : %.3f->%.2f' % (states['best_val_loss'], val_ppl))
            model_helper.save_model(args.model_path, epoch, val_ppl, seq2seq, optimizer, args, states, best_model=True,
                                    clear_history=True)
            states['best_val_loss'] = val_ppl
            states['best_epoch'] = epoch

        # Saving standard model
        model_helper.save_model(args.model_path, epoch, val_ppl, seq2seq, optimizer, args, states, best_model=False,
                                clear_history=True)
Example #3
def eval(args):
    states = {}
    score_manager = ScoreManager(args.model_path, 'evaluation_score')
    if args.cuda:
        logger.info('[PARAM] Enabling CUDA')
        assert torch.cuda.is_available()

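    # The training iterator is returned by load_dataset but is not used during evaluation.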
    logger.info("[DATASET] Preparing dataset...")
    train_iter, val_iter, test_iter, fields = data_utils.load_dataset(args, is_eval=True)
    if 'src' in fields:
        src_vocab_size, tgt_vocab_size = len(fields['src'].vocab), len(fields['tgt'].vocab)
    else:
        tgt_vocab_size = len(fields['tgt'].vocab)
    logger.info("[VALIDATION]: #Batches=%d (#Cases:%d))" % (len(val_iter), len(val_iter.dataset)))
    logger.info("[TEST]: #Batches=%d (#Cases:%d))" % (len(test_iter), len(test_iter.dataset)))

    logger.info("[PARAM] Setting vocab sizes")
    if args.copy:
        vocab_offset = args.max_copy_token_num
    else:
        vocab_offset = 0

    if args.field_copy:
        vocab_offset += args.max_kw_pairs_num
    if 'src' in fields:
        args.__setattr__('src_vocab_size', src_vocab_size - vocab_offset)
        args.__setattr__('src_tag_vocab_size', len(fields['src_tag'].vocab))
    args.__setattr__('tgt_vocab_size', tgt_vocab_size - vocab_offset)
    args.__setattr__('tgt_vocab_size_with_offsets', tgt_vocab_size)
    if 'attribute_key' in fields:
        args.__setattr__('field_vocab_size', len(fields['attribute_key'].vocab))
        args.__setattr__('field_word_vocab_size', len(fields['attribute_word'].vocab))
        if args.field_tag_usage != 'none':
            args.__setattr__('field_pos_tag_vocab_size', len(fields['attribute_word_tag'].vocab))
    set_dynamic_vocabs(args, fields)
    logger.info("[MODEL] Preparing model...")
    seq2seq = InfoSeq2Seq.build_s2s_model(args, fields.get('src', None), fields['tgt'])
    model_helper.show_parameters(seq2seq)
    if args.cuda:
        seq2seq = seq2seq.to('cuda')
    logger.info(seq2seq)

    # Load model
    is_loaded = model_helper.try_restore_model(args.model_path, seq2seq, None, states, best_model=args.use_best_model)
    if not is_loaded:
        logger.info("[PARAM] Could not load a trained model!")
        return
    else:
        val_loss, val_ppl_macro = evaluate(args, seq2seq, val_iter, tgt_vocab_size, fields.get('src', None), fields['tgt'])
        val_ppl = math.exp(val_loss)
        test_loss, test_ppl_macro = evaluate(args, seq2seq, test_iter, tgt_vocab_size, fields.get('src', None), fields['tgt'])
        test_ppl = math.exp(test_loss)
        logger.info("[Epoch:%d] val_loss:%5.3f | val_pp:%5.2f/%5.2f | test_loss:%5.3f | test_pp:%5.2f/%5.2f"
                    % (states['epoch'], val_loss, val_ppl, val_ppl_macro, test_loss, test_ppl, test_ppl_macro))

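        # Record the evaluation metrics with the ScoreManager (stored under args.model_path).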
        score_manager.update('eval_val_loss', val_loss)
        score_manager.update('eval_val_ppl', val_ppl)
        score_manager.update('eval_val_macro_ppl', val_ppl_macro)
        score_manager.update('eval_test_loss', test_loss)
        score_manager.update('eval_test_ppl', test_ppl)
        score_manager.update('eval_test_macro_ppl', test_ppl_macro)
Example #4
def train(args):
    states = {
        'val_loss': [],
        'val_ppl': [],
        'lr': args.lr,
        'epoch': 0,
        'best_epoch': 0,
        'best_val_loss': -1,
    }
    if args.cuda:
        logger.info('[PARAM] Enabling CUDA')
        assert torch.cuda.is_available()

    logger.info("[DATASET] Preparing dataset...")
    train_iter, val_iter, test_iter, src_field, tgt_field = data_utils.load_dataset(
        args)
    src_vocab_size, tgt_vocab_size = len(src_field.vocab), len(tgt_field.vocab)
    logger.info("[TRAIN]: #Batches=%d (#Cases:%d))" %
                (len(train_iter), len(train_iter.dataset)))
    logger.info("[VALIDATION]: #Batches=%d (#Cases:%d))" %
                (len(val_iter), len(val_iter.dataset)))
    logger.info("[TEST]: #Batches=%d (#Cases:%d))" %
                (len(test_iter), len(test_iter.dataset)))

    logger.info("[PARAM] Setting vocab sizes")
    if args.copy:
        vocab_offset = args.max_copy_token_num
    else:
        vocab_offset = 0
    args.__setattr__('src_vocab_size', src_vocab_size - vocab_offset)
    args.__setattr__('tgt_vocab_size', tgt_vocab_size - vocab_offset)
    args.__setattr__('tgt_vocab_size_with_offsets', tgt_vocab_size)

    logger.info("[MODEL] Preparing model...")
    seq2seq = Seq2Seq.build_s2s_model(args, src_field, tgt_field)

    if args.cuda:
        seq2seq.to('cuda')

    model_helper.show_parameters(seq2seq)
    optimizer = optim.Adam(seq2seq.parameters(), lr=args.lr)
    logger.info(seq2seq)

    # Load model
    is_loaded = model_helper.try_restore_model(args.model_path,
                                               seq2seq,
                                               optimizer,
                                               states,
                                               best_model=False)
    if not is_loaded:
        logger.info("[PARAM] Using fresh params")
        model_helper.init_network(seq2seq, args.init)
        if args.init_word_vecs:
            assert args.pre_embed_dim == args.embed_size
            logger.info("[EMBED] Loading the pre-trained word_embeddings")
            data_utils.load_pretrain_embeddings(seq2seq.src_embed, src_field,
                                                args.pre_embed_file,
                                                args.embed_size)
            if not args.share_embedding:
                data_utils.load_pretrain_embeddings(seq2seq.tgt_embed,
                                                    tgt_field,
                                                    args.pre_embed_file,
                                                    args.embed_size)

    start_epoch = states['epoch']
    for epoch in range(start_epoch + 1, args.epochs + 1):
        if epoch - states['best_epoch'] > 2:
            logger.info('[STOP] Early Stopped !')
            break
        start_time = time.time()
        num_batches = len(train_iter)
        logger.info('[NEW EPOCH] %d/%d, num of batches : %d' %
                    (epoch, args.epochs, num_batches))
        train_epoch(args, epoch, seq2seq, optimizer, train_iter,
                    tgt_vocab_size, src_field, tgt_field)
        val_loss, val_ppl_macro = evaluate(args, seq2seq, val_iter,
                                           tgt_vocab_size, src_field,
                                           tgt_field)
        val_ppl = math.exp(val_loss)
        test_loss, test_ppl_macro = evaluate(args, seq2seq, test_iter,
                                             tgt_vocab_size, src_field,
                                             tgt_field)
        test_ppl = math.exp(test_loss)

        logger.info(
            "[Epoch:%d] val_loss:%5.3f | val_pp:%5.2f/%5.2f | test_loss:%5.3f | test_pp:%5.2f/%5.2f"
            % (epoch, val_loss, val_ppl, val_ppl_macro, test_loss, test_ppl,
               test_ppl_macro))
        time_diff = (time.time() - start_time) / 3600.0
        logger.info("[Epoch:%d] epoch time:%.2fH, est. remaining  time:%.2fH" %
                    (epoch, time_diff, time_diff * (args.epochs - epoch)))

        # Adjusting learning rate
        states['val_ppl'].append(val_ppl)
        states['val_loss'].append(val_loss)
        states['epoch'] = epoch
        if len(states['val_ppl']) >= 2:
            logger.info('[TRAINING] last->now valid ppl : %.3f->%.3f' %
                        (states['val_ppl'][-2], states['val_ppl'][-1]))

        if len(states['val_ppl']) >= 2:
            if states['val_ppl'][-1] >= states['val_ppl'][-2]:
                logger.info(
                    '[TRAINING] Adjusting learning rate because validation ppl did not improve'
                )
                new_lr = model_helper.adjust_learning_rate(optimizer)
                states['lr'] = new_lr

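        # Checkpointing: a separate 'best' snapshot is kept in addition to the latest one;
        # clear_history presumably prunes older checkpoint files.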
        # Save the model if the validation loss is the best we've seen so far.
        if states['best_val_loss'] == -1 or val_ppl < states['best_val_loss']:
            logger.info('[CHECKPOINT] New best valid ppl : %.3f->%.2f' %
                        (states['best_val_loss'], val_ppl))
            model_helper.save_model(args.model_path,
                                    epoch,
                                    val_ppl,
                                    seq2seq,
                                    optimizer,
                                    args,
                                    states,
                                    best_model=True,
                                    clear_history=True)
            states['best_val_loss'] = val_ppl
            states['best_epoch'] = epoch

        # Saving standard model
        model_helper.save_model(args.model_path,
                                epoch,
                                val_ppl,
                                seq2seq,
                                optimizer,
                                args,
                                states,
                                best_model=False,
                                clear_history=True)