Example #1
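A constructor for a DyNet-based dependency parser: it seeds the RNG, creates a ParameterCollection with its trainer, builds the sequence and tree encoders through helper factories, and allocates orthonormally initialized MLP projections plus a relation-scoring weight matrix.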
	def __init__(self, vocab, options):
		random.seed(1)
		self.model = dy.ParameterCollection()
		self.trainer = helpers.get_trainer(options, self.model)
		self.get_violation = helpers.update_method(options)
		
		word_count = vocab.word_freq
		word_vocab = vocab.wordlookup_tbl
		pos_vocab = vocab.poslookup_tbl
		rel_vocab = vocab.rellookup_tbl
		self.rels = rel_vocab

		self._enc = helpers.get_encoder(self.model, options, word_count, 
										word_vocab, pos_vocab)

		self._tree_enc = TreeEncoder.get_tree_encoder(self.model, 
												options, rel_vocab)


		self.mlp_rel_size = options.mlp_rel_size
		self.hidden_dim = options.compos_outdim
		W = orthonormal_initializer(self.mlp_rel_size, self.hidden_dim)
		self.mlp_dep_W = self.model.parameters_from_numpy(W)
		self.mlp_head_W = self.model.parameters_from_numpy(W)
		self.mlp_dep_b = self.model.add_parameters((self.mlp_rel_size,), init=dy.ConstInitializer(0.))
		self.mlp_head_b = self.model.add_parameters((self.mlp_rel_size,), init=dy.ConstInitializer(0.))
		
		# self.dropout_mlp = options.dropout_mlp
		self.rel_W = self.model.add_parameters(
			(len(rel_vocab) * (self.mlp_rel_size + 1), self.mlp_rel_size + 1),
			init=dy.ConstInitializer(0.))

		self._train_flag = True
		self.oracle = options.oracle
		self.exploration_rate = options.exploration_rate
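The constructor relies on an orthonormal_initializer helper that the listing does not show. A minimal sketch of such a helper, assuming the common recipe of orthonormalizing a random Gaussian matrix via SVD (the original codebase may use a different scheme):

import numpy as np

def orthonormal_initializer(output_size, input_size):
    # Draw a random Gaussian matrix and project it onto the nearest
    # (semi-)orthogonal matrix via SVD, so the initial linear map
    # roughly preserves norms (cf. Saxe et al., 2013).
    W = np.random.randn(output_size, input_size)
    u, _, vt = np.linalg.svd(W, full_matrices=False)
    return u.dot(vt).astype(np.float32)

Note that the shape of rel_W, (len(rel_vocab) * (mlp_rel_size + 1), mlp_rel_size + 1), is what a biaffine label classifier in the style of Dozat and Manning (2017) expects once a constant 1 is appended to both the head and dependent MLP outputs.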
Example #2
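A close variant of Example #1 that delegates relation scoring to a helpers.get_scorer component instead of allocating the classifier parameters inline.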
    def __init__(self, vocab, options):
        random.seed(1)
        self.model = dy.ParameterCollection()
        self.trainer = helpers.get_trainer(options, self.model)
        self.get_violation = helpers.update_method(options)

        word_count = vocab.word_freq
        word_vocab = vocab.wordlookup_tbl
        pos_vocab = vocab.poslookup_tbl
        rel_vocab = vocab.rellookup_tbl
        self.rels = rel_vocab
        self.__enc = helpers.get_encoder(self.model, options, word_count,
                                         word_vocab, pos_vocab)
        self.__scr = helpers.get_scorer(self.model, options, rel_vocab)
        self.__tree_enc = TreeEncoder.get_tree_encoder(self.model, options,
                                                       rel_vocab)
        self.__train_flag = True
        self.oracle = options.oracle
        self.exploration_rate = options.exploration_rate
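Both constructors obtain their optimizer from helpers.get_trainer, which is also not shown. A plausible minimal sketch, assuming the helper simply maps an option string to a DyNet trainer (the option name and the set of supported values are illustrative):

import dynet as dy

def get_trainer(options, model):
    # Hypothetical factory: choose a DyNet optimizer by name and
    # attach it to the parameter collection.
    trainers = {
        'sgd': dy.SimpleSGDTrainer,
        'momentum': dy.MomentumSGDTrainer,
        'adagrad': dy.AdagradTrainer,
        'adam': dy.AdamTrainer,
    }
    if options.trainer not in trainers:
        raise ValueError('Unknown trainer: %s' % options.trainer)
    return trainers[options.trainer](model)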
Example #3
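An end-to-end training loop for a sequence-to-sequence translation model: it reads and length-filters parallel corpora, builds the model and optimizer, then interleaves batched updates with periodic training-loss reports, validation-perplexity checks, and BLEU-based early stopping.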
def train(options):
    log = helpers.Logger(options.verbose)
    timer = helpers.Timer()
    # Load data =========================================================
    log.info('Reading corpora')
    # Read vocabs
    vocab = helpers.get_dictionaries(options)
    src_dic, trg_dic = vocab['src'], vocab['trg']
    # Read training
    train_src_data = src_dic.read_corpus(options.train_src)
    train_trg_data = trg_dic.read_corpus(options.train_dst)

    max_src_len, max_trg_len = options.max_src_len, options.max_trg_len
    if max_src_len > 0 or max_trg_len > 0:
        train_src_data, train_trg_data = corpus_filter(train_src_data,
                                                       train_trg_data,
                                                       max_src_len,
                                                       max_trg_len)
        assert len(train_src_data) == len(train_trg_data), \
            'Source and target corpora must have the same number of sentences'
    # Read validation
    valid_src_data = src_dic.read_corpus(options.valid_src)
    valid_trg_data = trg_dic.read_corpus(options.valid_dst)
    # Validation output
    if not options.valid_out:
        options.valid_out = helpers.exp_filename(options, 'valid.out')
    # Get target language model
    lang_model = helpers.get_language_model(options, train_trg_data,
                                            trg_dic.size())
    # Create model ======================================================
    log.info('Creating model')
    s2s = helpers.build_model(options, vocab, lang_model)

    # Trainer ==========================================================
    trainer = helpers.get_trainer(options, s2s)
    log.info('Using ' + options.trainer + ' optimizer')
    # Print configuration ===============================================
    if options.verbose:
        Opts.print_config(options,
                          src_dict_size=src_dic.size(),
                          trg_dict_size=trg_dic.size())
    # Create batch loaders ==============================================
    log.info('Creating batch loaders')
    trainbatchloader = BatchLoader(train_src_data, train_trg_data,
                                   options.batch_size)
    devbatchloader = BatchLoader(valid_src_data, valid_trg_data,
                                 options.dev_batch_size)
    # Start training ====================================================
    log.info('Starting training')
    timer.restart()
    train_loss = 0.
    processed = 0
    best_bleu = -1
    bleu = -1
    deadline = 0
    i = 0
    for epoch in range(options.num_epochs):
        for x, y in trainbatchloader:
            s2s.set_train_flag()
            processed += sum(map(len, y))
            bsize = len(y)
            # Compute loss
            loss = s2s.calculate_loss(x, y)
            # Backward pass and parameter update
            train_loss += loss.scalar_value() * bsize

            loss.backward()
            trainer.update()

            if (i + 1) % options.check_train_error_every == 0:
                # Check average training error from time to time
                logloss = train_loss / processed
                ppl = np.exp(logloss)
                trainer.status()
                log.info(
                    " Training_loss=%f, ppl=%f, time=%f s, tokens processed=%d"
                    % (logloss, ppl, timer.tick(), processed))
                train_loss = 0
                processed = 0

            if (i + 1) % options.check_valid_error_every == 0:
                # Check generalization error on the validation set from time to time
                s2s.set_test_flag()
                dev_loss = 0
                dev_processed = 0
                timer.restart()
                for x, y in devbatchloader:
                    dev_processed += sum(map(len, y))
                    bsize = len(y)
                    loss = s2s.calculate_loss(x, y)
                    dev_loss += loss.scalar_value() * bsize
                dev_logloss = dev_loss / dev_processed
                dev_ppl = np.exp(dev_logloss)
                log.info(
                    "[epoch %d] Dev loss=%f, ppl=%f, time=%f s, tokens processed=%d"
                    %
                    (epoch, dev_logloss, dev_ppl, timer.tick(), dev_processed))

            if (i + 1) % options.valid_bleu_every == 0:
                # Check BLEU score on the validation set from time to time
                s2s.set_test_flag()
                log.info('Start translating validation set, buckle up!')
                timer.restart()
                with open(options.valid_out, 'w+') as fp:
                    for x in valid_src_data:
                        y_hat = s2s.translate(x, beam_size=options.beam_size)
                        translation = [
                            trg_dic.get_word(w) for w in y_hat[1:-1]
                        ]
                        fp.write(' '.join(translation))
                        fp.write('\n')
                bleu, details = evaluation.bleu_score(options.valid_dst,
                                                      options.valid_out)
                log.info('Finished translating validation set, %.2f s elapsed.'
                         % timer.tick())
                log.info(details)
                # Early stopping : save the latest best model
                if bleu > best_bleu:
                    best_bleu = bleu
                    log.info('Best BLEU score up to date, saving model to %s' %
                             options.model)
                    s2s.save(options.model)
                    deadline = 0
                else:
                    deadline += 1
                if options.patience > 0 and deadline > options.patience:
                    log.info('No improvement for %d validation rounds, early '
                             'stopping with best validation BLEU score: %.3f' %
                             (deadline, best_bleu))
                    sys.exit()
            i += 1

    # Save the final model state once training finishes
    s2s.save(options.model)
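The loop consumes BatchLoader objects that yield (x, y) batches of parallel sentences once per epoch. A minimal sketch of a loader with that interface, assuming parallel lists of index sequences (not necessarily the original implementation):

import random

class BatchLoader(object):
    def __init__(self, src_data, trg_data, batch_size):
        self.src, self.trg, self.bsize = src_data, trg_data, batch_size

    def __iter__(self):
        # Reshuffle the sentence pairs on every pass, then emit
        # fixed-size batches of (source, target) lists.
        order = list(range(len(self.src)))
        random.shuffle(order)
        for start in range(0, len(order), self.bsize):
            idx = order[start:start + self.bsize]
            yield ([self.src[i] for i in idx],
                   [self.trg[i] for i in idx])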
Example #4
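Per-user adaptation evaluation: for each user's file pair the script splits off a test set, translates it with the reset baseline model, fine-tunes on the user's training sentences, and then compares baseline and adapted outputs with BLEU and paired bootstrap resampling.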
def eval_user_adaptation(opt):
    log = utils.Logger(opt.verbose)
    timer = utils.Timer()
    # Read vocabs
    lexicon = helpers.get_lexicon(opt)
    # Read data
    filepairs = load_user_filepairs(opt.usr_file_list)
    # Get target language model
    lang_model = None
    # Load model
    s2s = helpers.build_model(opt, lexicon, lang_model, test=True)
    if opt.update_mode == 'mixture_weights' and opt.user_recognizer != 'fact_voc':
        log.info('Updating only the mixture weights doesn\'t make sense here')
        exit()
    s2s.lm = lexicon.trg_unigrams
    #    s2s.freeze_parameters()
    # Trainer
    trainer = helpers.get_trainer(opt, s2s)
    # print config
    if opt.verbose:
        options.print_config(opt,
                             src_dict_size=len(lexicon.w2ids),
                             trg_dict_size=len(lexicon.w2idt))
    # This will store translations and gold sentences
    base_translations = []
    adapt_translations = []
    gold = []
    # Run training
    for usr_id, (src_file, trg_file) in enumerate(filepairs):
        log.info('Evaluating on file pair %s' %
                 os.path.basename(src_file).split()[0])
        # Load file pair
        src_data = data.read_corpus(src_file, lexicon.w2ids, raw=True)
        trg_data = data.read_corpus(trg_file, lexicon.w2idt, raw=True)
        # split train/test
        train_src, test_src, train_trg, test_trg, order = split_user_data(
            src_data, trg_data, n_test=opt.n_test)
        # Convert train data to indices
        train_src = lexicon.sents_to_ids(train_src)
        train_trg = lexicon.sents_to_ids(train_trg, trg=True)
        # Save test data
        for s in test_trg:
            gold.append(' '.join(s))
        # Reset model
        s2s.load()
        s2s.reset_usr_vec()
        # Translate with baseline model
        base_translations.extend(evaluate_model(s2s, test_src, opt.beam_size))
        # Start loop
        n_train = opt.max_n_train
        adapt_translations.extend(
            adapt_user(s2s, trainer, train_src[:n_train], train_trg[:n_train],
                       test_src, opt))

    # Temp files
    temp_gold = utils.exp_temp_filename(opt, 'gold.txt')
    temp_base = utils.exp_temp_filename(opt, '%s_base.txt' % opt.update_mode)
    temp_adapt = utils.exp_temp_filename(opt, '%s_adapt.txt' % opt.update_mode)
    utils.savetxt(temp_gold, gold)
    utils.savetxt(temp_base, base_translations)
    utils.savetxt(temp_adapt, adapt_translations)
    # Evaluate base translations
    bleu, details = evaluation.bleu_score(temp_gold, temp_base)
    log.info('Base BLEU score: %.2f' % bleu)
    # Evaluate adapted translations
    bleu, details = evaluation.bleu_score(temp_gold, temp_adapt)
    log.info('Adaptation BLEU score: %.2f' % bleu)
    # Compare both
    temp_bootstrap_gold = utils.exp_temp_filename(opt, 'bootstrap_gold.txt')
    temp_bootstrap_base = utils.exp_temp_filename(opt, 'bootstrap_base.txt')
    temp_bootstrap_adapt = utils.exp_temp_filename(opt, 'bootstrap_adapt.txt')
    bleus = evaluation.paired_bootstrap_resampling(
        temp_gold, temp_base, temp_adapt, opt.bootstrap_num_samples,
        opt.bootstrap_sample_size, temp_bootstrap_gold, temp_bootstrap_base,
        temp_bootstrap_adapt)
    evaluation.print_paired_stats(bleus)
    os.remove(temp_bootstrap_gold)
    os.remove(temp_bootstrap_base)
    os.remove(temp_bootstrap_adapt)
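evaluation.paired_bootstrap_resampling compares the baseline and adapted systems on many resampled test subsets. A hedged sketch of the underlying procedure (paired bootstrap resampling, Koehn 2004); score_fn stands in for any corpus-level metric such as BLEU, and the temp-file bookkeeping of the real helper is omitted:

import random

def paired_bootstrap(gold, base, adapt, score_fn, n_samples=1000):
    # Draw sentence indices with replacement and score both systems on
    # the same subset; the win rates estimate how often one system
    # genuinely outperforms the other.
    n = len(gold)
    wins_base = wins_adapt = 0
    for _ in range(n_samples):
        idx = [random.randrange(n) for _ in range(n)]
        refs = [gold[i] for i in idx]
        if score_fn([adapt[i] for i in idx], refs) > \
                score_fn([base[i] for i in idx], refs):
            wins_adapt += 1
        else:
            wins_base += 1
    return wins_base / float(n_samples), wins_adapt / float(n_samples)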
Example #5
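A Python 3 variant of the training loop from Example #3, followed by an adaptation-evaluation routine that sweeps over the number of user-specific training sentences and bootstrap-resamples the resulting BLEU scores.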
def train(opt):
    log = helpers.Logger(opt.verbose)
    timer = helpers.Timer()
    # Load data =========================================================
    log.info('Reading corpora')
    # Read vocabs
    widss, ids2ws, widst, ids2wt = helpers.get_dictionaries(opt)

    # Read training
    trainings_data = data.read_corpus(opt.train_src, widss)
    trainingt_data = data.read_corpus(opt.train_dst, widst)
    # Read validation
    valids_data = data.read_corpus(opt.valid_src, widss)
    validt_data = data.read_corpus(opt.valid_dst, widst)
    # Validation output
    if not opt.valid_out:
        opt.valid_out = helpers.exp_filename(opt, 'valid.out')
    # Get target language model
    lang_model = helpers.get_language_model(opt, trainingt_data, widst)
    # Create model ======================================================
    log.info('Creating model')
    s2s = helpers.build_model(opt, widss, widst, lang_model)

    # Trainer ==========================================================
    trainer = helpers.get_trainer(opt, s2s)
    log.info('Using ' + opt.trainer + ' optimizer')
    # Print configuration ===============================================
    if opt.verbose:
        options.print_config(opt,
                             src_dict_size=len(widss),
                             trg_dict_size=len(widst))
    # Create batch loaders ==============================================
    log.info('Creating batch loaders')
    trainbatchloader = data.BatchLoader(trainings_data, trainingt_data,
                                        opt.batch_size)
    devbatchloader = data.BatchLoader(valids_data, validt_data,
                                      opt.dev_batch_size)
    # Start training ====================================================
    log.info('Starting training')
    timer.restart()
    train_loss = 0
    processed = 0
    best_bleu = -1
    deadline = 0
    i = 0
    for epoch in range(opt.num_epochs):
        for x, y in trainbatchloader:
            s2s.set_train_mode()
            processed += sum(map(len, y))
            bsize = len(y)
            # Compute loss
            loss = s2s.calculate_loss(x, y)
            # Backward pass and parameter update
            loss.backward()
            trainer.update()
            train_loss += loss.scalar_value() * bsize
            if (i + 1) % opt.check_train_error_every == 0:
                # Check average training error from time to time
                logloss = train_loss / processed
                ppl = np.exp(logloss)
                trainer.status()
                log.info(
                    " Training_loss=%f, ppl=%f, time=%f s, tokens processed=%d"
                    % (logloss, ppl, timer.tick(), processed))
                train_loss = 0
                processed = 0
            if (i + 1) % opt.check_valid_error_every == 0:
                # Check generalization error on the validation set from time to time
                s2s.set_test_mode()
                dev_loss = 0
                dev_processed = 0
                timer.restart()
                for x, y in devbatchloader:
                    dev_processed += sum(map(len, y))
                    bsize = len(y)
                    loss = s2s.calculate_loss(x, y, test=True)
                    dev_loss += loss.scalar_value() * bsize
                dev_logloss = dev_loss / dev_processed
                dev_ppl = np.exp(dev_logloss)
                log.info(
                    "[epoch %d] Dev loss=%f, ppl=%f, time=%f s, tokens processed=%d"
                    %
                    (epoch, dev_logloss, dev_ppl, timer.tick(), dev_processed))

            if (i + 1) % opt.valid_bleu_every == 0:
                # Check BLEU score on the validation set from time to time
                s2s.set_test_mode()
                log.info('Start translating validation set, buckle up!')
                timer.restart()
                with open(opt.valid_out, 'w+') as f:
                    for x in valids_data:
                        y_hat = s2s.translate(x, beam_size=opt.beam_size)
                        translation = [ids2wt[w] for w in y_hat[1:-1]]
                        print(' '.join(translation), file=f)
                bleu, details = evaluation.bleu_score(opt.valid_dst,
                                                      opt.valid_out)
                log.info('Finished translating validation set, %.2f s elapsed.'
                         % timer.tick())
                log.info(details)
                # Early stopping : save the latest best model
                if bleu > best_bleu:
                    best_bleu = bleu
                    log.info('Best BLEU score up to date, saving model to %s' %
                             s2s.model_file)
                    s2s.save()
                    deadline = 0
                else:
                    deadline += 1
                if opt.patience > 0 and deadline > opt.patience:
                    log.info('No improvement for %d validation rounds, early '
                             'stopping with best validation BLEU score: %.3f' %
                             (deadline, best_bleu))
                    exit()
            i = i + 1
        trainer.update_epoch()


def eval_user_adaptation(opt):
    log = utils.Logger(opt.verbose)
    timer = utils.Timer()
    # Read vocabs
    lexicon = helpers.get_lexicon(opt)
    # Read data
    filepairs = load_user_filepairs(opt.usr_file_list)
    # Get target language model
    lang_model = None
    # Load model
    s2s = helpers.build_model(opt, lexicon, lang_model, test=True)
    #if not opt.full_training:
    #    s2s.freeze_parameters()
    # Trainer
    trainer = helpers.get_trainer(opt, s2s)
    # print config
    if opt.verbose:
        options.print_config(opt,
                             src_dict_size=len(lexicon.w2ids),
                             trg_dict_size=len(lexicon.w2idt))
    # This will store translations and gold sentences
    translations = {i: [] for i in range(opt.min_n_train, opt.max_n_train)}
    gold = []
    # Run training
    for usr_id, (src_file, trg_file) in enumerate(filepairs):
        log.info('Evaluating on file pair %s' %
                 os.path.basename(src_file).split()[0])
        # Load file pair
        src_data = data.read_corpus(src_file, lexicon.w2ids, raw=True)
        trg_data = data.read_corpus(trg_file, lexicon.w2idt, raw=True)
        # split train/test
        train_src, test_src, train_trg, test_trg, order = split_user_data(
            src_data, trg_data, n_test=opt.n_test)
        # Convert train data to indices
        train_src = lexicon.sents_to_ids(train_src)
        train_trg = lexicon.sents_to_ids(train_trg, trg=True)
        # Save test data
        for s in test_trg:
            gold.append(' '.join(s))
        # Start loop
        for n_train in range(opt.min_n_train, opt.max_n_train):
            log.info('Training on %d sentence pairs' % n_train)
            # Train on n_train first sentences
            X, Y = train_src[:n_train], train_trg[:n_train]
            temp_out = utils.exp_temp_filename(opt, str(n_train) + 'out.txt')
            if opt.full_training:
                s2s.load()
            if opt.log_unigram_bias:
                if opt.use_trg_unigrams:
                    unigrams = lexicon.compute_unigrams(Y, lang='trg')
                else:
                    unigrams = lexicon.estimate_unigrams(X)
                log_unigrams = np.log(unigrams + opt.log_unigrams_eps)
                s2s.reset_usr_vec(log_unigrams)
            elif n_train > 0:
                adapt(s2s, trainer, X, Y, opt.num_epochs,
                      opt.check_train_error_every)
            log.info('Translating test file')
            s2s.set_test_mode()
            # Test on test split
            for x in test_src:
                y_hat = s2s.translate(x, 0, beam_size=opt.beam_size)
                translations[n_train].append(y_hat)

    # Temp files
    temp_gold = utils.exp_temp_filename(opt, 'gold.txt')
    np.savetxt(temp_gold, gold, fmt='%s')
    # Results
    test_bleus = np.zeros(opt.max_n_train - opt.min_n_train)
    for n_train in range(opt.min_n_train, opt.max_n_train):
        log.info('Evaluation for %d sentence pairs' % n_train)
        temp_out = utils.exp_temp_filename(opt, str(n_train) + 'out.txt')
        temp_bootstrap_out = utils.exp_temp_filename(
            opt,
            str(n_train) + '_bootstrap_out.txt')
        temp_bootstrap_ref = utils.exp_temp_filename(
            opt,
            str(n_train) + '_bootstrap_ref.txt')
        np.savetxt(temp_out, translations[n_train], fmt='%s')
        bleu, details = evaluation.bleu_score(temp_gold, temp_out)
        log.info('BLEU score: %.2f' % bleu)
        bleus = evaluation.bootstrap_resampling(temp_gold, temp_out,
                                                opt.bootstrap_num_samples,
                                                opt.bootstrap_sample_size,
                                                temp_bootstrap_ref,
                                                temp_bootstrap_out)
        evaluation.print_stats(bleus)
        test_bleus[n_train - opt.min_n_train] = bleu
    np.savetxt(utils.exp_filename(opt, 'bleu_scores.txt'),
               test_bleus,
               fmt='%.3f')