def fast_text_step(self):
    losses = []
    for id in [0, 1]:
        self.ft_optimizer[id].zero_grad()
        u_b, v_b = next(self.corpus_data_queue[id])
        s = self.fast_text[id](u_b, v_b)
        loss = FastText.loss_fn(s)
        loss.backward()
        self.ft_optimizer[id].step()
        losses.append(loss.item())
    return losses[0], losses[1]
def __init__(self, corpus_data_0, corpus_data_1, *, params, n_samples=10000000):
    self.fast_text = [FastText(corpus_data_0.model).to(GPU),
                      FastText(corpus_data_1.model).to(GPU)]
    self.discriminator = Discriminator(
        params.emb_dim, n_layers=params.d_n_layers, n_units=params.d_n_units,
        drop_prob=params.d_drop_prob, drop_prob_input=params.d_drop_prob_input,
        leaky=params.d_leaky, batch_norm=params.d_bn).to(GPU)
    # linear mapping between the two embedding spaces, initialised to the identity
    self.mapping = nn.Linear(params.emb_dim, params.emb_dim, bias=False)
    self.mapping.weight.data.copy_(torch.diag(torch.ones(params.emb_dim)))
    self.mapping = self.mapping.to(GPU)
    # one fastText optimizer/scheduler per language
    self.ft_optimizer, self.ft_scheduler = [], []
    for id in [0, 1]:
        optimizer, scheduler = optimizers.get_sgd_adapt(
            self.fast_text[id].parameters(), lr=params.ft_lr, mode="max",
            factor=params.ft_lr_decay, patience=params.ft_lr_patience)
        self.ft_optimizer.append(optimizer)
        self.ft_scheduler.append(scheduler)
    # adversarial optimizers only update the input/output embeddings u and v
    self.a_optimizer, self.a_scheduler = [], []
    for id in [0, 1]:
        optimizer, scheduler = optimizers.get_sgd_adapt(
            [{"params": self.fast_text[id].u.parameters()},
             {"params": self.fast_text[id].v.parameters()}],
            lr=params.a_lr, mode="max", factor=params.a_lr_decay, patience=params.a_lr_patience)
        self.a_optimizer.append(optimizer)
        self.a_scheduler.append(scheduler)
    if params.d_optimizer == "SGD":
        self.d_optimizer, self.d_scheduler = optimizers.get_sgd_adapt(
            self.discriminator.parameters(), lr=params.d_lr, mode="max", wd=params.d_wd)
    elif params.d_optimizer == "RMSProp":
        self.d_optimizer, self.d_scheduler = optimizers.get_rmsprop_linear(
            self.discriminator.parameters(), params.n_steps, lr=params.d_lr, wd=params.d_wd)
    else:
        raise Exception(f"Optimizer {params.d_optimizer} not found.")
    if params.m_optimizer == "SGD":
        self.m_optimizer, self.m_scheduler = optimizers.get_sgd_adapt(
            self.mapping.parameters(), lr=params.m_lr, mode="max", wd=params.m_wd,
            factor=params.m_lr_decay, patience=params.m_lr_patience)
    elif params.m_optimizer == "RMSProp":
        self.m_optimizer, self.m_scheduler = optimizers.get_rmsprop_linear(
            self.mapping.parameters(), params.n_steps, lr=params.m_lr, wd=params.m_wd)
    else:
        raise Exception(f"Optimizer {params.m_optimizer} not found.")
    self.m_beta = params.m_beta
    self.smooth = params.smooth
    self.wgan = params.wgan
    self.d_clip_mode = params.d_clip_mode
    if params.wgan:
        self.loss_fn = _wasserstein_distance
    else:
        # "elementwise_mean" was renamed to "mean" in PyTorch 1.0
        self.loss_fn = nn.BCEWithLogitsLoss(reduction="mean")
    self.corpus_data_queue = [
        _data_queue(corpus_data_0, n_threads=(params.n_threads + 1) // 2,
                    n_sentences=params.n_sentences, batch_size=params.ft_bs),
        _data_queue(corpus_data_1, n_threads=(params.n_threads + 1) // 2,
                    n_sentences=params.n_sentences, batch_size=params.ft_bs)
    ]
    self.sampler = [
        WordSampler(corpus_data_0.dic, n_urns=n_samples, alpha=params.a_sample_factor, top=params.a_sample_top),
        WordSampler(corpus_data_1.dic, n_urns=n_samples, alpha=params.a_sample_factor, top=params.a_sample_top)
    ]
    self.d_bs = params.d_bs
    self.dic_0, self.dic_1 = corpus_data_0.dic, corpus_data_1.dic
    self.d_gp = params.d_gp
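# Usage sketch (not from this codebase): the two methods above belong to an enclosing
# two-language trainer whose class name is not shown here. "BilingualTrainer", the
# CorpusData constructor calls, and the logging cadence below are assumptions made
# purely for illustration; only fast_text_step() itself appears above.
corpus_0 = CorpusData("data/corpus.en.txt")   # hypothetical paths/constructor
corpus_1 = CorpusData("data/corpus.fr.txt")
trainer = BilingualTrainer(corpus_0, corpus_1, params=params)
for step in range(params.n_steps):
    loss_0, loss_1 = trainer.fast_text_step()  # one skip-gram update per language
    if step % 1000 == 0:
        print(f"step {step}: ft loss_0={loss_0:.4f}, ft loss_1={loss_1:.4f}")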
def train(config):
    print('parameters: ')
    print(json.dumps(config, indent=4, ensure_ascii=False))

    # load data
    print('load data .....')
    X, y = data_helper.process_data(config)

    # make vocab
    print('make vocab .....')
    word_to_index, label_to_index = data_helper.generate_vocab(X, y, config)

    # padding data
    print('padding data .....')
    input_x, input_y = data_helper.padding(X, y, config, word_to_index, label_to_index)

    # split data
    print('split data .....')
    x_train, y_train, x_test, y_test, x_dev, y_dev = data_helper.split_data(input_x, input_y, config)
    print('length train: {}'.format(len(x_train)))
    print('length test: {}'.format(len(x_test)))
    print('length dev: {}'.format(len(x_dev)))

    print('training .....')
    with tf.Graph().as_default():
        sess_config = tf.ConfigProto(
            allow_soft_placement=config['allow_soft_placement'],
            log_device_placement=config['log_device_placement']
        )
        with tf.Session(config=sess_config) as sess:
            fast_text = FastText(config)

            # training procedure
            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.AdamOptimizer(config['learning_rate'])
            grads_and_vars = optimizer.compute_gradients(fast_text.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # keep track of gradient values and sparsity
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram('{}/grad/hist'.format(v.name), g)
                    sparsity_summary = tf.summary.scalar('{}/grad/sparsity'.format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # output dir for models and summaries
            timestamp = str(int(time.time()))
            outdir = os.path.abspath(os.path.join(os.path.curdir, 'runs', timestamp))
            print('writing to {}'.format(outdir))

            # summary for loss and accuracy
            loss_summary = tf.summary.scalar('loss', fast_text.loss)
            acc_summary = tf.summary.scalar('accuracy', fast_text.accuracy)

            # train summary
            train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(outdir, 'summaries', 'train')
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # dev summary
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(outdir, 'summaries', 'dev')
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # checkpoint directory
            checkpoint_dir = os.path.abspath(os.path.join(outdir, 'checkpoints'))
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model.bin')
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=config['num_checkpoints'])

            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                feed_dict = {
                    fast_text.input_x: x_batch,
                    fast_text.input_y: y_batch,
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, fast_text.loss, fast_text.accuracy],
                    feed_dict=feed_dict
                )
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                feed_dict = {
                    fast_text.input_x: x_batch,
                    fast_text.input_y: y_batch,
                    fast_text.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, fast_text.loss, fast_text.accuracy],
                    feed_dict=feed_dict
                )
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            # generate batches
            batches = data_helper.generate_batchs(x_train, y_train, config)
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % config['evaluate_every'] == 0:
                    print('Evaluation:')
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                if current_step % config['checkpoint_every'] == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print('save model checkpoint to {}'.format(path))

            # test accuracy
            test_accuracy = sess.run([fast_text.accuracy], feed_dict={
                fast_text.input_x: x_test,
                fast_text.input_y: y_test,
                fast_text.dropout_keep_prob: 1.0})
            print('Test dataset accuracy: {}'.format(test_accuracy))
# Augmenting x_train and x_test with n-gram features
x_train = add_ngram(x_train, token_indice, ngram_range)
x_test = add_ngram(x_test, token_indice, ngram_range)
print('Average train sequence length: {}'.format(
    np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(
    np.mean(list(map(len, x_test)), dtype=int)))

print('Pad sequences (samples x time)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = FastText(maxlen, max_features, embedding_dims)
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))

print('Test...')
result = model.predict(x_test)
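# add_ngram() is called above but not shown. The sketch below follows the standard
# Keras imdb_fasttext example (the implementation used here may differ): every bigram
# or trigram that was assigned an index in token_indice is appended to the sequence,
# so the averaging layer sees n-gram features in addition to unigrams.
def create_ngram_set(input_list, ngram_value=2):
    # set of n-grams (as tuples of token indices) contained in one sequence
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    # append the index of every known n-gram (2..ngram_range) to each sequence
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(input_list) - ngram_value + 1):
                ngram = tuple(input_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences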
class Trainer:
    def __init__(self, corpus_data, *, params):
        self.fast_text = FastText(corpus_data.model).to(GPU)
        self.discriminator = Discriminator(
            params.emb_dim, n_layers=params.d_n_layers, n_units=params.d_n_units,
            drop_prob=params.d_drop_prob, drop_prob_input=params.d_drop_prob_input,
            leaky=params.d_leaky, batch_norm=params.d_bn).to(GPU)
        self.ft_optimizer = optim.SGD(self.fast_text.parameters(), lr=params.ft_lr)
        self.d_optimizer = optim.SGD(self.discriminator.parameters(), lr=params.d_lr, weight_decay=params.d_wd)
        # adversarial optimizer only updates the input/output embeddings u and v
        self.a_optimizer = optim.SGD([{"params": self.fast_text.u.parameters()},
                                      {"params": self.fast_text.v.parameters()}], lr=params.a_lr)
        self.smooth = params.smooth
        # "elementwise_mean" was renamed to "mean" in PyTorch 1.0
        self.loss_fn = nn.BCEWithLogitsLoss(reduction="mean")
        self.corpus_data_queue = _data_queue(corpus_data, n_threads=params.n_threads,
                                             n_sentences=params.n_sentences, batch_size=params.ft_bs)
        self.vocab_size = params.vocab_size
        self.d_bs = params.d_bs
        self.split = params.split
        self.align_output = params.align_output

    def fast_text_step(self):
        self.ft_optimizer.zero_grad()
        u_b, v_b = next(self.corpus_data_queue)
        s = self.fast_text(u_b, v_b)
        loss = FastText.loss_fn(s)
        loss.backward()
        self.ft_optimizer.step()
        return loss.item()

    def get_adv_batch(self, *, reverse, fix_embedding):
        # sample word ids from the two halves of the vocabulary split
        vocab_split, bs_split = int(self.vocab_size * self.split), int(self.d_bs * self.split)
        x = (torch.randint(0, vocab_split, size=(bs_split,), dtype=torch.long).tolist() +
             torch.randint(vocab_split, self.vocab_size, size=(self.d_bs - bs_split,), dtype=torch.long).tolist())
        if self.align_output:
            x = torch.LongTensor(x).view(self.d_bs, 1).to(GPU)
            if fix_embedding:
                with torch.no_grad():
                    x = self.fast_text.v(x).view(self.d_bs, -1)
            else:
                x = self.fast_text.v(x).view(self.d_bs, -1)
        else:
            x = self.fast_text.model.get_bag(x, self.fast_text.u.weight.device)
            if fix_embedding:
                with torch.no_grad():
                    x = self.fast_text.u(x[0], x[1]).view(self.d_bs, -1)
            else:
                x = self.fast_text.u(x[0], x[1]).view(self.d_bs, -1)
        # smoothed labels; `reverse` flips them so the embeddings are trained to fool the discriminator
        y = torch.FloatTensor(self.d_bs).to(GPU).uniform_(0.0, self.smooth)
        if reverse:
            y[:bs_split] = 1 - y[:bs_split]
        else:
            y[bs_split:] = 1 - y[bs_split:]
        return x, y

    def discriminator_step(self):
        self.d_optimizer.zero_grad()
        self.discriminator.train()
        with torch.no_grad():
            x, y = self.get_adv_batch(reverse=False, fix_embedding=True)
        y_hat = self.discriminator(x)
        loss = self.loss_fn(y_hat, y)
        loss.backward()
        self.d_optimizer.step()
        return loss.item()

    def adversarial_step(self):
        self.a_optimizer.zero_grad()
        self.discriminator.eval()
        x, y = self.get_adv_batch(reverse=True, fix_embedding=False)
        y_hat = self.discriminator(x)
        loss = self.loss_fn(y_hat, y)
        loss.backward()
        self.a_optimizer.step()
        return loss.item()
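# Minimal training-loop sketch (assumptions, not part of the Trainer class above): the
# alternation schedule, params.n_steps / params.d_n_steps and the logging below are
# hypothetical; the class only defines the individual step methods.
trainer = Trainer(corpus_data, params=params)
for step in range(params.n_steps):
    ft_loss = trainer.fast_text_step()         # skip-gram update on the corpus
    for _ in range(params.d_n_steps):          # several discriminator updates per step (assumed)
        d_loss = trainer.discriminator_step()  # discriminator trained on frozen embeddings
    a_loss = trainer.adversarial_step()        # embeddings updated to fool the discriminator
    if step % 1000 == 0:
        print(f"step {step}: ft={ft_loss:.4f}, d={d_loss:.4f}, adv={a_loss:.4f}")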