def __init__(self, model=model.Transformer(), lenpen=0):
    """Set up the search helper around a trained model.

    Args:
        model: either a single ``torch.nn.Module`` or a list of
            ``torch.nn.Module`` checkpoints whose weights are averaged
            into one model via ``self.average_models``.
        lenpen: length-penalty coefficient applied during decoding.

    Raises:
        ValueError: if ``model`` is neither a module nor a list of modules.
    """
    # Resolve the argument into a single module before storing it.
    if isinstance(model, torch.nn.Module):
        resolved = model
    elif isinstance(model, list) and all(isinstance(m, torch.nn.Module) for m in model):
        resolved = self.average_models(model)
    else:
        raise ValueError(
            'Search object requires nn.Module or list of nn.Module.')
    self.model = resolved
    self.lenpen = lenpen
def model_fn(features, mode):
    """tf.estimator model function building a Transformer.

    In PREDICT mode returns an EstimatorSpec carrying predictions under
    the "y_hat" key; otherwise returns a spec with the training loss and
    train op produced by the model.
    """
    net = model.Transformer()
    inputs = [features["in_ids"], features["length"]]

    # Prediction path: no labels are consumed.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode, predictions={"y_hat": net.predict(inputs)})

    # Training path: build loss and its update op from inputs + targets.
    targets = [features["out_ids"], features["y"]]
    loss, train_op = net.get_loss_train_op(inputs, targets)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def __init__(self, src_lang: str = 'en', trg_lang: str = 'de', vocab_size: int = 37000, d_model: int = 512, nhead: int = 8, num_encoder_layers: int = 6, num_decoder_layers: int = 8, dim_feedforward: int = 2048, dropout: float = 0.1, activation: str = 'relu', warmup: int = 4000, bpe_file: str = '../data/wmt14.en-de/share.bpe.37000', lenpen: float = 0.6, beam_size: int = 4, ckpt_steps: int = 1500):
    """Store the hyper-parameters and build the Transformer, metrics and search.

    Defaults follow the WMT'14 en-de base setup (shared 37k BPE vocab,
    d_model 512, 8 heads). NOTE(review): num_decoder_layers defaults to 8
    while num_encoder_layers defaults to 6 — confirm this asymmetry is
    intentional.
    """
    super().__init__()

    # Language pair and tokenization.
    self.src_lang = src_lang
    self.trg_lang = trg_lang
    self.bpe_file = bpe_file
    self.vocab_size = vocab_size

    # Architecture hyper-parameters.
    self.d_model = d_model
    self.nhead = nhead
    self.num_encoder_layers = num_encoder_layers
    self.num_decoder_layers = num_decoder_layers
    self.dim_feedforward = dim_feedforward
    self.dropout = dropout
    self.activation = activation

    # Optimization / decoding / checkpointing knobs.
    self.warmup = warmup
    self.lenpen = lenpen
    self.beam_size = beam_size
    self.ckpt_steps = ckpt_steps

    # The underlying sequence-to-sequence network.
    self.model = model.Transformer(
        vocab_size=vocab_size,
        d_model=d_model,
        nhead=nhead,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=dim_feedforward,
        dropout=dropout,
        activation=activation,
    )

    # Separate accuracy trackers for train and validation phases.
    self.train_acc = pl.metrics.Accuracy()
    self.val_acc = pl.metrics.Accuracy()

    # Beam-search decoder wrapping the model.
    self.search = search.Search(self.model, lenpen=self.lenpen)
def inference(self):
    """Build the forward-propagation graph (TF1 style).

    Gathers embeddings for ``self.feature_inds`` from a learned embedding
    table and runs them through a Transformer; stores logits in
    ``self.y_out`` and sigmoid probabilities in ``self.y_out_prob``.

    :return: labels for each sample
    """
    # Embedding table of shape [p, k]; presumably p = number of features
    # and k = embedding dimension — TODO confirm against class definition.
    v = tf.Variable(tf.truncated_normal(shape=[self.p, self.k], mean=0, stddev=0.01), dtype='float32')
    with tf.variable_scope('DNN', reuse=False):
        # Embedding lookup for the sparse feature indices.
        y_embedding_input = tf.gather(v, self.feature_inds)
        # NOTE(review): despite older comments mentioning a "three-hidden-layer
        # DNN" / "FM output", the actual model here is a Transformer built
        # from command-line hyper-parameters.
        Model = models.Transformer(hparams.Hparams().parser.parse_args())
        result = Model.result(y_embedding_input)
        # Raw output (logits) and sigmoid-activated probabilities.
        self.y_out = result
        self.y_out_prob = tf.sigmoid(self.y_out)
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

from splitcross import SplitCrossEntropyLoss
criterion = None

ntokens = len(corpus.dictionary)
print('Total number of tokens:', ntokens)
# Alternative architectures kept for quick experimentation:
#model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
#model = model.BoomRNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
model = model.Transformer(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
#model = model.AttnRNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
#model = model.RecAttn(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
#model = model.LNRNN(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
#model = model.LNRR(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
###
if args.resume:
    print('Resuming model ...')
    model_load(args.resume)
    #optimizer.param_groups[0]['lr'] = args.lr
    # Re-apply the current command-line dropout settings to the restored model.
    # BUG FIX: the fourth target used to be `args.dropoute`, which assigned
    # args.dropoute to itself and left model.dropoute at its checkpointed
    # value; it must update the model, matching the other three dropouts.
    model.dropouti, model.dropouth, model.dropout, model.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute
    #if args.wdrop:
    #    from weight_drop import WeightDrop
    #    for rnn in model.rnns:
    #        if type(rnn) == WeightDrop: rnn.dropout = args.wdrop
# Drop into the debugger immediately when --debug is passed.
if cl_args.debug:
    pdb.set_trace()

# Guard against a non-positive epoch count before any expensive setup.
if cl_args.epochs < 1:
    print('Invalid epochs: ', cl_args.epochs)
    exit()

# Get the datasets
dataset = data.Data(cl_args.lang_code, cl_args.reverse)
train_dataset, test_dataset, val_dataset = dataset.get_dataset(cl_args.short_test)

# Transformer network, sized entirely from the shared `param` module.
transformer = model.Transformer(param.NUM_LAYERS, param.D_MODEL, param.NUM_HEADS, param.DFF,
                                input_vocab_size=dataset.inp_vocab_size,
                                target_vocab_size=dataset.tar_vocab_size,
                                pe_input=param.PAD_SIZE,
                                pe_target=param.PAD_SIZE,
                                rate=param.DROPOUT)

train_loss = tf.metrics.Mean(name='train_loss')
optimizer = utils.optimizer

# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.
# NOTE(review): this list continues past this excerpt (closing bracket not
# visible here).
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.float32),
    tf.TensorSpec(shape=(None, None), dtype=tf.float32),
"WARNING: Using the moviecorpus requires n_embd_d = 2, but is set to {}. It was automatically set to " "2! Please check your configuration!".format(params.clf_pipes)) params.n_embd_d = 2 if params.n_acc_batch > 1 and params.n_batch_train % params.n_acc_batch != 0: raise ValueError( "Gradient accumulation active, due to n_acc_batch = {}. n_batch_train is {} which is not " "divisible through n_acc_batch without rest, but must be!") elif params.n_acc_batch > 1: params.n_batch_train = int(params.n_batch_train / params.n_acc_batch) params.gradient_accumulation = True else: params.gradient_accumulation = False # --- generate model as tensorflow graph (train) ------------------------------------------------------------------ print("Generating model ...") transformer_decoder = model.Transformer(params=params, use_encoder=params.use_encoder) if params.use_encoder is False: # original decoder model X_train = tf.placeholder( tf.int32, [None, params.clf_pipes, params.n_ctx, params.n_embd_d + 1]) else: # with encoder-decoder model X_train = tf.placeholder( tf.int32, [None, 2, params.clf_pipes, params.n_ctx, params.n_embd_d + 1]) M_train = tf.placeholder(tf.float32, [None, params.clf_pipes, params.n_ctx]) Y_train = tf.placeholder(tf.int32, [None]) """ This just defines and adds the node, not perform actual training. Training is performed in the train loop below - returns the result from all four (two) gpus after training and loss calculated (gradient descent also performed)
    # (tail of a transformer-layer spec dict opened above this excerpt)
    conv_width=7,
    max_relative_distance=24,
)

# Top-level architecture sizes for the chatbot Transformer.
architecture_specs = dict(
    num_subwords=8000,
    num_speakers=2,
    d_model=512,
    num_decoder_layers=7,
    num_encoder_layers=4,
    num_highway_layers=2,
    highway_dropout=0.1,
    embedding_dropout=0.1,
)

# Start from a clean Keras graph before building the model.
tf.keras.backend.clear_session()
chatbot_model = model.Transformer(
    **architecture_specs,
    transformer_layer_kwargs=transformer_specs)

#%%
# Run one test batch through the model so its variables are built
# before loading pretrained embedding weights.
test_data_sample = next(iter(test_pipeline))
chatbot_model(test_data_sample[0])

# Initialize the embedding layer from pretrained word2vec vectors.
word2v_embeddings = np.load('./model_components/w2v_embeddings.npy')
chatbot_model.embedding_layer.set_weights([word2v_embeddings])
chatbot_model.summary()

# Trainer wiring; the optimizer call continues past this excerpt.
chatbot = chatbot_estimator.ChatBotTrainer(subword_processor,
                                           chatbot_model,
                                           model.TransformerOptimizer(
                                               0.001,
def main():
    """Run inference with a checkpointed Transformer on STFT spectrograms.

    Loads source magnitude/phase and target magnitude/phase arrays, predicts
    a magnitude spectrogram per sample, plots attention maps, and saves
    predicted/real results as wav, figure and .npy files.
    """
    checkpoint_path = "./checkpoints{}/train".format(args.ckpt)
    transformer = model.Transformer(args.num_enc, args.num_dec, args.d_model,
                                    args.num_heads, args.dff,
                                    args.max_sequence_length,
                                    rate=args.dropout_rate)
    ckpt = tf.train.Checkpoint(transformer=transformer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
    # NOTE(review): the restore call is commented out, so the model runs with
    # freshly initialized weights unless restored elsewhere — confirm intended.
    #this_model = ckpt.restore(ckpt_manager.latest_checkpoint)

    # Source magnitude of the STFT.
    X_spec = np.load(args.enc_inp)  # (batch, d_model, seq_len)
    X_spec = np.transpose(X_spec, (0, 2, 1))  # (batch, seq_len, d_model)
    X_spec = X_spec[:, :, :-1]  # (batch, seq_len, d_model - 1)

    # Real magnitude of the STFT. This is from the target speaker; only used
    # for listening, to compare target and prediction.
    Y_spec = np.load(args.tar_inp)
    Y_spec = Y_spec[:, :-1, 1:]  # (batch, d_model, seq)

    # Test phase of the STFT.
    X_phase = np.load(args.enc_phase_inp)
    X_phase = X_phase[:, :-1, :]  # (batch, d_model, seq)

    # Real phase of the STFT from the target speaker; listening-only as well.
    Y_phase = np.load(args.tar_phase_inp)
    Y_phase = Y_phase[:, :-1, 1:]  # (batch, d_model, seq)

    name = 'ckpt={}'.format(args.ckpt)
    save_dir = './result/'
    for i in range(len(X_spec)):
        inp_spec = X_spec[i]  # max_seq_len, d_model
        inp_pha = X_phase[i]  # d_model, seq_len
        predict_spec, attention_weights = evaluate(inp_spec, transformer)
        print("after predict, spec shape {}".format(np.shape(predict_spec)))

        # For the attention alignment map: find the last nonzero frame so the
        # plot can be truncated to actual content.
        spec_t = inp_spec.T  # d_model, seq_len
        idx_spec = np.argwhere(np.diff(np.r_[False, spec_t[0], False]))
        find_zero_spec = np.squeeze(idx_spec)
        zero_cnt = find_zero_spec[-1]
        for x in range(6):
            plot = 'decoder_layer{}_block2'.format(x + 1)
            plot_attention_weights(attention_weights, plot, i + 1, zero_cnt)  # spec plot

        # Drop the start token row and go back to (d_model, seq_len).
        predict_spec = predict_spec[1:, :]  # (seq_len, d_model)
        predict_spec = np.transpose(predict_spec, (1, 0))  # (d_model, seq_len)

        # y_hat wav, fig save.
        # NOTE(review): this multiplies predicted magnitude by the source
        # phase element-wise to rebuild a complex-like spectrogram — confirm
        # `recover` expects magnitude*phase in this form.
        concat = predict_spec * inp_pha
        save_name = name + '_{}th'.format(i)
        for_save = os.path.join(save_dir, name)
        if not os.path.exists(for_save):
            os.makedirs(for_save)
        recover(concat, for_save, save_name)

        np_save_dir = 'np_file'
        np_dir = os.path.join(for_save, np_save_dir)
        if not os.path.exists(np_dir):
            os.makedirs(np_dir)

        # y_hat np file save.
        save_np = '{}th_predict.result'.format(i)
        np_final_predict = os.path.join(np_dir, save_np)
        np.save(np_final_predict, concat)

        ########### check #######, x_real plot
        # x_real np file save.
        x_real = inp_spec.T * X_phase[i]
        save_np_x_real = '{}th_x_real.result'.format(i)
        np_final_x_real = os.path.join(np_dir, save_np_x_real)
        np.save(np_final_x_real, x_real)

        # x_real wav, fig file save.
        save_name_real = 'x_real_' + name + '_{}th'.format(i)
        for_save_real = os.path.join(save_dir, name)
        if not os.path.exists(for_save_real):
            os.makedirs(for_save_real)
        # np.save(for_save_real, real)
        recover(x_real, for_save_real, save_name_real)

        # y_real np file save.
        y_real = Y_spec[i] * Y_phase[i]
        save_np_y_real = '{}th_y_real.result'.format(i)
        np_final_y_real = os.path.join(np_dir, save_np_y_real)
        np.save(np_final_y_real, y_real)

        # y_real wav, fig file save.
        save_name_real = 'y_real_' + name + '_{}th'.format(i)
        for_save_real = os.path.join(save_dir, name)
        if not os.path.exists(for_save_real):
            os.makedirs(for_save_real)
        recover(y_real, for_save_real, save_name_real)
# (continuation of a model-selection if/elif chain opened above this excerpt)
    decoder = model.Decoder(emb_size=args.emb_size,
                            hid_size=args.hid_size,
                            vocab_size=len(i2w),
                            num_layers=args.num_layers,
                            use_attn=args.use_attn)
    # NOTE: rebinds the name `model` from module to instance; the module is
    # not referenced again after this point.
    model = model.Seq2Seq(encoder=encoder,
                          decoder=decoder,
                          i2w=i2w,
                          use_knowledge=args.use_knowledge,
                          args=args,
                          test=True).cuda()
elif args.transformer:
    model = model.Transformer(i2w=i2w,
                              use_knowledge=args.use_knowledge,
                              args=args,
                              test=True).cuda()

# TEST EVALUATION
best_epoch = args.epoch
model.load("{0}/model_{1}.bin".format(args.save_path, best_epoch))
# NOTE(review): only the inner `.transformer` submodule is switched to eval
# mode; other submodules (e.g. the Seq2Seq branch) may stay in train mode —
# confirm `model.eval()` isn't what was intended.
model.transformer.eval()

# Iterate over batches
num_batches = math.ceil(len(valid_freq) / args.batch_size)
cum_loss = 0
cum_words = 0
predicted_sentences = []
indices = list(range(len(valid_freq)))
for batch in tqdm(range(num_batches)):
    # Prepare batch
def main():
    """Train the spectrogram-to-spectrogram Transformer.

    Loads encoder/decoder/target STFT magnitude arrays, builds a tf.data
    pipeline and a Transformer, then runs the training loop with periodic
    checkpointing, attention plots and wav/figure/npy dumps.
    """
    # load dataset here
    spec_enc_inp = np.load(args.enc_inp)
    spec_enc_inp = spec_enc_inp.astype('float32')
    spec_dec_inp = np.load(args.dec_inp)
    spec_dec_inp = spec_dec_inp.astype('float32')
    spec_tar_inp = np.load(args.tar_inp)
    spec_tar_inp = spec_tar_inp.astype('float32')

    # Drop the last frequency bin from each array.
    spec_enc_inp = spec_enc_inp[:, :-1, :]  # batch, d_model, seq_len
    spec_dec_inp = spec_dec_inp[:, :-1, :]  # batch, d_model, seq_len
    spec_tar_inp = spec_tar_inp[:, :-1, :]  # batch, d_model, seq_len

    # Transpose to time-major layout expected by the Transformer.
    enc_inp_spec = np.transpose(spec_enc_inp, (0, 2, 1))  # batch, seq_len, d_model
    dec_inp_spec = np.transpose(spec_dec_inp, (0, 2, 1))  # batch, seq_len, d_model
    tar_inp_spec = np.transpose(spec_tar_inp, (0, 2, 1))  # batch, seq_len, d_model
    print("enc_inp_spec shape {} dec_inp_spec shape {} tar_inp_spec shape {}".format(np.shape(enc_inp_spec), np.shape(dec_inp_spec), np.shape(tar_inp_spec)))

    ckpt_path = args.ckpt
    batch_size = args.batch_size
    buffer_size = 80  # shuffle buffer for the tf.data pipeline
    EPOCHS = args.epochs
    train_dataset = input_fn(enc_inp_spec, dec_inp_spec, tar_inp_spec, batch_size, buffer_size)
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    transformer = model.Transformer(args.num_enc, args.num_dec, args.d_model,
                                    args.num_heads, args.dff,
                                    args.max_sequence_length,
                                    rate=args.dropout_rate)

    # lr == 0 selects the warmup schedule from the paper; any other value
    # uses exponential decay from that fixed initial rate.
    if args.lr == 0:
        lr_schedule = model.CustomSchedule(args.d_model)
        print("lr is {}. Using schedule sampling learning rate".format(args.lr))
    else:
        initial_learning_rate = args.lr
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate,
            decay_steps=4000,
            decay_rate=0.96,
            staircase=True)
        print("lr is not schedule! We use {}".format(args.lr))

    optimizer = tf.keras.optimizers.Adam(lr_schedule, beta_1=0.9, beta_2=0.98,
                                         epsilon=1e-9)
    checkpoint_path = "./checkpoints{}/train".format(args.ckpt)
    ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=None)

    # writer = tf.summary.create_file_writer("/tmp/mylogs/eager")
    logdir = "logs/scalars{}/".format(args.ckpt) + datetime.now().strftime("%Y%m%d-%H%M%S")
    file_writer = tf.summary.create_file_writer(logdir + "/metrics")
    file_writer.set_as_default()

    # Resume from the latest checkpoint when one exists.
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest checkpoint restored!!')

    # Fixed rank-3 float signature avoids @tf.function re-tracing on
    # variable batch/sequence sizes.
    train_step_signature = [
        tf.TensorSpec(shape=(None, None, None), dtype=tf.float32),
        tf.TensorSpec(shape=(None, None, None), dtype=tf.float32),
        tf.TensorSpec(shape=(None, None, None), dtype=tf.float32),
    ]

    @tf.function(input_signature=train_step_signature)
    def train_step(inp_spec, dec_spec, tar_spec):
        # One optimization step: forward, loss, backward, apply gradients.
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp_spec, dec_spec)
        with tf.GradientTape() as tape:
            predict_spec, attention_weight = transformer(inp_spec, dec_spec, True,
                                                         enc_padding_mask,
                                                         combined_mask,
                                                         dec_padding_mask)
            loss = loss_function(tar_spec, predict_spec)
        gradient = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradient, transformer.trainable_variables))
        train_loss(loss)
        return predict_spec, attention_weight

    train_begin = 0
    train_elapsed = 0
    train_start = train_begin = time.time()
    for epoch in range(EPOCHS):
        start = epoch_begin = time.time()
        train_loss.reset_states()
        # inp -> man, tar -> woman
        for (batch, (inp_spec, dec_spec, tar_spec)) in enumerate(train_dataset):
            # Keep the first input of the batch for before/after plots below.
            name_before = 'before_predict_epoch={}'.format(int(epoch))
            result_before = inp_spec[0]
            result_before = np.transpose(result_before, (1, 0))
            result, attention_weight = train_step(inp_spec, dec_spec, tar_spec)
            if batch % 20 == 0:
                current = time.time()
                elapsed = current - start
                #epoch_elapsed = (current - epoch_begin) / 60.0
                #train_elapsed = (current - train_begin) / 3600.0
                train_elapsed = (current - train_start) / 3600.0
                #print('Epoch {} Batch {} Loss {:.4f}, train elapsed: {:.2f}h '.format(epoch + 1, batch, train_loss.result(), train_elapsed))

        # Checkpoint every 20 epochs.
        if (epoch + 1) % 20 == 0:
            ckpt_save_path = ckpt_manager.save()
            print('Saving checkpoint for epoch {} at {}'.format(epoch + 1, ckpt_save_path))
        print('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))
        tf.summary.scalar('loss', data=train_loss.result(), step=epoch)
        print('Time taken for 1 epoch: {} secs, elapsed: {:.2f}h \n'.format(time.time() - start, train_elapsed))

        # Attention plots every 20 epochs, using the last batch of the epoch.
        if epoch % 20 == 0:
            spec_t = inp_spec[0]
            spec_t = spec_t.numpy()
            spec_t = spec_t.T
            spec_tar = dec_spec[0]
            spec_tar = spec_tar.numpy()
            spec_tar_t = spec_tar.T
            # Locate the last nonzero frame to crop padding from the plots.
            idx_spec = np.argwhere(np.diff(np.r_[False, spec_t[0], False]))
            idx_tar = np.argwhere(np.diff(np.r_[False, spec_tar_t[0], False]))
            find_zero_spec = np.squeeze(idx_spec)
            find_zero_spec_tar = np.squeeze(idx_tar)
            zero_cnt = find_zero_spec[-1]
            zero_cnt_tar = find_zero_spec_tar[-1]
            for x in range(6):
                plot = 'decoder_layer{}_block2'.format(x + 1)
                plot_attention_weights(attention_weight, plot, epoch, zero_cnt, zero_cnt_tar)  # spec plot attention weights

        if epoch % 5 == 0:
            # Get results from trainset every 5 epochs.
            epc = int(epoch)
            name_after = 'after_predict_epoch={}'.format(epc)
            result_after = result[0]
            result_after = np.transpose(result_after, (1, 0))

            # The dataset before training (original input).
            plt.figure(figsize=(10, 4))
            librosa.display.specshow(librosa.amplitude_to_db(result_before, ref=np.max),
                                     y_axis='hz', x_axis='time', sr=16000,
                                     hop_length=args.hop)
            plt.title(name_before)
            plt.colorbar(format='%+2.0f dB')
            plt.tight_layout()
            fig_save_dir = './result/' + ckpt_path + '_fig/'
            if not os.path.exists(fig_save_dir):
                os.makedirs(fig_save_dir)
            plt.savefig(fig_save_dir + name_before + '.png')
            plt.cla()
            plt.close()
            make_wav = librosa.istft(result_before, hop_length=args.hop)
            wav_save_dir = './result/' + ckpt_path + '_wav/'
            if not os.path.exists(wav_save_dir):
                os.makedirs(wav_save_dir)
            sf.write(wav_save_dir + name_before + '.wav', make_wav, 16000,
                     format='WAV', endian='LITTLE', subtype='PCM_16')

            # Results after training from trainset (y_hat).
            plt.figure(figsize=(10, 4))
            librosa.display.specshow(librosa.amplitude_to_db(result_after, ref=np.max),
                                     y_axis='hz', x_axis='time', sr=16000,
                                     hop_length=args.hop)
            plt.title(name_after)
            plt.colorbar(format='%+2.0f dB')
            plt.tight_layout()
            plt.savefig(fig_save_dir + name_after + '.png')
            plt.cla()
            plt.close()
            make_wav = librosa.istft(result_after, hop_length=args.hop)
            sf.write(wav_save_dir + name_after + '.wav', make_wav, 16000,
                     format='WAV', endian='LITTLE', subtype='PCM_16')

            # Real input (source).
            save_tar = tar_spec[0]
            save_tar = np.transpose(save_tar, (1, 0))
            plt.figure(figsize=(10, 4))
            librosa.display.specshow(librosa.amplitude_to_db(save_tar, ref=np.max),
                                     y_axis='hz', x_axis='time', sr=16000,
                                     hop_length=args.hop)
            real_name = 'real_epoch={}'.format(int(epoch))
            plt.title(real_name)
            plt.colorbar(format='%+2.0f dB')
            plt.tight_layout()
            fig_save_dir = './result/' + ckpt_path + '_fig/'
            if not os.path.exists(fig_save_dir):
                os.makedirs(fig_save_dir)
            plt.savefig(fig_save_dir + real_name + '.png')
            plt.cla()
            plt.close()
            make_wav = librosa.istft(save_tar, hop_length=args.hop)
            wav_save_dir = './result/' + ckpt_path + '_wav/'
            if not os.path.exists(wav_save_dir):
                os.makedirs(wav_save_dir)
            sf.write(wav_save_dir + real_name + '.wav', make_wav, 16000,
                     format='WAV', endian='LITTLE', subtype='PCM_16')

            # Numpy file before training.
            np_save_dir = './result/' + ckpt_path + '_np_file/'
            if not os.path.exists(np_save_dir):
                os.makedirs(np_save_dir)
            np.save(np_save_dir + name_before, result_before)

            # Numpy file after training trainset.
            np_save_dir = './result/' + ckpt_path + '_np_file/'
            if not os.path.exists(np_save_dir):
                os.makedirs(np_save_dir)
            np.save(np_save_dir + name_after, result_after)

            # Real Numpy file (source).
            np_save_dir = './result/' + ckpt_path + '_np_file/'
            if not os.path.exists(np_save_dir):
                os.makedirs(np_save_dir)
            real_name = 'y_real_epoch={}'.format(epc)
            np.save(np_save_dir + real_name, save_tar)
# Rebinds `config` from module to instance; the module is not used again.
config = config.Config()

# Optional positional overrides for the dataset paths.
# NOTE(review): the guards look off by one — `sys.argv[1]` exists whenever
# len(sys.argv) > 1, but is only read when > 2 (similarly for argv[2]);
# confirm whether a third/fourth argument is genuinely required.
train_file = '../data/ag_news.train'
if len(sys.argv) > 2:
    train_file = sys.argv[1]
test_file = '../data/ag_news.test'
if len(sys.argv) > 3:
    test_file = sys.argv[2]

dataset = utils.Dataset(config)
dataset.load_data(train_file, test_file)

# Create Model with specified optimizer and loss function
##############################################################
model = model.Transformer(config, len(dataset.vocab))
if torch.cuda.is_available():
    model.cuda()
model.train()
optimizer = optim.Adam(model.parameters(), lr=config.lr)
NLLLoss = nn.NLLLoss()
model.add_optimizer(optimizer)
model.add_loss_op(NLLLoss)
##############################################################

train_losses = []
val_accuracies = []

# Training loop; the run_epoch call continues past this excerpt.
for i in range(config.max_epochs):
    print("Epoch: {}".format(i))
    train_loss, val_accuracy = model.run_epoch(dataset.train_iterator,