示例#1
0
    def __init__(self, model=None, lenpen=0):
        """Wrap a trained model (or an ensemble of models) for search.

        Args:
            model: a single ``torch.nn.Module``, a list of modules to be
                averaged into one model, or ``None`` to build a fresh
                default ``Transformer``.
            lenpen: length-penalty value stored for later scoring.

        Raises:
            ValueError: if ``model`` is neither a module nor a list of
                modules.
        """
        if model is None:
            # BUG FIX: the original default was ``model=model.Transformer()``.
            # Default arguments are evaluated once at definition time, so a
            # single Transformer instance was silently shared by every
            # Search object (and constructed even when a model was passed).
            # A ``None`` sentinel builds a fresh instance per call instead.
            import model as _model_mod
            model = _model_mod.Transformer()
        if isinstance(model, torch.nn.Module):
            self.model = model
        elif isinstance(model, list) and all(
                isinstance(m, torch.nn.Module) for m in model):
            # An ensemble: collapse the list into one averaged model.
            self.model = self.average_models(model)
        else:
            raise ValueError(
                'Search object requires nn.Module or list of nn.Module.')

        self.lenpen = lenpen
示例#2
0
    def model_fn(features, mode):
        """Build a ``tf.estimator.EstimatorSpec`` for prediction or training.

        Args:
            features: dict of tensors with keys "in_ids" and "length" (plus
                "out_ids" and "y" when labels are available).
            mode: one of ``tf.estimator.ModeKeys``.

        Returns:
            An ``EstimatorSpec`` carrying predictions in PREDICT mode, or
            loss and a train op otherwise.
        """
        inputs = [features["in_ids"], features["length"]]
        net = model.Transformer()

        # Prediction short-circuits before any label tensors are touched.
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(
                mode=mode, predictions={"y_hat": net.predict(inputs)})

        # Training path: label tensors are required here.
        labels = [features["out_ids"], features["y"]]
        loss, train_op = net.get_loss_train_op(inputs, labels)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)
示例#3
0
    def __init__(self,
                 src_lang: str = 'en',
                 trg_lang: str = 'de',
                 vocab_size: int = 37000,
                 d_model: int = 512,
                 nhead: int = 8,
                 num_encoder_layers: int = 6,
                 num_decoder_layers: int = 8,
                 dim_feedforward: int = 2048,
                 dropout: float = 0.1,
                 activation: str = 'relu',
                 warmup: int = 4000,
                 bpe_file: str = '../data/wmt14.en-de/share.bpe.37000',
                 lenpen: float = 0.6,
                 beam_size: int = 4,
                 ckpt_steps: int = 1500):
        """Configure the module: record every hyper-parameter on the
        instance, then build the Transformer network, the train/val
        accuracy metrics, and the beam-search decoder.
        """
        super().__init__()

        # Store each hyper-parameter under an attribute of the same name.
        hparams = dict(src_lang=src_lang,
                       trg_lang=trg_lang,
                       vocab_size=vocab_size,
                       d_model=d_model,
                       nhead=nhead,
                       num_encoder_layers=num_encoder_layers,
                       num_decoder_layers=num_decoder_layers,
                       dim_feedforward=dim_feedforward,
                       dropout=dropout,
                       activation=activation,
                       warmup=warmup,
                       bpe_file=bpe_file,
                       lenpen=lenpen,
                       beam_size=beam_size,
                       ckpt_steps=ckpt_steps)
        for attr_name, attr_value in hparams.items():
            setattr(self, attr_name, attr_value)

        # The actual sequence-to-sequence network.
        self.model = model.Transformer(
            vocab_size=vocab_size,
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
        )
        self.train_acc = pl.metrics.Accuracy()
        self.val_acc = pl.metrics.Accuracy()
        # Beam-search decoder wrapping the freshly built model.
        self.search = search.Search(self.model, lenpen=lenpen)
示例#4
0
    def inference(self):
        """Forward propagation: embed the active feature indices and push
        them through a Transformer, storing the logits and their sigmoid
        probabilities on the instance (``self.y_out`` / ``self.y_out_prob``).
        """
        # Randomly initialised embedding table of shape (p, k).
        embed_table = tf.Variable(tf.truncated_normal(shape=[self.p, self.k],
                                                      mean=0,
                                                      stddev=0.01),
                                  dtype='float32')

        with tf.variable_scope('DNN', reuse=False):
            # Look up the embeddings for each sample's active features.
            embedded = tf.gather(embed_table, self.feature_inds)
            transformer = models.Transformer(
                hparams.Hparams().parser.parse_args())
            logits = transformer.result(embedded)

        self.y_out = logits
        self.y_out_prob = tf.sigmoid(self.y_out)
示例#5
0
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

from splitcross import SplitCrossEntropyLoss
criterion = None

ntokens = len(corpus.dictionary)
print('Total number of tokens:', ntokens)
# Alternative architectures kept for reference; only Transformer is active.
#model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
#model = model.BoomRNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
model = model.Transformer(args.model, ntokens, args.emsize, args.nhid,
                          args.nlayers, args.dropout, args.dropouth,
                          args.dropouti, args.dropoute, args.wdrop, args.tied)
#model = model.AttnRNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
#model = model.RecAttn(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
#model = model.LNRNN(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
#model = model.LNRR(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
###
if args.resume:
    print('Resuming model ...')
    model_load(args.resume)
    #optimizer.param_groups[0]['lr'] = args.lr
    # Re-apply the command-line dropout values to the restored model.
    # BUG FIX: the last assignment target used to be ``args.dropoute`` (a
    # no-op self-assignment), so the restored model's own ``dropoute`` was
    # never updated.
    model.dropouti, model.dropouth, model.dropout, model.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute
    #if args.wdrop:
    #    from weight_drop import WeightDrop
    #    for rnn in model.rnns:
    #        if type(rnn) == WeightDrop: rnn.dropout = args.wdrop
示例#6
0
# Optional interactive debugging before anything heavy happens.
if cl_args.debug:
    pdb.set_trace()

# Refuse to run with a non-positive epoch count.
if cl_args.epochs < 1:
    print('Invalid epochs: ', cl_args.epochs)
    exit()

# Get the datasets
# Builds the (train, test, val) splits for the requested language pair;
# `short_test` presumably truncates the data for quick runs — TODO confirm.
dataset = data.Data(cl_args.lang_code, cl_args.reverse)
train_dataset, test_dataset, val_dataset = dataset.get_dataset(cl_args.short_test)

# Transformer network
# Vocabulary sizes come from the dataset; positional-encoding lengths are
# fixed to the padded sequence size.
transformer = model.Transformer(param.NUM_LAYERS, param.D_MODEL, param.NUM_HEADS, param.DFF,
    input_vocab_size = dataset.inp_vocab_size,
    target_vocab_size = dataset.tar_vocab_size, 
    pe_input = param.PAD_SIZE, 
    pe_target = param.PAD_SIZE,
    rate=param.DROPOUT
)

# Running mean of the per-batch training loss.
train_loss = tf.metrics.Mean(name='train_loss')
optimizer = utils.optimizer
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.float32),
    tf.TensorSpec(shape=(None, None), dtype=tf.float32),
            "WARNING: Using the moviecorpus requires n_embd_d = 2, but is set to {}. It was automatically set to "
            "2! Please check your configuration!".format(params.clf_pipes))
        params.n_embd_d = 2
    # Gradient accumulation: with n_acc_batch micro-batches per step,
    # n_batch_train must divide evenly among them.
    if params.n_acc_batch > 1 and params.n_batch_train % params.n_acc_batch != 0:
        # BUG FIX: the message contained {} placeholders but was never
        # formatted, so the raised error showed literal braces instead of
        # the offending values.
        raise ValueError(
            "Gradient accumulation active, due to n_acc_batch = {}. n_batch_train is {} which is not "
            "divisible through n_acc_batch without rest, but must be!".format(
                params.n_acc_batch, params.n_batch_train))
    elif params.n_acc_batch > 1:
        # Shrink the per-step batch to the micro-batch size.
        params.n_batch_train = int(params.n_batch_train / params.n_acc_batch)
        params.gradient_accumulation = True
    else:
        params.gradient_accumulation = False

    # --- generate model as tensorflow graph (train) ------------------------------------------------------------------
    print("Generating model ...")
    transformer_decoder = model.Transformer(params=params,
                                            use_encoder=params.use_encoder)
    if params.use_encoder is False:  # original decoder model
        # Input: (batch, clf_pipes, context length, embedding dims + 1).
        X_train = tf.placeholder(
            tf.int32,
            [None, params.clf_pipes, params.n_ctx, params.n_embd_d + 1])
    else:  # with encoder-decoder model
        # Extra axis of size 2 carries encoder and decoder sequences.
        X_train = tf.placeholder(
            tf.int32,
            [None, 2, params.clf_pipes, params.n_ctx, params.n_embd_d + 1])
    M_train = tf.placeholder(tf.float32,
                             [None, params.clf_pipes, params.n_ctx])
    Y_train = tf.placeholder(tf.int32, [None])
    """
    This just defines and adds the node, not perform actual training. Training is performed in the train loop below
    - returns the result from all four (two) gpus after training and loss calculated (gradient descent also performed)
    
示例#8
0
        conv_width=7,
        max_relative_distance=24,
    )
    # Top-level hyper-parameters of the chatbot Transformer; per-layer
    # settings travel separately via `transformer_layer_kwargs`.
    architecture_specs = dict(
        num_subwords=8000,
        num_speakers=2,
        d_model=512,
        num_decoder_layers=7,
        num_encoder_layers=4,
        num_highway_layers=2,
        highway_dropout=0.1,
        embedding_dropout=0.1,
    )
    # Drop any Keras graph state left over from a previous run in-process.
    tf.keras.backend.clear_session()

    chatbot_model = model.Transformer(
        **architecture_specs, transformer_layer_kwargs=transformer_specs)
    #%%
    # Run one batch through the model so its variables get built before the
    # pretrained embeddings are loaded below.
    test_data_sample = next(iter(test_pipeline))

    chatbot_model(test_data_sample[0])

    word2v_embeddings = np.load('./model_components/w2v_embeddings.npy')

    # Overwrite the freshly initialised embedding matrix with word2vec weights.
    chatbot_model.embedding_layer.set_weights([word2v_embeddings])

    chatbot_model.summary()

    chatbot = chatbot_estimator.ChatBotTrainer(subword_processor,
                                               chatbot_model,
                                               model.TransformerOptimizer(
                                                   0.001,
示例#9
0
def main():
    """Run a Transformer over STFT magnitude spectrograms and dump, per
    sample: attention-alignment plots, reconstructed wav/figure files, and
    .npy snapshots of the prediction, the real input and the real target.
    """
    checkpoint_path = "./checkpoints{}/train".format(args.ckpt)

    transformer = model.Transformer(args.num_enc,
                                    args.num_dec,
                                    args.d_model,
                                    args.num_heads,
                                    args.dff,
                                    args.max_sequence_length,
                                    rate=args.dropout_rate)
    ckpt = tf.train.Checkpoint(transformer=transformer)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              checkpoint_path,
                                              max_to_keep=5)
    # NOTE(review): the restore below is commented out, so the model appears
    # to run with freshly initialised weights — confirm this is intended.
    #this_model = ckpt.restore(ckpt_manager.latest_checkpoint)

    # source magnitude of the STFT
    X_spec = np.load(args.enc_inp)  # (batch, d_model, seq_len)
    X_spec = np.transpose(X_spec, (0, 2, 1))  # (batch, seq_len, d_model)
    X_spec = X_spec[:, :, :-1]  # (batch, seq_len, d_model - 1)

    # real magnitude of the STFT. This is from target speaker. Only used for
    # listening, to compare target and prediction.
    Y_spec = np.load(args.tar_inp)
    Y_spec = Y_spec[:, :-1, 1:]  # (batch, d_model, seq)

    # test phase of the STFT
    X_phase = np.load(args.enc_phase_inp)
    X_phase = X_phase[:, :-1, :]  # (batch, d_model, seq)

    # real phase of the STFT. This is from target speaker. Only used for
    # listening, to compare target and prediction.
    Y_phase = np.load(args.tar_phase_inp)
    Y_phase = Y_phase[:, :-1, 1:]  # (batch, d_model, seq)

    name = 'ckpt={}'.format(args.ckpt)

    save_dir = './result/'
    # Process every sample in the batch independently.
    for i in range(len(X_spec)):

        inp_spec = X_spec[i]  # max_seq_len, d_model
        inp_pha = X_phase[i]  # d_model, seq_len

        predict_spec, attention_weights = evaluate(inp_spec, transformer)
        print("after predict, spec shape {}".format(np.shape(predict_spec)))

        # for make attention alignment map
        # Find the last index where the first frequency bin is non-zero,
        # i.e. the effective (unpadded) length of the spectrogram.
        spec_t = inp_spec.T  # d_model, seq_len
        idx_spec = np.argwhere(np.diff(np.r_[False, spec_t[0], False]))
        find_zero_spec = np.squeeze(idx_spec)
        zero_cnt = find_zero_spec[-1]

        # One attention plot per decoder layer.
        # NOTE(review): the layer count 6 is hard-coded; presumably it should
        # track args.num_dec — confirm.
        for x in range(6):

            plot = 'decoder_layer{}_block2'.format(x + 1)
            plot_attention_weights(attention_weights, plot, i + 1,
                                   zero_cnt)  # spec plot

        # Drop the start token and return to (d_model, seq_len) layout.
        predict_spec = predict_spec[1:, :]  # (seq_len, d_model)
        predict_spec = np.transpose(predict_spec, (1, 0))  # (d_model, seq_len)

        # y_hat wav, fig save
        # Combine predicted magnitude with the input phase for resynthesis.
        concat = predict_spec * inp_pha
        save_name = name + '_{}th'.format(i)
        for_save = os.path.join(save_dir, name)
        if not os.path.exists(for_save):
            os.makedirs(for_save)
        recover(concat, for_save, save_name)

        np_save_dir = 'np_file'
        np_dir = os.path.join(for_save, np_save_dir)
        if not os.path.exists(np_dir):
            os.makedirs(np_dir)
        # y_hat np file save
        save_np = '{}th_predict.result'.format(i)
        np_final_predict = os.path.join(np_dir, save_np)
        np.save(np_final_predict, concat)

        ########### check #######, x_real plot
        # x_real np file save
        x_real = inp_spec.T * X_phase[i]
        save_np_x_real = '{}th_x_real.result'.format(i)
        np_final_x_real = os.path.join(np_dir, save_np_x_real)
        np.save(np_final_x_real, x_real)

        # x_real wav, fig file save
        save_name_real = 'x_real_' + name + '_{}th'.format(i)
        for_save_real = os.path.join(save_dir, name)
        if not os.path.exists(for_save_real):
            os.makedirs(for_save_real)
        # np.save(for_save_real, real)
        recover(x_real, for_save_real, save_name_real)

        # y_real np file save
        y_real = Y_spec[i] * Y_phase[i]
        save_np_y_real = '{}th_y_real.result'.format(i)
        np_final_y_real = os.path.join(np_dir, save_np_y_real)
        np.save(np_final_y_real, y_real)

        # y_real wav, fig file save
        save_name_real = 'y_real_' + name + '_{}th'.format(i)
        for_save_real = os.path.join(save_dir, name)
        if not os.path.exists(for_save_real):
            os.makedirs(for_save_real)
        recover(y_real, for_save_real, save_name_real)
    decoder = model.Decoder(emb_size=args.emb_size,
                            hid_size=args.hid_size,
                            vocab_size=len(i2w),
                            num_layers=args.num_layers,
                            use_attn=args.use_attn)

    model = model.Seq2Seq(encoder=encoder,
                          decoder=decoder,
                          i2w=i2w,
                          use_knowledge=args.use_knowledge,
                          args=args,
                          test=True).cuda()
elif args.transformer:
    model = model.Transformer(i2w=i2w,
                              use_knowledge=args.use_knowledge,
                              args=args,
                              test=True).cuda()

# TEST EVALUATION
best_epoch = args.epoch
# Restore the chosen epoch's weights and switch to evaluation mode.
model.load("{0}/model_{1}.bin".format(args.save_path, best_epoch))
model.transformer.eval()

# Iterate over batches
num_batches = math.ceil(len(valid_freq) / args.batch_size)
cum_loss = 0  # summed loss over all evaluated batches
cum_words = 0  # summed token count, for per-word normalisation
predicted_sentences = []
indices = list(range(len(valid_freq)))
for batch in tqdm(range(num_batches)):
    # Prepare batch
示例#11
0
def main():
    """End-to-end training loop for a spectrogram-to-spectrogram
    Transformer: loads .npy encoder/decoder/target spectrograms, trains with
    teacher forcing, checkpoints periodically, and dumps attention plots,
    spectrogram figures, wav reconstructions and .npy snapshots.
    """
    # load dataset here

    spec_enc_inp = np.load(args.enc_inp)
    spec_enc_inp = spec_enc_inp.astype('float32')
    
    spec_dec_inp = np.load(args.dec_inp)
    spec_dec_inp = spec_dec_inp.astype('float32')
    
    spec_tar_inp = np.load(args.tar_inp)
    spec_tar_inp = spec_tar_inp.astype('float32')
    

    # Drop the last frequency bin, then move to (batch, seq_len, d_model).
    spec_enc_inp = spec_enc_inp[:, :-1, :] # batch, d_model, seq_len
    spec_dec_inp = spec_dec_inp[:, :-1, :] # batch, d_model, seq_len
    spec_tar_inp = spec_tar_inp[:, :-1, :] # batch, d_model, seq_len

    enc_inp_spec = np.transpose(spec_enc_inp, (0, 2, 1)) # batch, seq_len, d_model 
    dec_inp_spec = np.transpose(spec_dec_inp, (0, 2, 1)) # batch, seq_len, d_model
    tar_inp_spec = np.transpose(spec_tar_inp, (0, 2, 1)) # batch, seq_len, d_model
    
    print("enc_inp_spec shape {} dec_inp_spec shape {} tar_inp_spec shape {}".format(np.shape(enc_inp_spec), np.shape(dec_inp_spec), np.shape(tar_inp_spec)))

    ckpt_path = args.ckpt

    batch_size = args.batch_size
    buffer_size = 80
    EPOCHS = args.epochs

    train_dataset = input_fn(enc_inp_spec, dec_inp_spec, tar_inp_spec, batch_size, buffer_size)

    train_loss = tf.keras.metrics.Mean(name='train_loss')    
    
    transformer = model.Transformer(args.num_enc, args.num_dec, args.d_model, args.num_heads, args.dff, args.max_sequence_length, rate=args.dropout_rate)    
    
    # lr == 0 selects the warmup-style schedule from the paper; any other
    # value uses plain exponential decay starting from that value.
    if args.lr == 0:
        lr_schedule = model.CustomSchedule(args.d_model)
        print("lr is {}. Using schedule sampling learning rate".format(args.lr))
    else:
        initial_learning_rate = args.lr
        
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate,
        decay_steps=4000,
        decay_rate=0.96,
        staircase=True)
        print("lr is not schedule! We use {}".format(args.lr))

    optimizer = tf.keras.optimizers.Adam(lr_schedule, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    checkpoint_path = "./checkpoints{}/train".format(args.ckpt)

    ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=None)

    # writer = tf.summary.create_file_writer("/tmp/mylogs/eager")
    logdir = "logs/scalars{}/".format(args.ckpt) + datetime.now().strftime("%Y%m%d-%H%M%S")

    file_writer = tf.summary.create_file_writer(logdir + "/metrics")
    file_writer.set_as_default()

    # Resume from the latest checkpoint, if any.
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest checkpoint restored!!')

    # Fixed input signature avoids re-tracing for varying batch/seq sizes.
    train_step_signature = [
        tf.TensorSpec(shape=(None, None, None), dtype=tf.float32),
        tf.TensorSpec(shape=(None, None, None), dtype=tf.float32),
        tf.TensorSpec(shape=(None, None, None), dtype=tf.float32),
    ]

    @tf.function(input_signature=train_step_signature)
    def train_step(inp_spec, dec_spec, tar_spec):
        # One optimisation step: forward pass with teacher forcing,
        # loss against the target spectrogram, then gradient update.
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp_spec, dec_spec)

        with tf.GradientTape() as tape:
            predict_spec, attention_weight = transformer(inp_spec, dec_spec, True, enc_padding_mask, combined_mask, dec_padding_mask)
            
            loss = loss_function(tar_spec, predict_spec)

        gradient = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradient, transformer.trainable_variables))

        train_loss(loss)

        return predict_spec, attention_weight


    train_begin = 0
    train_elapsed = 0
    train_start = train_begin = time.time()
    for epoch in range(EPOCHS):
        start = epoch_begin = time.time()
        
        train_loss.reset_states()        

        # inp -> man, tar -> woman
        for (batch, (inp_spec, dec_spec, tar_spec)) in enumerate(train_dataset):

            # Keep the raw input of the most recent batch around so it can be
            # plotted/saved after the epoch finishes.
            name_before = 'before_predict_epoch={}'.format(int(epoch))
            result_before = inp_spec[0]
            result_before = np.transpose(result_before, (1, 0))

            result, attention_weight = train_step(inp_spec, dec_spec, tar_spec)

            if batch % 20 == 0:
                current = time.time()
                elapsed = current - start
                #epoch_elapsed = (current - epoch_begin) / 60.0
                #train_elapsed = (current - train_begin) / 3600.0
                train_elapsed = (current - train_start) / 3600.0 
                #print('Epoch {} Batch {} Loss {:.4f}, train elapsed: {:.2f}h '.format(epoch + 1, batch, train_loss.result(), train_elapsed))

        # Persist a checkpoint every 20 epochs.
        if (epoch + 1) % 20 == 0:
            ckpt_save_path = ckpt_manager.save()
            print('Saving checkpoint for epoch {} at {}'.format(epoch + 1,
                                                                ckpt_save_path))

        print('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))

        tf.summary.scalar('loss', data=train_loss.result(), step=epoch)

        print('Time taken for 1 epoch: {} secs, elapsed: {:.2f}h \n'.format(time.time() - start, train_elapsed))

        # Every 20 epochs: plot attention maps, cropped to the effective
        # (unpadded) lengths of input and decoder spectrograms.
        if epoch % 20 == 0:
            spec_t = inp_spec[0]
            spec_t = spec_t.numpy()
            spec_t = spec_t.T

            spec_tar = dec_spec[0]
            spec_tar = spec_tar.numpy()
            spec_tar_t = spec_tar.T
            
            # Effective length = last index where the first bin is non-zero.
            idx_spec = np.argwhere(np.diff(np.r_[False, spec_t[0], False]))            
            idx_tar = np.argwhere(np.diff(np.r_[False, spec_tar_t[0], False]))
            
            find_zero_spec = np.squeeze(idx_spec)            
            find_zero_spec_tar = np.squeeze(idx_tar)
            
            zero_cnt = find_zero_spec[-1]            
            zero_cnt_tar = find_zero_spec_tar[-1]
            

            # NOTE(review): 6 decoder layers hard-coded here; presumably
            # should follow args.num_dec — confirm.
            for x in range(6):
                plot = 'decoder_layer{}_block2'.format(x + 1)

                plot_attention_weights(attention_weight, plot, epoch, zero_cnt, zero_cnt_tar)  # spec plot attention weights

        if epoch % 5 == 0: # Get results from trainset every 5 epochs
            epc = int(epoch)
            name_after = 'after_predict_epoch={}'.format(epc)
            result_after = result[0]
            result_after = np.transpose(result_after, (1, 0))

            # The dataset before training (original input)
            plt.figure(figsize=(10, 4))
            librosa.display.specshow(librosa.amplitude_to_db(result_before, ref=np.max), y_axis='hz', x_axis='time',
                                     sr=16000, hop_length=args.hop)
            plt.title(name_before)
            plt.colorbar(format='%+2.0f dB')
            plt.tight_layout()
            fig_save_dir = './result/' + ckpt_path + '_fig/'
            if not os.path.exists(fig_save_dir):
                os.makedirs(fig_save_dir)
            plt.savefig(fig_save_dir + name_before + '.png')
            plt.cla()
            plt.close()

            make_wav = librosa.istft(result_before, hop_length=args.hop)
            wav_save_dir = './result/' + ckpt_path + '_wav/'
            if not os.path.exists(wav_save_dir):
                os.makedirs(wav_save_dir)
            sf.write(wav_save_dir + name_before + '.wav', make_wav, 16000, format='WAV', endian='LITTLE',
                     subtype='PCM_16')

            # Results after training from trainset (y_hat)
            plt.figure(figsize=(10, 4))
            librosa.display.specshow(librosa.amplitude_to_db(result_after, ref=np.max), y_axis='hz', x_axis='time',
                                     sr=16000, hop_length=args.hop)
            plt.title(name_after)
            plt.colorbar(format='%+2.0f dB')
            plt.tight_layout()
            plt.savefig(fig_save_dir + name_after + '.png')
            plt.cla()
            plt.close()

            make_wav = librosa.istft(result_after, hop_length=args.hop)
            sf.write(wav_save_dir + name_after + '.wav', make_wav, 16000, format='WAV', endian='LITTLE',
                     subtype='PCM_16')

            # Real input (source)
            save_tar = tar_spec[0]
            save_tar = np.transpose(save_tar, (1, 0))
            plt.figure(figsize=(10, 4))
            librosa.display.specshow(librosa.amplitude_to_db(save_tar, ref=np.max), y_axis='hz', x_axis='time',
                                     sr=16000, hop_length=args.hop)
            real_name = 'real_epoch={}'.format(int(epoch))
            plt.title(real_name)
            plt.colorbar(format='%+2.0f dB')
            plt.tight_layout()
            fig_save_dir = './result/' + ckpt_path + '_fig/'
            if not os.path.exists(fig_save_dir):
                os.makedirs(fig_save_dir)
            plt.savefig(fig_save_dir + real_name + '.png')
            plt.cla()
            plt.close()

            make_wav = librosa.istft(save_tar, hop_length=args.hop)

            wav_save_dir = './result/' + ckpt_path + '_wav/'
            if not os.path.exists(wav_save_dir):
                os.makedirs(wav_save_dir)
            sf.write(wav_save_dir + real_name + '.wav', make_wav, 16000, format='WAV', endian='LITTLE',
                     subtype='PCM_16')

            # Numpy file before training
            np_save_dir = './result/' + ckpt_path + '_np_file/'
            if not os.path.exists(np_save_dir):
                os.makedirs(np_save_dir)
            np.save(np_save_dir + name_before, result_before)

            # Numpy file after training trainset
            np_save_dir = './result/' + ckpt_path + '_np_file/'
            if not os.path.exists(np_save_dir):
                os.makedirs(np_save_dir)
            np.save(np_save_dir + name_after, result_after)

            # Real Numpy file (source)
            np_save_dir = './result/' + ckpt_path + '_np_file/'
            if not os.path.exists(np_save_dir):
                os.makedirs(np_save_dir)
            real_name = 'y_real_epoch={}'.format(epc)

            np.save(np_save_dir + real_name, save_tar)
示例#12
0
    config = config.Config()

    # Optional CLI overrides for the dataset locations.
    train_file = '../data/ag_news.train'
    # NOTE(review): reading sys.argv[1] only when len(sys.argv) > 2 (two
    # extra args present) looks off by one — confirm whether a single
    # argument was meant to override the train file.
    if len(sys.argv) > 2:
        train_file = sys.argv[1]

    test_file = '../data/ag_news.test'
    if len(sys.argv) > 3:
        test_file = sys.argv[2]

    dataset = utils.Dataset(config)
    dataset.load_data(train_file, test_file)

    # Create Model with specified optimizer and loss function
    ##############################################################
    # Note: this rebinding shadows the imported `model` module from here on.
    model = model.Transformer(config, len(dataset.vocab))
    if torch.cuda.is_available():
        model.cuda()
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    NLLLoss = nn.NLLLoss()
    model.add_optimizer(optimizer)
    model.add_loss_op(NLLLoss)
    ##############################################################

    train_losses = []
    val_accuracies = []

    for i in range(config.max_epochs):
        print("Epoch: {}".format(i))
        train_loss, val_accuracy = model.run_epoch(dataset.train_iterator,