Примеры использования TextGenerationModel.forward на Python

Язык программирования: Python

Пространство имен/Пакет: part2.model

Класс/Тип: TextGenerationModel

Метод/Функция: forward

Примеров на hotexamples.com: 7

Python TextGenerationModel.forward — найдено 7 примеров. Это лучшие примеры кода на Python для part2.model.TextGenerationModel.forward, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

TextGenerationModel(13)

parameters(10)

forward(7)

to(3)

load_state_dict(2)

sample(2)

state_dict(2)

zero_grad(2)

generate_sentence(1)

reset_lstm(1)

reset_stepper(1)

step(1)

train(1)

Пример #1

Показать файл

Файл: train.py Проект: jelrae/deep_learning_assign

def train(config):
    """Train a ``TextGenerationModel`` on the text file named in ``config``.

    Reads ``txt_file``, ``seq_length``, ``batch_size``, ``lstm_num_hidden``,
    ``lstm_num_layers``, ``learning_rate``, ``max_norm``, ``print_every``,
    ``sample_every`` and ``train_steps`` from ``config``. Progress is printed
    every ``print_every`` steps; nothing is returned.
    """

    # Initialize the device which to run the model on
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length,
                                dataset.vocab_size, config.lstm_num_hidden,
                                config.lstm_num_layers, device)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(),
                                    lr=config.learning_rate)

    # NOTE(review): batches are handed to the model exactly as the loader
    # delivers them; this function never moves them to `device` itself.
    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        model_out = model.forward(batch_inputs)
        loss = criterion(model_out, batch_targets)
        optimizer.zero_grad()
        loss.backward()

        # Clip after backward() and before the optimizer step.  The
        # underscore-less `clip_grad_norm` is deprecated in favour of the
        # in-place `clip_grad_norm_`.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        loss = loss.item()

        # Stay in torch for the accuracy: feeding a tensor to numpy only
        # works for CPU tensors, whereas this also works on CUDA.
        predicted = torch.max(model_out, 1)[1]
        accuracy = (predicted == batch_targets).float().mean().item()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % config.print_every == 0:

            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracy, loss))

        if step == config.sample_every:
            # Placeholder: sentence sampling is not implemented here.
            pass

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')

Пример #2

Показать файл

def train(config):
    """Train a ``TextGenerationModel`` for ``config.train_steps`` batches.

    Besides training, the function
    * records accuracy/loss every ``config.seval_every`` batches,
    * prints progress every ``config.print_every`` batches,
    * samples a string from the model every ``config.sample_every`` batches
      and writes it to ``config.samples_out_file`` (or prints it when that
      value is the literal ``"STDOUT"``),
    * optionally saves the model (``config.model_out_file``) and the training
      curves (``config.curves_out_file``) when those are not ``None``.

    Nothing is returned.
    """

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length,
                                dataset.vocab_size, config.lstm_num_hidden,
                                config.lstm_num_layers, device)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), config.learning_rate)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=config.learning_rate_step,
                                          gamma=config.learning_rate_decay)

    accuracy_train = []
    loss_train = []

    # The loader length does not change, so derive the epoch layout once:
    # `full_epochs` complete passes followed by one partial pass of
    # `leftover` batches.
    batches_per_epoch = len(data_loader)
    full_epochs, leftover = divmod(config.train_steps, batches_per_epoch)
    epochs = full_epochs + 1

    write_to_file = config.samples_out_file != "STDOUT"
    samples_out_file = (open(config.samples_out_file, 'w')
                        if write_to_file else None)

    # try/finally guarantees the samples file is closed even when training
    # is interrupted by an exception.
    try:
        print(
            "Will train on {} batches in {} epochs, max {} batches/epoch.".format(
                config.train_steps, epochs, batches_per_epoch))

        for epoch in range(epochs):
            data_loader_iter = iter(data_loader)

            batches = leftover if epoch == full_epochs else batches_per_epoch

            for step in range(batches):
                # 1-based count of batches seen so far across all epochs.
                global_step = epoch * batches_per_epoch + step + 1

                batch_inputs, batch_targets = next(data_loader_iter)

                # Only for time measurement of step through network
                t1 = time.time()

                batch_inputs = F.one_hot(
                    batch_inputs,
                    num_classes=dataset.vocab_size,
                ).float().to(device)
                batch_targets = batch_targets.to(device)

                # The optimizer owns every model parameter, so a single
                # zero_grad() on it is enough.
                optimizer.zero_grad()

                pred, _ = model.forward(batch_inputs)
                loss = criterion(pred.transpose(2, 1), batch_targets)
                accuracy = acc(
                    pred.transpose(2, 1),
                    F.one_hot(batch_targets,
                              num_classes=dataset.vocab_size).float(),
                    dataset.vocab_size)
                loss.backward()

                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_norm=config.max_norm)
                optimizer.step()

                # Just for time measurement
                t2 = time.time()
                examples_per_second = config.batch_size / float(t2 - t1)

                scheduler.step()

                if global_step % config.seval_every == 0:
                    accuracy_train.append(accuracy)
                    loss_train.append(loss.item())

                if global_step % config.print_every == 0:
                    print(
                        "[{}] Epoch: {:04d}/{:04d}, Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                        "Accuracy = {:.2f}, Loss = {:.3f}".format(
                            datetime.now().strftime("%Y-%m-%d %H:%M"),
                            epoch + 1, epochs, global_step,
                            config.train_steps, config.batch_size,
                            examples_per_second, accuracy, loss))

                if global_step % config.sample_every == 0:
                    with torch.no_grad():
                        codes = []

                        # Start from a random one-hot character.
                        input_tensor = torch.zeros((1, 1, dataset.vocab_size),
                                                   device=device)
                        input_tensor[0, 0,
                                     np.random.randint(0, dataset.vocab_size)] = 1

                        for _ in range(config.seq_length - 1):
                            response = model.step(input_tensor)
                            logits = F.log_softmax(config.temp * response,
                                                   dim=1)
                            dist = torch.distributions.one_hot_categorical.OneHotCategorical(
                                logits=logits)
                            code = dist.sample().argmax().item()
                            # Feed the sampled character back in as the
                            # next one-hot input.
                            input_tensor *= 0
                            input_tensor[0, 0, code] = 1
                            codes.append(code)

                        string = dataset.convert_to_string(codes)
                        model.reset_stepper()

                        if samples_out_file is not None:
                            samples_out_file.write(
                                "Step {}: ".format(global_step) + string +
                                "\n")
                        else:
                            print(string)
    finally:
        if samples_out_file is not None:
            samples_out_file.close()

    if config.model_out_file is not None:
        torch.save(model, config.model_out_file)

    if config.curves_out_file is not None:
        if loss_train:
            import matplotlib.pyplot as plt
            fig, ax = plt.subplots(1, 2, figsize=(10, 5))
            fig.suptitle(
                'Training curves for Pytorch 2-layer LSTM.\nFinal loss: {:.4f}. Final accuracy: {:.4f}\nSequence length: {}, Hidden units: {}, LSTM layers: {}, Learning rate: {:.4f}'
                .format(loss_train[-1], accuracy_train[-1], config.seq_length,
                        config.lstm_num_hidden, config.lstm_num_layers,
                        config.learning_rate))
            plt.subplots_adjust(top=0.8)

            ax[0].set_title('Loss')
            ax[0].set_ylabel('Loss value')
            ax[0].set_xlabel('No of batches seen x{}'.format(
                config.seval_every))
            ax[0].plot(loss_train, label='Train')
            ax[0].legend()

            ax[1].set_title('Accuracy')
            ax[1].set_ylabel('Accuracy value')
            ax[1].set_xlabel('No of batches seen x{}'.format(
                config.seval_every))
            ax[1].plot(accuracy_train, label='Train')
            ax[1].legend()

            plt.savefig(config.curves_out_file)
        else:
            # No evaluation point was ever recorded (train_steps smaller
            # than seval_every), so there is nothing to plot.
            print('No evaluation points recorded; skipping training curves.')

    print('Done training.')

Пример #3

Показать файл

Файл: train.py Проект: hinriksnaer/LSTMNetwork

def train(config):
    """Train a ``TextGenerationModel`` and store training/sampling summaries.

    Runs ``config.train_steps`` optimisation steps, cycling through the data
    loader as often as needed.  Every ``config.print_every`` steps the
    accuracy and loss are printed and recorded; every ``config.sample_every``
    steps a sentence is generated from the model.  At the end both summaries
    are written as ``;``-separated CSV files into ``config.summary_path``.

    Nothing is returned.
    """

    # Initialize the device which to run the model on
    dev = "cuda:0" if torch.cuda.is_available() else "cpu"
    device = torch.device(dev)

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length,
                                dataset.vocab_size, config.lstm_num_hidden,
                                config.lstm_num_layers,
                                device).to(device)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(),
                                    lr=config.learning_rate)

    total_steps = 0

    # First row of each summary is the header row.
    training_summary = [['Total steps', 'Accuracy', 'Loss']]
    sampling_summary = [['Total steps', 'Sentence']]

    while config.train_steps > total_steps:

        for batch_inputs, batch_targets in data_loader:

            total_steps += 1

            if total_steps > config.train_steps:
                break

            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            # Only for time measurement of step through network
            t1 = time.time()

            batch_inputs = torch.nn.functional.one_hot(batch_inputs,
                                                       dataset.vocab_size)
            optimizer.zero_grad()
            output = model.forward(batch_inputs)

            # Average the cross-entropy over the positions along dim 1 of
            # the model output.
            seq_len = len(output[0])
            loss = 0.0
            for i in range(seq_len):
                loss += criterion(output[:, i, :],
                                  batch_targets[:, i]) / seq_len

            loss.backward()

            # Clipping must happen after backward() has produced the
            # gradients and before the optimizer consumes them; doing it
            # earlier would clip the freshly zeroed gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=config.max_norm)

            optimizer.step()

            with torch.no_grad():
                total_size = 0
                correct = 0

                for i in range(seq_len):
                    # softmax is monotonic, so the arg-max of the raw
                    # scores is the predicted class.
                    pred = torch.max(output[:, i, :], 1)[1]

                    correct += pred.eq(batch_targets[:, i]).sum().item()
                    total_size += len(pred)

                accuracy = correct / total_size

                # Just for time measurement
                t2 = time.time()
                examples_per_second = config.batch_size / float(t2 - t1)

                if total_steps % config.print_every == 0:
                    training_summary.append(
                        [total_steps, accuracy,
                         loss.item()])
                    print(
                        "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                        "Accuracy = {:.2f}, Loss = {:.3f}".format(
                            datetime.now().strftime("%Y-%m-%d %H:%M"),
                            total_steps, int(config.train_steps),
                            config.batch_size, examples_per_second, accuracy,
                            loss.item()))

                if total_steps % config.sample_every == 0:
                    # Generate a sentence by sampling from the model,
                    # starting from one random character.
                    text = torch.zeros(
                        (1, 1)).long().random_(0,
                                               dataset.vocab_size).to(device)
                    text = torch.nn.functional.one_hot(text,
                                                       dataset.vocab_size)

                    # `config.temprature` (sic) is the attribute name the
                    # configuration object exposes; None means greedy
                    # decoding.
                    use_sampling = config.temprature is not None
                    temperature = config.temprature if use_sampling else 1

                    for i in range(config.seq_length - 1):
                        # Re-run the model on everything generated so far
                        # and read the scores of the latest position.
                        prediction = model.forward(text)
                        pred = torch.nn.functional.softmax(
                            temperature * prediction[:, i, :], dim=1)

                        if use_sampling:
                            m = torch.distributions.categorical.Categorical(
                                pred)
                            pred = m.sample()
                        else:
                            pred = torch.max(pred, 1)[1]
                        pred = torch.nn.functional.one_hot(
                            pred, dataset.vocab_size)
                        pred = pred.unsqueeze(0)
                        text = torch.cat((text, pred), 1)

                    # Decode once, after generation has finished; this also
                    # keeps `sentence` defined when the loop body never ran.
                    char_ids = torch.argmax(text[0], 1)
                    sentence = dataset.convert_to_string(char_ids.tolist())
                    print(sentence)
                    sampling_summary.append([total_steps, sentence])

            if total_steps == config.train_steps:
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                break

    print('Done training.')
    print('Storing data')

    os.makedirs(config.summary_path, exist_ok=True)

    training_summary = pd.DataFrame(training_summary)
    sampling_summary = pd.DataFrame(sampling_summary)

    # os.path.join yields a correct path whether or not summary_path ends
    # with a separator.
    training_summary.to_csv(os.path.join(config.summary_path,
                                         "training_summary.csv"),
                            header=False,
                            index=False,
                            sep=';')
    sampling_summary.to_csv(os.path.join(config.summary_path,
                                         "sampling_summary.csv"),
                            header=False,
                            index=False,
                            sep=';')
    print('Finished')

Пример #4

Показать файл

def train(config):
    """Train a ``TextGenerationModel`` and log loss/accuracy to TensorBoard.

    Seeds torch and numpy with 42, builds dataset, loader, model, Adam
    optimizer and a StepLR scheduler from ``config``, then iterates over the
    data loader once (stopping when the step index equals
    ``config.train_steps``).  Progress is printed every
    ``config.print_every`` steps, ``sample_from_model`` is called every
    ``config.sample_every`` steps, and the trained model is saved to
    ``trained_model_part2.pth``.  Nothing is returned.
    """
    # Fixed seed for both RNGs used during training.
    rng_seed = 42
    torch.manual_seed(rng_seed)
    np.random.seed(rng_seed)

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    writer = SummaryWriter()

    # Pull the hyper-parameters out of the config in one go.
    chars_per_seq, per_batch, hidden_units, layer_count, keep_prob = (
        config.seq_length,
        config.batch_size,
        config.lstm_num_hidden,
        config.lstm_num_layers,
        config.dropout_keep_prob,
    )

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, chars_per_seq)
    data_loader = DataLoader(dataset, per_batch, num_workers=1)

    n_symbols = dataset.vocab_size

    # Initialize the model that we are going to use
    model = TextGenerationModel(per_batch, chars_per_seq, n_symbols,
                                hidden_units, layer_count, keep_prob, device)
    model.to(device)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config.learning_rate_step,
        gamma=config.learning_rate_decay)

    progress_template = (
        "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
        "Accuracy = {:.2f}, Loss = {:.3f}")

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        started = time.time()

        # Inputs stay as indices (the one-hot alternative was dropped in
        # favour of an embedding).
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        train_output, _ = model.forward(batch_inputs)

        loss = criterion(train_output, batch_targets)

        # Share of target positions whose arg-max over dim 1 is correct.
        hits = torch.argmax(train_output, dim=1).eq(batch_targets)
        n_positions = batch_targets.size(0) * batch_targets.size(1)
        accuracy = hits.sum().item() / n_positions

        writer.add_scalar('Loss/train', loss.item(), step)
        writer.add_scalar('Accuracy/train', accuracy, step)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step(step)

        # Just for time measurement
        finished = time.time()
        examples_per_second = config.batch_size / float(finished - started)

        if step % config.print_every == 0:
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
            print(
                progress_template.format(timestamp, step,
                                         int(config.train_steps),
                                         config.batch_size,
                                         examples_per_second, accuracy, loss))

        if step % config.sample_every == 0:
            # Generate some sentences by sampling from the model
            sample_from_model(config, step, model, dataset)

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
    torch.save(model, "trained_model_part2.pth")
    writer.close()

Пример #5

Показать файл

def train(config):
    """Train a ``TextGenerationModel`` and periodically sample sentences.

    Iterates over the data loader once (stopping when the step index equals
    ``config.train_steps``).  Every ``config.print_every`` steps the accuracy
    and loss are printed and recorded; every ``config.sample_every`` steps
    sentences of several fixed lengths are generated and printed.  At the
    end the recorded curves are written to
    ``acc_loss_T_<config.temperature_int>.txt``.  Nothing is returned.
    """

    def acc(predictions, targets):
        """Return the mean of (arg-max over dim 2 == target) as a tensor."""
        return (predictions.argmax(dim=2) == targets).float().mean()

    # Initialize the device which to run the model on
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print("Device", device)
    print("book:", config.txt_file)

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Initialize the model that we are going to use
    model = TextGenerationModel(config.batch_size, config.seq_length, dataset._vocab_size,
                 config.lstm_num_hidden, config.lstm_num_layers, device).to(device)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    gen_lengths = [20, 30, 100, 200]
    print("temperature:", config.temperature_int)

    all_accuracies = []
    all_losses = []
    all_train_steps = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        # Comparing against arange broadcasts the indices into a one-hot mask.
        batch_inputs = (torch.arange(dataset._vocab_size) == batch_inputs[..., None])

        # Only for time measurement of step through network
        t1 = time.time()

        # set the data to device
        batch_inputs = batch_inputs.float().to(device)
        batch_targets = batch_targets.to(device)

        out, _ = model.forward(batch_inputs)  # forward pass

        loss = criterion(out.permute(0, 2, 1), batch_targets)  # calculate the loss
        accuracy = acc(out, batch_targets)  # calculate the accuracy

        optimizer.zero_grad()  # throw away previous grads

        loss.backward()  # calculate new gradients

        # make sure the gradients do not explode
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)

        optimizer.step()  # update the weights

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size/float(t2-t1)

        if step % config.print_every == 0:

            # int() lets the step total print as an integer even when the
            # configured value is a float such as 1e6.
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    int(config.train_steps), config.batch_size, examples_per_second,
                    accuracy, loss
            ))

            all_accuracies.append(accuracy.item())
            all_losses.append(loss.item())
            all_train_steps.append(step)

        if step % config.sample_every == 0:

            # Sampling needs no gradients; no_grad avoids building an
            # autograd graph for every generated character.
            with torch.no_grad():
                for gen_length in gen_lengths:
                    print("Generated sentence with length of {}".format(gen_length))

                    previous = random.randint(0, dataset._vocab_size - 1)  # get the first random letter
                    letters = [previous]
                    cell = None  # second argument/return of model.forward, threaded through the loop

                    for _ in range(gen_length):
                        # one-hot encode the previous character
                        char_input = torch.zeros(1, 1, dataset._vocab_size).to(device)
                        char_input[0, 0, previous] = 1

                        # do a forward pass
                        out, cell = model.forward(char_input, cell)

                        # get the next letter
                        out = out.squeeze()
                        # NOTE(review): the identity check means only the
                        # literal True enables sampling; confirm that
                        # config.temperature is always a bool.
                        if config.temperature is True:
                            out = out * config.temperature_int
                            out = torch.softmax(out, dim=0)
                            previous = torch.multinomial(out, 1)[0].item()
                        else:
                            previous = out.argmax().item()

                        letters.append(previous)

                    # convert to sentence
                    sentence = dataset.convert_to_string(letters)
                    print(sentence)

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    with open("acc_loss_T_{}.txt".format(config.temperature_int), "w") as output:
        output.write("accuracies \n")
        output.write(str(all_accuracies) + "\n")
        output.write("losses \n")
        output.write(str(all_losses) + "\n")
        output.write("train steps \n")
        output.write(str(all_train_steps) + "\n")

    print('Done training.')

Пример #6

Показать файл

def train(config):
    """Train a ``TextGenerationModel`` for ten passes over the data loader.

    Every ``config.print_every`` steps the current speed, accuracy and loss
    are printed, five sample texts are obtained from ``generator`` and both
    printed and appended to ``generated.txt``.  Nothing is returned.
    """
    print(config.train_steps)
    device = torch.device(config.device)

    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    model = TextGenerationModel(config.batch_size, config.seq_length, dataset.vocab_size, config.lstm_num_hidden,
                                config.lstm_num_layers,
                                device)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=config.learning_rate)

    # Labels attached to the five texts returned by `generator`, in order.
    sample_labels = ("1.5", "1", "0.9", "0.5", "0.2")

    # NOTE(review): the number of passes (10) and the per-pass step limit
    # (30000) are hard-coded; config.train_steps is only printed.
    for _ in range(10):
        for step, (batch_inputs, batch_targets) in enumerate(data_loader):

            # Only for time measurement of step through network
            t1 = time.time()

            # The loader delivers a sequence of tensors; stack them along
            # dim 1 into a single tensor.
            x = torch.stack(batch_inputs, dim=1).to(device)

            # one hot
            encoded_size = list(x.shape)
            encoded_size.append(dataset.vocab_size)
            one_hot = torch.zeros(encoded_size, device=x.device)
            one_hot.scatter_(2, x.unsqueeze(-1), 1)

            targets = torch.stack(batch_targets, dim=1).to(device)

            predictions = model.forward(one_hot)
            loss = criterion(predictions.transpose(2, 1), targets)
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

            loss = loss.item()

            size = targets.shape[0] * targets.shape[1]
            accuracy = torch.sum(predictions.argmax(dim=2) == targets).to(torch.float32) / size
            # Just for time measurement
            t2 = time.time()
            examples_per_second = config.batch_size/float(t2-t1)

            if step % config.print_every == 0:

                print("examples per sec " + str(examples_per_second)+" step "+str(step)+" accuracy "+str(accuracy.item()) +" loss "+str(loss))

                # Generate some sentences by sampling from the model
                random_seed = torch.randint(low=0, high=dataset.vocab_size, size=(1, 1), dtype=torch.long, device=device)

                text_fifteen, text, temp_nine, temp_five, temp_one = generator(model=model, seed=random_seed, length=config.seq_length, dataset=dataset)
                samples = (text_fifteen, text, temp_nine, temp_five, temp_one)

                for label, sample in zip(sample_labels, samples):
                    print("temp " + label + ": " + sample)
                print("")

                # The context manager closes the file even if a write fails.
                with open("generated.txt", "a") as samples_file:
                    for label, sample in zip(sample_labels, samples):
                        samples_file.write("beta " + label + ": " + sample + "\n")

            if step == 30000:
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                break

    print('Done training.')

Пример #7

Показать файл

Файл: train.py Проект: freelectron/dl-course-work

def train(config):
    """Train a ``TextGenerationModel`` on CPU for ``config.train_steps`` steps.

    Cycles through the data loader until the requested number of
    optimisation steps has been performed.  Every ``config.print_every``
    steps (counted within the current pass) the mean loss and accuracy of
    the last 50 steps are recorded and progress is printed; on those steps
    that are also multiples of ``config.sample_every`` text is generated
    with ``text_generator``.

    Returns:
        A 4-tuple ``(mean_loss_list, mean_accuracy_list,
        text_greedy_generated, text_random_generated)``; the two dicts are
        keyed by the length of ``mean_accuracy_list`` at sampling time.
    """

    # some additional vars
    learning_rate = config.learning_rate

    # TODO: Initialize the device which to run the model on
    device = torch.device('cpu')

    dataset = TextDataset(config.txt_file, config.seq_length)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Initialize the model that we are going to use
    model = TextGenerationModel(vocabulary_size=dataset.vocab_size, device='cpu', **config.__dict__)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)

    # evaluation
    loss_list = list()
    accuracy_list = list()

    mean_loss_list = list()
    mean_accuracy_list = list()

    # Number of optimisation steps completed so far, across all passes.
    steps_total = 0

    text_greedy_generated = dict()
    text_random_generated = dict()

    while steps_total < config.train_steps:
        steps_at_pass_start = steps_total

        for step, (X_transposed, y_transposed) in enumerate(data_loader):
            # Stop as soon as the requested number of steps is reached,
            # regardless of the print/sample cadence.
            if steps_total >= config.train_steps:
                break

            # Only for time measurement of step through network
            t1 = time.time()

            X_batch = torch.stack(X_transposed).t()
            Y_batch = torch.stack(y_transposed).t()

            X = X_batch.to(device)
            y = Y_batch.to(device)

            # One-hot encode the indices along a new last dimension.
            X = torch.zeros(len(X), config.seq_length, dataset.vocab_size).scatter_(2, X.unsqueeze(2), 1)

            optimizer.zero_grad()
            outputs = model.forward(X).type(dtype)

            loss_current = criterion(outputs.transpose(2, 1), y)
            # NOTE(review): retain_graph=True is kept from the original; it
            # is only needed if the model reuses state across batches.
            loss_current.backward(retain_graph=True)
            optimizer.step()

            # evaluation -- stored as plain floats so np.mean and the
            # format string below never have to handle tensors
            loss = loss_current.detach().item()
            accuracy = ((outputs.argmax(dim=2) == y.long()).sum().float() /
                        (float(y.shape[0]) * float(y.shape[1]))).item()

            # Just for time measurement
            t2 = time.time()
            examples_per_second = config.batch_size/float(t2-t1)

            loss_list.append(loss)
            accuracy_list.append(accuracy)

            if step % config.print_every == 0:

                mean_loss_list.append(np.mean(loss_list[-50:]))
                mean_accuracy_list.append(np.mean(accuracy_list[-50:]))

                print("[{}] Train Step {}/{}, Batch Size = {}, Examples/Sec = {:.2f}, "
                      "Accuracy = {:.2f}, Loss = {:.3f}".format(
                        datetime.now().strftime("%Y-%m-%d %H:%M"), steps_total,
                        config.train_steps, config.batch_size, examples_per_second,
                        accuracy, loss
                ))

                # Text generation
                if step % config.sample_every == 0:
                    # Generate some sentences by sampling from the model
                    text_greedy, text_random = text_generator(model, config.seq_length, 0.2, dataset, device)
                    text_greedy_generated[len(mean_accuracy_list)] = text_greedy
                    text_random_generated[len(mean_accuracy_list)] = text_random
                    print(text_greedy, len(text_greedy))
                    print(text_random, len(text_random))

            steps_total += 1

        if steps_total == steps_at_pass_start:
            # The loader produced no batches; bail out instead of spinning
            # forever.
            break

    print('Done training.')
    return mean_loss_list, mean_accuracy_list, text_greedy_generated, text_random_generated