def evaluate(model,
             data,
             loss_function,
             word_to_ix,
             all_losses_dev,
             name='dev'):
    model.eval()
    avg_loss = 0.0
    truth_res = []
    pred_res = []
    for sentwords, dgr in data:
        dgr = dataLoaderRegresser.prepare_degree(dgr)  # gold degree as a tensor
        truth_res.append(dgr)
        # re-initialise the hidden state, detaching it from the previous instance's history
        model.hidden = model.init_hidden()
        sent = dataLoaderRegresser.prepare_sequence_pretrainedVec(sentwords)
        pred = model(sent)
        #print('->gold-degree %.4f, predicted-degree %.4f %s' % (dgr.item(), pred.item(), sentwords))
        pred_res.append(pred)
        loss = loss_function(pred, dgr)
        avg_loss += loss.item()
    avg_loss /= len(data)
    all_losses_dev.append(avg_loss)
    print(name + ' avg_loss: %g' % avg_loss)
    return avg_loss, pred_res, all_losses_dev
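
# The evaluation above runs its forward passes with autograd tracking enabled. Below is a
# minimal sketch of a wrapper that disables it; the reported loss is identical, only memory
# use goes down. The helper name evaluate_no_grad is an illustrative assumption, not part
# of the original script.
def evaluate_no_grad(model, data, loss_function, word_to_ix, all_losses_dev,
                     name='dev'):
    import torch  # assumes PyTorch is available; imported locally to keep the sketch self-contained
    with torch.no_grad():
        return evaluate(model, data, loss_function, word_to_ix,
                        all_losses_dev, name=name)
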
def train_epoch(model, train_data, loss_function, optimizer, word_to_ix, i,
                all_losses):
    model.train()
    # https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch
    # model.train() tells the model that it is being trained, so layers such as dropout
    # and batch norm, which behave differently during training and evaluation, know which
    # mode they are in. Call model.eval() or model.train(mode=False) before testing.

    avg_loss = 0.0
    count = 0
    truth_res = []
    pred_res = []

    loss_plot = 0.0
    # shuffle and train on a random 70% subsample of the training data each epoch
    random.shuffle(train_data)
    train_data = train_data[:int(0.7 * len(train_data))]
    for sentwords, dgr in train_data:
        dgr = dataLoaderRegresser.prepare_degree(dgr)  # gold degree as a tensor
        truth_res.append(dgr)
        # re-initialise the hidden state, detaching it from the previous instance's history
        model.hidden = model.init_hidden()
        sent = dataLoaderRegresser.prepare_sequence_pretrainedVec(sentwords)
        pred = model(sent)
        #print('->gold-degree %.4f, predicted-degree %.4f %s' % (dgr.item(), pred.item(), sentwords))
        pred_res.append(pred)
        optimizer.zero_grad()
        # https://discuss.pytorch.org/t/model-zero-grad-or-optimizer-zero-grad/28426/2
        # With optimizer = optim.SGD(model.parameters()), model.zero_grad() and
        # optimizer.zero_grad() are equivalent; the same holds for Adam, RMSprop, etc.
        loss = loss_function(pred, dgr)
        avg_loss += loss.item()  # https://github.com/pytorch/pytorch/issues/6061
        loss_plot += loss.item()
        count += 1
        printEvery = 50.0
        if count % printEvery == 0:  # report the running average loss every 50 sentences
            print('epoch: %d iterations: %d loss: %g' %
                  (i, count, loss_plot / printEvery))
            loss_plot = 0.0
        # Calling .backward() multiple times accumulates the gradient (by addition) for each
        # parameter, which is why the gradients are zeroed (optimizer.zero_grad() above)
        # before every backward pass; see the small demo after this function. Note that after
        # the first .backward() call, a second call is only possible after another forward pass.
        loss.backward()
        # optimizer.step() updates each parameter from its current gradient (stored in the
        # parameter's .grad attribute) according to the optimizer's update rule.
        optimizer.step()
    avg_loss /= len(train_data)
    all_losses.append(avg_loss)
    print('epoch: %d done!\ntrain avg_loss: %g' % (i, avg_loss))
    # %g switches between %f- and %e-style output depending on magnitude and drops trailing zeros:
    # https://stackoverflow.com/questions/30580481/why-does-e-behave-different-than-g-in-format-strings
    return all_losses, model
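
def _zero_grad_accumulation_demo():
    # Small illustration (not part of the training code) of the comment in train_epoch:
    # repeated .backward() calls add into .grad until it is zeroed, which is what
    # optimizer.zero_grad() / model.zero_grad() do for every parameter.
    import torch  # assumes PyTorch is available
    w = torch.ones(1, requires_grad=True)
    (w * 2).sum().backward()
    (w * 2).sum().backward()
    assert w.grad.item() == 4.0  # gradients accumulated: 2 + 2
    w.grad.zero_()               # what zero_grad() does for this one tensor
    (w * 2).sum().backward()
    assert w.grad.item() == 2.0  # fresh gradient after zeroing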
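
def run_training(model, train_data, dev_data, loss_function, optimizer,
                 word_to_ix, num_epochs=10):
    # Minimal driver sketch showing how train_epoch and evaluate are chained per epoch.
    # This helper, its name, and num_epochs are illustrative assumptions, not part of the
    # original script; keeping the best model by dev loss is one common choice, shown here
    # with copy.deepcopy.
    import copy
    all_losses, all_losses_dev = [], []
    best_dev_loss, best_model = float('inf'), None
    for epoch in range(num_epochs):
        all_losses, model = train_epoch(model, train_data, loss_function,
                                        optimizer, word_to_ix, epoch, all_losses)
        dev_loss, _, all_losses_dev = evaluate(model, dev_data, loss_function,
                                               word_to_ix, all_losses_dev,
                                               name='dev')
        if dev_loss < best_dev_loss:  # keep a copy of the best model so far
            best_dev_loss = dev_loss
            best_model = copy.deepcopy(model)
    return best_model, all_losses, all_losses_dev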