continue
                    else:
                        loss += criterion(decoder_output[k], y[:, k, di])
                else:
                    loss += criterion(decoder_output[k], y[:, k, di])

            y = y.squeeze(0)
            # if int(topi[0]) == 0:
            #     break
        loss.backward()
        encoder_optimizer1.step()
        decoder_optimizer1.step()
        return loss.item()


encoder = densenet121()

pthfile = r'densenet121-a639ec97.pth'
pretrained_dict = torch.load(pthfile)

#pretrained_dict = torch.hub.load('pytorch/vision:v0.4.2', 'densenet121', pretrained=True)
#pretrained_dict.eval()

encoder_dict = encoder.state_dict()
pretrained_dict = {
    k: v
    for k, v in pretrained_dict.items() if k in encoder_dict
}
encoder_dict.update(pretrained_dict)
encoder.load_state_dict(encoder_dict)
def for_test(x_t):
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    h_mask_t = []
    w_mask_t = []
    encoder = densenet121()
    attn_decoder1 = AttnDecoderRNN(hidden_size, 256, dropout_p=0.5)

    encoder = torch.nn.DataParallel(encoder, device_ids=device_ids)
    attn_decoder1 = torch.nn.DataParallel(attn_decoder1, device_ids=device_ids)
    if torch.cuda.is_available():
        encoder = encoder.cuda()
        attn_decoder1 = attn_decoder1.cuda()

    encoder.load_state_dict(
        torch.load(
            '../model/encoder_lr0.00000_GN_te1_d05_SGD_bs6_mask_conv_bn_b_xavier.pkl',
            map_location=device))
    attn_decoder1.load_state_dict(
        torch.load(
            '../model/attn_decoder_lr0.00000_GN_te1_d05_SGD_bs6_mask_conv_bn_b_xavier.pkl',
            map_location=device))

    encoder.eval()
    attn_decoder1.eval()
    x_t = Variable(x_t)

    if torch.cuda.is_available():
        x_t = Variable(x_t.cuda())

    x_mask = torch.ones(x_t.size()[0],
                        x_t.size()[1],
                        x_t.size()[2],
                        x_t.size()[3])
    if torch.cuda.is_available():
        x_mask = x_mask.cuda()

    x_t = torch.cat((x_t, x_mask), dim=1)
    x_real_high = x_t.size()[2]
    x_real_width = x_t.size()[3]
    h_mask_t.append(int(x_real_high))
    w_mask_t.append(int(x_real_width))
    x_real = x_t[0][0].view(x_real_high, x_real_width)
    output_highfeature_t = encoder(x_t)

    x_mean_t = torch.mean(output_highfeature_t)
    x_mean_t = float(x_mean_t)
    output_area_t1 = output_highfeature_t.size()
    output_area_t = output_area_t1[3]
    dense_input = output_area_t1[2]

    decoder_input_t = torch.LongTensor([111] * batch_size_t)
    if torch.cuda.is_available():
        decoder_input_t = decoder_input_t.cuda()

    decoder_hidden_t = torch.randn(batch_size_t, 1, hidden_size)
    if torch.cuda.is_available():
        decoder_hidden_t = decoder_hidden_t.cuda()
    # nn.init.xavier_uniform_(decoder_hidden_t)
    decoder_hidden_t = decoder_hidden_t * x_mean_t
    decoder_hidden_t = torch.tanh(decoder_hidden_t)

    prediction = torch.zeros(batch_size_t, maxlen)
    # label = torch.zeros(batch_size_t,maxlen)
    prediction_sub = []
    label_sub = []

    decoder_attention_t = torch.zeros(batch_size_t, 1, dense_input,
                                      output_area_t)
    attention_sum_t = torch.zeros(batch_size_t, 1, dense_input, output_area_t)
    if torch.cuda.is_available():
        decoder_attention_t = decoder_attention_t.cuda()
        attention_sum_t = attention_sum_t.cuda()
    decoder_attention_t_cat = []

    for i in range(maxlen):
        decoder_output, decoder_hidden_t, decoder_attention_t, attention_sum_t = attn_decoder1(
            decoder_input_t, decoder_hidden_t, output_highfeature_t,
            output_area_t, attention_sum_t, decoder_attention_t, dense_input,
            batch_size_t, h_mask_t, w_mask_t, device_ids)

        decoder_attention_t_cat.append(
            decoder_attention_t[0].data.cpu().numpy())
        topv, topi = torch.max(decoder_output, 2)
        if torch.sum(topi) == 0:
            break
        decoder_input_t = topi
        decoder_input_t = decoder_input_t.view(batch_size_t)

        # prediction
        prediction[:, i] = decoder_input_t

    k = numpy.array(decoder_attention_t_cat)
    x_real = numpy.array(x_real.cpu().data)

    prediction = prediction[0]

    prediction_real = []
    for ir in range(len(prediction)):
        if int(prediction[ir]) == 0:
            break
        prediction_real.append(worddicts_r[int(prediction[ir])])
    prediction_real.append('<eol>')

    prediction_real_show = numpy.array(prediction_real)

    return k, prediction_real_show