Example #1
0
            0
        )
    )

    # Construct the network
    net.layer_opts['filter_shape'] = (3,1,8,8)
    net.content['l1'] = ConvLayer(net, net.content['input'])

    net.layer_opts['filter_shape'] = (3,3,1,1)
    net.content['l2'] = ConvLayer(net, net.content['l1'])

    net.layer_opts['softmax_norm_dim'] = 1
    net.content['l3']  = SoftmaxLayer(net, net.content['l2'])
    net.content['cost'] = CategoricalCrossEntropy(net, net.content['l3'])

    # Print the network architecture
    net.simpleprint()

    # Initialize learning rate for each updatable layer
    net.InitLR(0.5)

    # Create params list, grad list, momentum list for the theano function to update
    trainer.InitParams(net)
    trainer.opts['validation'] = False
    trainer.opts['test_emp'] = False
    # Update rule
    train_update_rule = trainer.InitUpdateRule(net)
    net.InitTrainFunction(train_update_rule, input, expected_output, ['l3'])
    main_loop = SGDRMainLoop(net)
    main_loop.run(net, trainer)
def train_Attend_224():
    trained_path = '../../data/trained_model/'
    cap_data_path = "../../data/mscoco/MSCOCO_processed/MSCOCO_224_capdata_train_%d.h5"
    img_data_path = "../../data/mscoco/MSCOCO_processed/MSCOCO_224_imgdata_train_%d.h5"
    val_cap_data_path = "../../data/mscoco/MSCOCO_processed/MSCOCO_224_capdata_val_%d.h5"
    val_img_data_path = "../../data/mscoco/MSCOCO_processed/MSCOCO_224_imgdata_val_%d.h5"
    fourth_cv_mv = "../../data/mscoco/MSCOCO_processed/4thconvo_meanvar.dat"
    [relu_mean, relu_std] = LoadList(fourth_cv_mv)
    relu_mean = theano.shared(relu_mean.astype(theano.config.floatX))
    relu_std = theano.shared(relu_std.astype(theano.config.floatX))

    # LSTM params
    n_word = 1004
    max_len = 40

    memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
    #print('Memory: %.2f avail before putting train data to shared' % (memory[0]/1024./1024/1024))

    #create net
    net = ShowTellNet()
    net = LoadVGG_Attend(net)
    net.name = "ShowAttendTellCOCO_Re14e-5_deep_out_context_dim_512"
    #net.name = "ShowAttendTellBugFind"
    snapshot_list = glob.glob(trained_path + net.name + '*.dat')

    num_big_epoch = 5000
    big_batch_size = np.asarray(2000, dtype=theano.config.floatX)
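    # Number of samples copied to GPU shared storage at a time; each h5
    # shard below is processed in chunks of this size.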

    if (len(snapshot_list) == 0):

        # Trainer params
        trainer = Trainer()
        trainer.opts['batch_size'] = 20
        trainer.opts['save'] = False
        trainer.opts['save_freq'] = 2
        #trainer.opts['num_sample'] = num_sample
        #trainer.opts['num_val_sample'] = num_val_sample
        trainer.opts['validation'] = False
        trainer.opts['num_epoch'] = 1
        trainer.opts['dzdw_norm_thres'] = 1
        trainer.opts['dzdb_norm_thres'] = 0.01

        net.layer_opts['updatable'] = True

        # Setting params
        net.net_opts['l1_learning_rate'] = np.asarray(0.005,
                                                      theano.config.floatX)
        net.reset_opts['min_lr'] = np.asarray(0.005,
                                              dtype=theano.config.floatX)
        net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']

        #Constructing LSTM_ATTEND network from image_feature_region step-by-step
        # step 1: reshape net.content['relu5_3'] to (N, 512, 196), later transposed to the (N, 196, 512) image_feature_region tensor
        # step 2: use the (N, 196, 512) image_feature_region tensor to compute h0, c0 - the initial state and memory of LSTM_ATTEND
        # step 3: construct LSTM_ATTEND from h0, c0 (kwargs) and the (N, 196, 512) image_feature_region tensor
        # step 4: construct DeepOutLayer from the h_t, z_t outputs of the LSTM_ATTEND layer
        # step 5: use the DeepOutLayer output (instead of h_t) to compute the output vector, then the negative log likelihood via the SoftmaxLogLoss layer
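        # Expected tensor shapes through the pipeline (a sketch, assuming
        # relu5_3 outputs (N, 512, 14, 14) feature maps for 224x224 input):
        #   relu5_3                     (N, 512, 14, 14)
        #   4th_convol_feature_region   (N, 512, 196)  after ReshapeLayer
        #                               (N, 196, 512)  after the transpose below
        #   average_feature_region      (N, 512)       mean over the 196 regions
        #   h0_initial / c0_initial     (1, N, 512)    via two stacked FCLayers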
        #pdb.set_trace()

        feature_shape = net.content['relu5_3'].output.shape
        new_shape = (feature_shape[0], feature_shape[1],
                     T.prod(feature_shape[2:]))
        #pdb.set_trace()
        #net.content['relu5_3_norm'] = NormLayer(net, net.content['relu5_3'], relu_mean, relu_std)

        net.content['4th_convol_feature_region'] = ReshapeLayer(
            net, net.content['relu5_3'], new_shape)

        # Adding dropout to VGG output
        net.content['4th_convol_feature_region'] = DropOut(
            net, net.content['4th_convol_feature_region'], 0.2)

        net.layer_opts['num_region'] = 196
        net.content['average_feature_region'] = AverageLayer(
            net, net.content['4th_convol_feature_region'], 2)

        # Done
        # avg_out = net.content['average_feature_region'].output.eval({net.input[0]:X.eval()})

        net.layer_opts['num_lstm_node'] = 512
        input_shape_h0 = (1, 512)
        output_shape_h0 = (1, net.layer_opts['num_lstm_node'])
        n_hidden_h0 = 512

        #GENERATING H0
        # net.content['h0_initial'] = MLPLayer(net, net.content['average_feature_region'],
        # 	input_shape = input_shape_h0, output_shape= output_shape_h0,n_hidden= n_hidden_h0)
        net.layer_opts['num_fc_node'] = n_hidden_h0
        net.content['h0_hidden_layer'] = FCLayer(
            net, net.content['average_feature_region'], input_shape_h0, T.tanh)

        net.layer_opts['num_fc_node'] = output_shape_h0[1]
        hidden_shape = (input_shape_h0[1], n_hidden_h0)
        net.content['h0_initial'] = FCLayer(net,
                                            net.content['h0_hidden_layer'],
                                            hidden_shape)

        out_shape = net.content['h0_initial'].output.shape
        net.content['h0_initial'].output = net.content[
            'h0_initial'].output.reshape((-1, out_shape[0], out_shape[1]))

        # h0_init_out =net.content['h0_initial'].output.eval({net.input[0]: X.eval()})

        #GENERATING C0
        # net.content['c0_initial'] = MLPLayer(net, net.content['average_feature_region'],
        # 	input_shape = input_shape_h0, output_shape = output_shape_h0,n_hidden= n_hidden_h0)
        net.layer_opts['num_fc_node'] = n_hidden_h0
        net.content['c0_hidden_layer'] = FCLayer(
            net, net.content['average_feature_region'], input_shape_h0, T.tanh)

        net.layer_opts['num_fc_node'] = output_shape_h0[1]
        net.content['c0_initial'] = FCLayer(net,
                                            net.content['c0_hidden_layer'],
                                            hidden_shape)

        out_shape = net.content['c0_initial'].output.shape
        net.content['c0_initial'].output = net.content[
            'c0_initial'].output.reshape((-1, out_shape[0], out_shape[1]))

        #Word Embedding Layer
        net.layer_opts['num_emb'] = 400
        net.content['we'] = WordEmbLayer(
            net, net.content['input_sen'],
            (trainer.opts['batch_size'], max_len - 1, n_word, 1))
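        # 'input_sen' carries one-hot captions of shape
        # (batch_size, max_len - 1, n_word, 1); 'we' maps every word to a
        # num_emb-dimensional embedding.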

        we_shape = net.content['we'].output.shape
        net.content['we'].output = net.content['we'].output.reshape(
            (we_shape[0], we_shape[1], we_shape[2], -1))  # -1 infers the last dim
        net.content['we_dropout'] = DropOut(net, net.content['we'], 0.1)

        net.layer_opts['num_lstm_node'] = 512
        net.layer_opts['context_dim'] = 512
        net.layer_opts['num_dimension_feature'] = 512
        net.layer_opts['num_region'] = 196

        net.content['4th_convol_feature_region'].output = T.transpose(
            net.content['4th_convol_feature_region'].output, (0, 2, 1))

        # X = np.zeros((2,3,224,224),dtype=np.float32)
        # Y = np.zeros((2,max_len,n_word,1), dtype=np.float32)
        # im_f_feature = net.content['4th_convol_feature_region'].output.eval({
        #     net.input[0]:X
        #     })
        # we_out = net.content['we'].output.eval({net.input[1]:Y})
        # pdb.set_trace()
        net.content['lstm_attend'] = LSTM_Attend(
            net,
            net.content['we_dropout'],
            (trainer.opts['batch_size'], max_len - 1,
             net.layer_opts['num_emb'], 1),
            net.content['4th_convol_feature_region'].output,
            initial_h0=net.content['h0_initial'].output,
            initial_c0=net.content['c0_initial'].output)
        #we_out = we_out, f_region=f_region)

        net.layer_opts['num_deep_out_node'] = 400  # same size as the word embedding
        net.layer_opts["n_word"] = n_word
        net.content['deep_out_layer'] = DeepOutputLayer(
            net, net.content['we_dropout'], net.content['lstm_attend'])

        net.layer_opts['num_affine_node'] = n_word
        net.layer_opts['l2_term'] = 0.000014
        net.content['l2'] = L2WeightDecay(net, net.content['deep_out_layer'])

        net.layer_opts['softmax_norm_dim'] = 2
        net.content['smloss'] = SoftmaxLogLoss(net,
                                               net.content['deep_out_layer'])

        net.content['cost'] = AggregateSumLoss(
            [net.content['l2'], net.content['smloss']])
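        # Total training objective: the softmax log loss plus the L2
        # weight-decay term, summed by AggregateSumLoss.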

        net.InitLR(0.2)
        memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
        print('Memory: %.2f avail before initialize params' %
              (memory[0] / 1024. / 1024 / 1024))

        trainer.InitParams(net)
        print("Done init params")
        train_update_rule = trainer.InitUpdateRule(net)
        print("Done init update rule")
        additional_output = ['deep_out_layer', 'l2']

        # net.InitValFunction([val_X, val_Y[:,:-1,:,:]], val_Y[:,1:,:,:],
        # 	additional_output, val_weight, net.content['lstm_attend'].output_z)
        e = 0
        last_big_e = 0
    else:
        snapshot_list = sorted(snapshot_list)
        print('Loading latest snapshot at %s' % snapshot_list[-1])
        e = 0
        [net, trainer, last_big_e] = LoadList(snapshot_list[-1])

        net.layer_opts['l2_term'] = 0.000014
        net.content['l2'] = L2WeightDecay(net, net.content['deep_out_layer'])

        net.content['cost'] = AggregateSumLoss(
            [net.content['l2'], net.content['smloss']])
        net.InitLR(0.2)
        memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
        print('Memory: %.2f avail before initialize params' %
              (memory[0] / 1024. / 1024 / 1024))

        trainer.InitParams(net)
        print("Done init params")
        train_update_rule = trainer.InitUpdateRule(net)
        print("Done init update rule")
        additional_output = ['deep_out_layer', 'l2']
    for big_e in range(last_big_e + 1, num_big_epoch):
        # Load train data
        h_list = list(range(11))  # the 11 training h5 shards
        np.random.shuffle(h_list)
        for h in h_list:
            #break
            #if (not ('train_X' in locals())):
            train_X = LoadH5(img_data_path % h)
            dict_key = list(train_X.keys())[0]
            train_X = train_X[dict_key]
            num_sample = train_X.shape[0]
            # train_Y has the shape of (num_sample, 5, max_len, n_word, 1)
            train_Y = LoadH5(cap_data_path % h)
            dict_key = list(train_Y.keys())[0]
            train_Y = train_Y[dict_key]
            Y_shape = train_Y.shape

            # For debugging
            #train_X = train_X[0:100,:,:,:]
            #train_Y = train_Y[0:100,:,:,:,:]
            #num_sample = 100

            #train_Y = train_Y.reshape(5*num_sample, Y_shape[2], Y_shape[3], 1)
            #random_caption_idx = net.net_opts['rng'].randint(0,5,num_sample) + np.asarray([i*5 for i in range(num_sample)])

            # Each image has 5 captions; the random pick is disabled here and
            # caption 0 is used for every image.
            #train_Y = train_Y[random_caption_idx, :, :, :]
            train_Y = train_Y[:, 0, :, :, :]
            train_Y = train_Y.astype(theano.config.floatX)

            # Create weight from train_Y
            train_weight = np.copy(train_Y)
            train_weight = train_weight[:, 1:, :, :]
            weight_shape = train_weight.shape
            train_weight = (train_weight[:, :, 0, 0] == 0).reshape(
                weight_shape[0], weight_shape[1], 1, 1)
            train_weight = np.repeat(train_weight, weight_shape[2], 2)
            train_weight = np.repeat(train_weight, weight_shape[3], 3)
            train_weight = train_weight.astype(theano.config.floatX)
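            # A small illustration of the mask above (a sketch, assuming word
            # index 0 is the padding/NULL token):
            #   y = np.zeros((1, 3, n_word, 1), dtype=np.float32)
            #   y[0, 0, 0, 0] = 1   # t=0: padding
            #   y[0, 1, 7, 0] = 1   # t=1: a real word
            #   y[0, 2, 0, 0] = 1   # t=2: padding
            #   (y[:, :, 0, 0] == 0) -> [[False, True, False]]
            # so only real words contribute to the loss; np.repeat then
            # broadcasts the mask over the n_word and singleton dimensions.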

            num_big_batch_iteration = int(np.ceil(
                np.asarray(num_sample, dtype=theano.config.floatX) /
                big_batch_size))

            for j in range(0, num_big_batch_iteration):
                big_batch_range = np.arange(j * big_batch_size,
                                            (j + 1) * big_batch_size)

                if ((j + 1) * big_batch_size > num_sample):
                    big_batch_range = np.arange(j * big_batch_size, num_sample)

                trainer.opts['num_sample'] = big_batch_range.shape[0]
                big_batch_range = np.asarray(big_batch_range, dtype=np.uint32)
                np.random.shuffle(big_batch_range)
                memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info(
                )
                print(
                    'Memory: %.2f avail before putting train data to shared' %
                    (memory[0] / 1024. / 1024 / 1024))

                train_Xj = theano.shared(train_X[big_batch_range, :, :, :])
                train_Yj = theano.shared(train_Y[big_batch_range, :, :, :])
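                # Fingerprint of the first caption in this chunk: argmax word
                # indices weighted by 1.3**t and summed. Comparing the printed
                # value across runs shows whether shuffling/chunking changed.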
                hash_weight = np.asarray([1.3**t for t in range(max_len)])
                hash_value = np.sum(
                    np.argmax(train_Yj[0, :, :, 0].eval(), axis=1) *
                    hash_weight)
                print(hash_value)
                #pdb.set_trace()

                train_weightj = theano.shared(
                    train_weight[big_batch_range, :, :, :])
                memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info(
                )
                print('Memory: %.2f avail after' %
                      (memory[0] / 1024. / 1024 / 1024))

                #val_Xtest = train_Xj.eval()[0:2,:,:,:]
                #val_Ytest = train_Yj.eval()[0:2,:-1,:,:]
                #z_m1_dummy = np.zeros((1, 2, net.content['lstm_attend'].Z_shape[0]), dtype=theano.config.floatX)
                #pdb.set_trace()
                #relu5_3norm = net.content['relu5_3_norm'].output.eval({net.input[0]: val_Xtest})
                #relu5_3 = net.content['relu5_3'].output.eval({net.input[0]: val_Xtest})
                #h_out = net.content['lstm_attend'].output.eval({
                #    net.input[0]: val_Xtest,
                #    net.input[1]: val_Ytest,
                #    #net.content['lstm_attend'].z_m1_sym
                #    })
                #z_out = net.content['lstm_attend'].output_z.eval({
                #    net.input[0]: val_Xtest,
                #    net.input[1]: val_Ytest,
                #    #net.content['lstm_attend'].z_m1_sym
                #    })
                #c_out = net.content['lstm_attend'].output_c.eval({
                #    net.input[0]: val_Xtest,
                #    net.input[1]: val_Ytest,
                #    #net.content['lstm_attend'].z_m1_sym
                #    })
                #deep_out0 = net.content['deep_out_layer'].output.eval({ \
                #    net.input[0]: val_Xtest, \
                #    net.input[1]: val_Ytest, \
                #    net.content['lstm_attend'].z_m1_sym: z_m1_dummy \
                #})

                #fourth_cv_out = net.content['4th_convol_feature_region'].output.eval({\
                #        net.input[0]: val_Xtest, \
                #})

                #avg_feature = net.content['average_feature_region'].output.eval({\
                #        net.input[0]: val_Xtest, \
                #})
                #
                #h0_init = net.content['h0_initial'].output.eval({\
                #        net.input[0]: val_Xtest
                #        })

                #img_out = net.content['lstm_attend'].img_out.eval({\
                #        net.input[0]: val_Xtest,\
                #        })
                #pdb.set_trace()
                net.InitTrainFunction(train_update_rule,
                                      [train_Xj, train_Yj[:, :-1, :, :]],
                                      train_Yj[:, 1:, :, :], additional_output,
                                      train_weightj)
                print("Done init train function")

                print("start training")
                trainer.opts['validation'] = False
                trainer.opts['train'] = True
                main_loop = SGDRMainLoop(net, trained_path)
                main_loop.run(net, trainer, e)

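                # Release the GPU-resident chunks and the compiled train
                # function before loading the next chunk.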
                del train_Xj
                del train_Yj
                del train_weightj
                del net.train_function

                train_Xj = None
                train_Yj = None
                train_weightj = None
                net.train_function = None
                print('Finished iteration %d, h5 %d, of big epoch %d' %
                      (j, h, big_e))
                plt.figure()
                plt.plot(trainer.all_i[-1000::5])
                plt.savefig('SAT14e-5_all_i_last1000.png')

                plt.close()

                plt.figure()
                plt.plot(trainer.all_i)
                plt.savefig('SAT14e-5_all_i.png')
                plt.close()

            if (big_e % trainer.opts['save_freq'] == 0):
                net1 = net.NNCopy()
                SaveList([net1, trainer, big_e],
                         '../../data/trained_model/%s_e-%05d.dat' %
                         (net.name, big_e))

        # Validation frequency matches the save frequency
        if (big_e % trainer.opts['save_freq'] == 0):
            for h in range(2):  # Max is 6
                val_X = LoadH5(val_img_data_path % h)
                dict_key = list(val_X.keys())[0]
                val_X = val_X[dict_key]
                num_val_sample = val_X.shape[0]

                # val_Y has the shape of (num_val_sample, 5, max_len, n_word, 1)
                val_Y = LoadH5(val_cap_data_path % h)

                dict_key = list(val_Y.keys())[0]
                val_Y = val_Y[dict_key]
                Y_shape = val_Y.shape
                val_Y = val_Y.reshape(5 * num_val_sample, Y_shape[2],
                                      Y_shape[3], 1)

                random_caption_idx = net.net_opts['rng'].randint(
                    0, 5, num_val_sample) + np.asarray(
                        [i * 5 for i in range(num_val_sample)])
                # Each image has 5 captions, pick one at random
                val_Y = val_Y[random_caption_idx, :, :, :]
                val_Y = val_Y.astype(theano.config.floatX)
                # Create weight from val_Y
                val_weight = np.copy(val_Y)
                val_weight = val_weight[:, 1:, :, :]
                weight_shape = val_weight.shape
                val_weight = (val_weight[:, :, 0, 0] == 0).reshape(
                    weight_shape[0], weight_shape[1], 1, 1)
                val_weight = np.repeat(val_weight, weight_shape[2], 2)
                val_weight = np.repeat(val_weight, weight_shape[3], 3)
                val_weight = val_weight.astype(theano.config.floatX)

                num_big_batch_iteration = int(np.ceil(
                    np.asarray(num_val_sample, dtype=theano.config.floatX) /
                    big_batch_size))

                for j in range(0, num_big_batch_iteration):
                    big_batch_range = np.arange(j * big_batch_size,
                                                (j + 1) * big_batch_size)

                    if ((j + 1) * big_batch_size > num_val_sample):
                        big_batch_range = np.arange(j * big_batch_size,
                                                    num_val_sample)

                    trainer.opts['num_val_sample'] = big_batch_range.shape[0]
                    big_batch_range = np.asarray(big_batch_range,
                                                 dtype=np.uint32)
                    memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info(
                    )
                    print(
                        'Memory: %.2f avail before putting val data to shared'
                        % (memory[0] / 1024. / 1024 / 1024))
                    val_Xj = theano.shared(val_X[big_batch_range, :, :, :])
                    val_Yj = theano.shared(val_Y[big_batch_range, :, :, :])

                    hash_weight = np.asarray([1.3**t for t in range(max_len)])
                    hash_value = np.sum(
                        np.argmax(val_Yj[0, :, :, 0].eval(), axis=1) *
                        hash_weight)
                    print(hash_value)
                    val_weightj = theano.shared(
                        val_weight[big_batch_range, :, :, :])

                    memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info(
                    )
                    print('Memory: %.2f avail after' %
                          (memory[0] / 1024. / 1024 / 1024))

                    net.InitValFunction([val_Xj, val_Yj[:, :-1, :, :]],
                                        val_Yj[:, 1:, :, :], additional_output,
                                        val_weightj)
                    print("Done init val function")

                    print("start validating")
                    trainer.opts['validation'] = True
                    trainer.opts['train'] = False
                    main_loop = SGDRMainLoop(net, trained_path)
                    main_loop.run(net, trainer, e)

                    del val_Xj
                    del val_Yj
                    del val_weightj
                    del net.val_function

                    val_Xj = None
                    val_Yj = None
                    val_weightj = None
                    net.val_function = None
                    print(
                        'Finished validating at iteration %d, h5 %d, of big epoch %d'
                        % (j, h, big_e))
# Show-Attend-Tell training on the Flickr 224x224 data (a second, distinct
# entry point; the MSCOCO variant above keeps the train_Attend_224 name).
def train_Attend_224_flickr():
    trained_path = '../../data/trained_model/'
    # LSTM params
    n_word = 2000
    max_len = 40

    train_X, train_Y, train_weight, val_X, val_Y, val_weight = CreateDataFlick224(
        n_word)

    #pdb.set_trace()
    #create net
    net = ShowTellNet()
    net.name = "ShowAttendTell"
    snapshot_list = glob.glob(trained_path + net.name + '*.dat')

    X = train_X[0:2, :, :, :]
    Y = train_Y[0:2, :, :, :]
    input_Y = train_Y[:, :-1, :, :]
    expected_Y = train_Y[:, 1:, :, :]
    weight = train_weight[0:2, :, :, :]

    num_sample = 6000
    num_big_epoch = 100
    big_batch_size = np.asarray(2000, dtype=theano.config.floatX)
    num_big_batch_iteration = int(np.ceil(
        np.asarray(num_sample, dtype=theano.config.floatX) / big_batch_size))

    if (len(snapshot_list) == 0):

        # Trainer params
        trainer = Trainer()
        trainer.opts['batch_size'] = 20
        trainer.opts['save'] = False
        trainer.opts['save_freq'] = 20
        trainer.opts['num_sample'] = 2000
        trainer.opts['num_val_sample'] = 1000
        trainer.opts['validation'] = False
        trainer.opts['num_epoch'] = 1
        trainer.opts['dzdw_norm_thres'] = 1
        trainer.opts['dzdb_norm_thres'] = 0.01

        net = LoadVGG_Attend(net)
        net.layer_opts['updatable'] = True

        # Setting params
        net.net_opts['l1_learning_rate'] = np.asarray(0.005,
                                                      theano.config.floatX)
        net.reset_opts['min_lr'] = np.asarray(0.005,
                                              dtype=theano.config.floatX)
        net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']

        #Constructing LSTM_ATTEND network from image_feature_region step-by-step
        # step 1: reshape net.content['relu5_3'] to (N, 512, 196), later transposed to the (N, 196, 512) image_feature_region tensor
        # step 2: use the (N, 196, 512) image_feature_region tensor to compute h0, c0 - the initial state and memory of LSTM_ATTEND
        # step 3: construct LSTM_ATTEND from h0, c0 (kwargs) and the (N, 196, 512) image_feature_region tensor
        # step 4: construct DeepOutLayer from the h_t, z_t outputs of the LSTM_ATTEND layer
        # step 5: use the DeepOutLayer output (instead of h_t) to compute the output vector, then the negative log likelihood via the SoftmaxLogLoss layer

        feature_shape = net.content['relu5_3'].output.shape
        new_shape = (feature_shape[0], feature_shape[1],
                     T.prod(feature_shape[2:]))
        net.content['4th_convol_feature_region'] = ReshapeLayer(
            net, net.content['relu5_3'],
            new_shape)  #net.content['pool4'].output.reshape()

        # Done
        # pdb.set_trace()
        # convol_out = net.content['4th_convol_feature_region'].output.eval({net.input[0]: X.eval()})
        # pdb.set_trace()

        net.layer_opts['num_region'] = 196
        # pdb.set_trace()
        net.content['average_feature_region'] = AverageLayer(
            net, net.content['4th_convol_feature_region'], 2)

        # Done
        # avg_out = net.content['average_feature_region'].output.eval({net.input[0]:X.eval()})

        net.layer_opts['num_lstm_node'] = 512
        input_shape_h0 = (1, 512)
        output_shape_h0 = (1, net.layer_opts['num_lstm_node'])
        n_hidden_h0 = 512

        #GENERATING H0
        # net.content['h0_initial'] = MLPLayer(net, net.content['average_feature_region'],
        # 	input_shape = input_shape_h0, output_shape= output_shape_h0,n_hidden= n_hidden_h0)
        net.layer_opts['num_fc_node'] = n_hidden_h0
        net.content['h0_hidden_layer'] = FCLayer(
            net, net.content['average_feature_region'], input_shape_h0, T.tanh)

        net.layer_opts['num_fc_node'] = output_shape_h0[1]
        hidden_shape = (input_shape_h0[1], n_hidden_h0)
        net.content['h0_initial'] = FCLayer(net,
                                            net.content['h0_hidden_layer'],
                                            hidden_shape)

        out_shape = net.content['h0_initial'].output.shape
        net.content['h0_initial'].output = net.content[
            'h0_initial'].output.reshape((-1, out_shape[0], out_shape[1]))

        # pdb.set_trace()
        # h0_init_out =net.content['h0_initial'].output.eval({net.input[0]: X.eval()})
        # pdb.set_trace()

        #GENERATING C0
        # net.content['c0_initial'] = MLPLayer(net, net.content['average_feature_region'],
        # 	input_shape = input_shape_h0, output_shape = output_shape_h0,n_hidden= n_hidden_h0)
        net.layer_opts['num_fc_node'] = n_hidden_h0
        net.content['c0_hidden_layer'] = FCLayer(
            net, net.content['average_feature_region'], input_shape_h0, T.tanh)

        net.layer_opts['num_fc_node'] = output_shape_h0[1]
        net.content['c0_initial'] = FCLayer(net,
                                            net.content['c0_hidden_layer'],
                                            hidden_shape)

        out_shape = net.content['c0_initial'].output.shape
        net.content['c0_initial'].output = net.content[
            'c0_initial'].output.reshape((-1, out_shape[0], out_shape[1]))

        #Word Embedding Layer
        net.layer_opts['num_emb'] = 512
        net.content['we'] = WordEmbLayer(
            net, net.content['input_sen'],
            (trainer.opts['batch_size'], max_len - 1, n_word, 1))

        # pdb.set_trace()
        # we_out = net.content['we'].output.eval({net.input[1]: Y.eval()})
        # pdb.set_trace()

        net.layer_opts['num_lstm_node'] = 512
        net.layer_opts['context_dim'] = 1024
        net.layer_opts['num_dimension_feature'] = 512
        net.layer_opts['num_region'] = 196

        net.content['4th_convol_feature_region'].output = T.transpose(
            net.content['4th_convol_feature_region'].output, (0, 2, 1))

        net.content['lstm_attend'] = LSTM_Attend(
            net,
            net.content['we'], (trainer.opts['batch_size'], max_len - 1,
                                net.layer_opts['num_emb'], 1),
            net.content['4th_convol_feature_region'].output,
            initial_h0=net.content['h0_initial'].output,
            initial_c0=net.content['c0_initial'].output)

        # pdb.set_trace()

        # lstm_out = net.content['lstm_attend'].output.eval({net.input[0]: X.eval(),
        # 	net.input[1]:Y.eval(),
        # 	net.content['lstm_attend'].z_m1_sym: np.zeros((1, 2, net.layer_opts['num_dimension_feature']), dtype=theano.config.floatX)})
        # print(lstm_out[0].shape)
        # print(lstm_out[1].shape)
        # # print(lstm_out[2].shape)
        # pdb.set_trace()

        net.layer_opts['num_deep_out_node'] = 512  #300
        net.layer_opts["n_word"] = n_word
        net.content['deep_out_layer'] = DeepOutputLayer(
            net, net.content['we'], net.content['lstm_attend'])

        # net.layer_opts['num_affine_node'] = n_word
        # net.content['deep_out_layer'] = AffineLayer(net, net.content['lstm_attend'],
        #                                        (trainer.opts['batch_size'],
        #                                         max_len - 1,
        #                                         net.layer_opts['num_lstm_node'],
        #                                         1))

        # pdb.set_trace()
        # deep_out = net.content['deep_out_layer'].output.eval({net.input[0]: X.eval(),
        # 	net.input[1]: Y.eval(),
        # 	net.content['lstm_attend'].z_m1_sym: np.zeros((1, 2, net.layer_opts['num_dimension_feature']), dtype=theano.config.floatX)})

        net.layer_opts['l2_term'] = 0.125
        net.content['l2'] = L2WeightDecay(net, net.content['deep_out_layer'])

        net.layer_opts['softmax_norm_dim'] = 2
        net.content['smloss'] = SoftmaxLogLoss(net,
                                               net.content['deep_out_layer'])

        net.content['cost'] = AggregateSumLoss(
            [net.content['l2'], net.content['smloss']])

        # pdb.set_trace()
        # print(X.eval().shape)
        # print(Y.eval().shape)
        # print(weight.eval().shape)

        # logloss_out = net.content['cost'].output.eval({net.input[0]: X.eval(),
        # 	net.input[1]: input_Y.eval(),
        # 	net.output[0]: expected_Y.eval(),
        # 	net.weight[0]: weight.eval(),
        # 	net.content['lstm_attend'].z_m1_sym: np.zeros((1, 2, net.layer_opts['num_dimension_feature']), dtype=theano.config.floatX)})

        # print("Done creating layer")
        # pdb.set_trace()

        net.InitLR(0.2)
        trainer.InitParams(net)
        print("Done init params")
        train_update_rule = trainer.InitUpdateRule(net)
        print("Done init update rule")
        additional_output = [
            'input_sen', 'deep_out_layer', 'we', 'lstm_attend'
        ]

        # net.InitValFunction([val_X, val_Y[:,:-1,:,:]], val_Y[:,1:,:,:],
        # 	additional_output, val_weight, net.content['lstm_attend'].output_z)
        e = 0
        last_big_e = 0
    else:
        snapshot_list = sorted(snapshot_list)
        print('Loading latest snapshot at %s' % snapshot_list[-1])
        # Resume state and re-create the training machinery, mirroring the
        # snapshot branch of train_Attend_224 above.
        e = 0
        [net, trainer, last_big_e] = LoadList(snapshot_list[-1])
        net.InitLR(0.2)
        trainer.InitParams(net)
        print("Done init params")
        train_update_rule = trainer.InitUpdateRule(net)
        print("Done init update rule")
        additional_output = [
            'input_sen', 'deep_out_layer', 'we', 'lstm_attend'
        ]

    for big_e in range(last_big_e, num_big_epoch):
        for j in range(0, num_big_batch_iteration):
            big_batch_range = np.arange(j * big_batch_size,
                                        (j + 1) * big_batch_size)
            if ((j + 1) * big_batch_size > num_sample):
                big_batch_range = np.arange(j * big_batch_size, num_sample)
            trainer.opts['num_sample'] = big_batch_range.shape[0]
            big_batch_range = np.asarray(big_batch_range, dtype=np.uint32)
            memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
            print('Memory: %.2f avail before putting train data to shared' %
                  (memory[0] / 1024. / 1024 / 1024))
            train_Xj = theano.shared(train_X[big_batch_range, :, :, :])
            train_Yj = theano.shared(train_Y[big_batch_range, :, :, :])
            train_weightj = theano.shared(
                train_weight[big_batch_range, :, :, :])
            memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
            print('Memory: %.2f avail after' %
                  (memory[0] / 1024. / 1024 / 1024))

            net.InitTrainFunction(train_update_rule,
                                  [train_Xj, train_Yj[:, :-1, :, :]],
                                  train_Yj[:, 1:, :, :], additional_output,
                                  train_weightj, net.weight[0])
            print("Done init train function")

            # net.InitValFunction([val_X, val_Y[:,:-1,:,:]], val_Y[:,1:,:,:], additional_output, val_weight)
            # print("Done init val function")

            print("start training")
            trainer.opts['validation'] = False
            trainer.opts['train'] = True
            main_loop = SGDRMainLoop(net, trained_path)
            main_loop.run(net, trainer, e)

            train_Xj = None
            train_Yj = None
            train_weightj = None
            net.train_function = None
            print('Finished iteration %d of big epoch %d' % (j, big_e))
def train():
    trained_path = '../../data/trained_model/'
    # LSTM params
    n_word = 2000
    max_len = 40

    # Create net
    net = ShowTellNet()
    net.name = 'ShowTellCheck'
    #net.name = 'abc'
    # Find latest snapshot
    snapshot_list = glob.glob(trained_path + net.name + '*.dat')

    if (len(snapshot_list) == 0):
        train_X, train_Y, train_weight, val_X, val_Y, val_weight = CreateData(
            n_word)
        #train_X = theano.shared(train_X.eval()[0:200,:,:,:])
        #train_Y = theano.shared(train_Y.eval()[0:200,:,:,:])
        # Trainer params
        trainer = Trainer()
        trainer.opts['batch_size'] = 32
        trainer.opts['save'] = False
        trainer.opts['save_freq'] = 20
        trainer.opts['num_sample'] = 200
        trainer.opts['num_val_sample'] = 1000
        trainer.opts['validation'] = False
        trainer.opts['num_epoch'] = 10000
        trainer.opts['dzdw_norm_thres'] = 1
        trainer.opts['dzdb_norm_thres'] = 0.01
        # Load VGG
        net = LoadVGG(net)
        net.layer_opts['updatable'] = True

        # Setting params
        net.net_opts['l1_learning_rate'] = np.asarray(0.005,
                                                      theano.config.floatX)
        net.reset_opts['min_lr'] = np.asarray(0.005,
                                              dtype=theano.config.floatX)
        net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']

        # Construct the network

        net.layer_opts['num_fc_node'] = 512
        # net.layer_opts['num_fc_node'] = 128
        # net.content['fc6'] = FCLayer(net, net.content['pool5'], (1, 512, 2, 2))
        net.content['fc6'] = FCLayer(net, net.content['pool5'], (1, 512, 4, 4))

        net.content['fc6_swap'] = SwapDim(net, net.content['fc6'], 1, 2)

        net.layer_opts['num_emb'] = 512
        # net.layer_opts['num_emb'] = 128
        net.content['we'] = WordEmbLayer(
            net, net.content['input_sen'],
            (trainer.opts['batch_size'], max_len - 1, n_word, 1))

        net.content['cat'] = Concat(net, net.content['fc6_swap'],
                                    net.content['we'], 1)

        net.layer_opts['num_lstm_node'] = n_word
        net.content['lstm'] = LSTM(net, net.content['cat'],
                                   (trainer.opts['batch_size'], max_len - 1,
                                    net.layer_opts['num_emb'], 1))
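        # The concatenated sequence is (batch, 1 + (max_len - 1), num_emb, 1):
        # the image embedding takes timestep 0 and the shifted caption the
        # rest, which is why the first LSTM output is dropped below ('lstm_r').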

        ################
        # TESTING LSTM #
        ################

        # h_dummy = np.zeros((1, 1, net.layer_opts['num_lstm_node']), dtype=theano.config.floatX)
        # c_dummy = np.zeros((1, 1, net.layer_opts['num_lstm_node']), dtype=theano.config.floatX)
        # h_dummy2 = np.zeros((1, 2, net.layer_opts['num_lstm_node']), dtype=theano.config.floatX)
        # c_dummy2 = np.zeros((1, 2, net.layer_opts['num_lstm_node']), dtype=theano.config.floatX)
        h_dummy5 = np.zeros((1, 5, net.layer_opts['num_lstm_node']),
                            dtype=theano.config.floatX)
        c_dummy5 = np.zeros((1, 5, net.layer_opts['num_lstm_node']),
                            dtype=theano.config.floatX)

        # cat = net.content['cat'].output.eval({net.input[0]:X , net.input[1]: Y})
        # cat = np.reshape(cat, (2, 41, 128))
        # cat0 = np.reshape(cat[1,0,:], (1,1,128))
        # cat1 = np.reshape(cat[1,1,:], (1,1,128))
        # cat2 = np.reshape(cat[1,2,:], (1,1,128))
        #
        # x0 = cat[0,0,:].reshape(1,1,128)
        # x1 = cat[0,1,:].reshape(1,1,128)
        # x2 = cat[0,2,:].reshape(1,1,128)

        # Wi = net.content['lstm'].W['i'].eval()
        # Wf = net.content['lstm'].W['f'].eval()
        # Wc = net.content['lstm'].W['c'].eval()
        # Wo = net.content['lstm'].W['o'].eval()
        #
        # Ui = net.content['lstm'].U['i'].eval()
        # Uf = net.content['lstm'].U['f'].eval()
        # Uc = net.content['lstm'].U['c'].eval()
        # Uo = net.content['lstm'].U['o'].eval()
        #
        # bi = net.content['lstm'].b['i'].eval()
        # bf = net.content['lstm'].b['f'].eval()
        # bc = net.content['lstm'].b['c'].eval()
        # bo = net.content['lstm'].b['o'].eval()
        # hm1 = h_dummy
        # cm1 = c_dummy
        #
        # # First iteration
        # i0 = npsigmoid(np.dot(x0, Wi) + np.dot(hm1, Ui) + bi)
        # f0 = npsigmoid(np.dot(x0, Wf) + np.dot(hm1, Uf) + bf)
        # o0 = npsigmoid(np.dot(x0, Wo) + np.dot(hm1, Uo) + bo)
        # c0 = f0*cm1 + i0*np.tanh(np.dot(x0, Wc) + np.dot(hm1, Uc) + bc)
        # h0 = o0*c0
        #
        # # 2nd iteration
        # i1 = npsigmoid(np.dot(x1, Wi) + np.dot(h0, Ui) + bi)
        # f1 = npsigmoid(np.dot(x1, Wf) + np.dot(h0, Uf) + bf)
        # o1 = npsigmoid(np.dot(x1, Wo) + np.dot(h0, Uo) + bo)
        # c1 = f1 * c0 + i1 * np.tanh(np.dot(x1, Wc) + np.dot(h0, Uc) + bc)
        # h1 = o1 * c1
        #
        # i2 = npsigmoid(np.dot(x2, Wi) + np.dot(h1, Ui) + bi)
        # f2 = npsigmoid(np.dot(x2, Wf) + np.dot(h1, Uf) + bf)
        # o2 = npsigmoid(np.dot(x2, Wo) + np.dot(h1, Uo) + bo)
        # c2 = f2 * c1 + i2 * np.tanh(np.dot(x2, Wc) + np.dot(h1, Uc) + bc)
        # h3 = o2 * c2
        # bp = 1
        #
        # h1, c1 = onestep(cat0, h_dummy, c_dummy, net.content['lstm'].W['i'], net.content['lstm'].W['f'],
        #                  net.content['lstm'].W['c'], net.content['lstm'].W['o'],
        #                  net.content['lstm'].U['i'], net.content['lstm'].U['f'], net.content['lstm'].U['c'],
        #                  net.content['lstm'].U['o'],
        #                  net.content['lstm'].b['i'], net.content['lstm'].b['f'], net.content['lstm'].b['c'],
        #                  net.content['lstm'].b['o'])
        #
        # h1 = h1.eval()
        # c1 = c1.eval()
        #
        # h2, c2 = onestep(cat1, h1, c1, net.content['lstm'].W['i'], net.content['lstm'].W['f'],
        #                  net.content['lstm'].W['c'], net.content['lstm'].W['o'],
        #                  net.content['lstm'].U['i'], net.content['lstm'].U['f'], net.content['lstm'].U['c'],
        #                  net.content['lstm'].U['o'],
        #                  net.content['lstm'].b['i'], net.content['lstm'].b['f'], net.content['lstm'].b['c'],
        #                  net.content['lstm'].b['o'])
        #
        # h2 = h2.eval()
        # c2 = c2.eval()
        #
        # h3, c3 = onestep(cat2, h2, c2, net.content['lstm'].W['i'], net.content['lstm'].W['f'],
        #                  net.content['lstm'].W['c'], net.content['lstm'].W['o'],
        #                  net.content['lstm'].U['i'], net.content['lstm'].U['f'], net.content['lstm'].U['c'],
        #                  net.content['lstm'].U['o'],
        #                  net.content['lstm'].b['i'], net.content['lstm'].b['f'], net.content['lstm'].b['c'],
        #                  net.content['lstm'].b['o'])
        #
        # h3 = h3.eval()
        # c3 = c3.eval()
        #
        # lstm = net.content['lstm'].output.eval({net.input[0]:X, net.input[1]:Y,
        #                                         net.content['lstm'].h_m1_sym: h_dummy2,
        #                                         net.content['lstm'].c_m1_sym: c_dummy2})

        # Remove the first 'word': it corresponds to the image-embedding
        # timestep and is not part of the actual sentence.
        net.content['lstm_r'] = LSTMRemove(net, net.content['lstm'], 1)
        #a = net.content['lstm_r'].output.eval({net.input[1]: train_Y[0:5,0:-1,:,:].eval(),
        #    net.input[0]: train_X[0:5,:,:,:].eval(),
        #        net.content['lstm'].h_m1_sym: h_dummy5,
        #        net.content['lstm'].c_m1_sym: c_dummy5
        #        })
        #print('lstm_r shape:')
        #print(a.shape)
        net.layer_opts['softmax_norm_dim'] = 2
        net.content['softmax'] = SoftmaxLayer(net, net.content['lstm_r'])

        net.content['cost'] = CategoricalCrossEntropy(net,
                                                      net.content['softmax'])

        net.InitLR(0.2)
        trainer.InitParams(net)
        train_update_rule = trainer.InitUpdateRule(net)
        additional_output = ['input_sen', 'lstm_r', 'softmax']
        net.InitTrainFunction(train_update_rule,
                              [train_X, train_Y[:, :-1, :, :]],
                              train_Y[:, 1:, :, :], additional_output,
                              train_weight)
        net.InitValFunction([val_X, val_Y[:, :-1, :, :]], val_Y[:, 1:, :, :],
                            additional_output, val_weight)
        e = 0
    else:
        snapshot_list = sorted(snapshot_list)
        print('Loading latest snapshot at %s' % snapshot_list[-1])
        net, trainer, e = LoadList(snapshot_list[-1])
        trainer.opts['save_freq'] = 10
        print('Finished loading snapshot')

        train_X, train_Y, train_weight, val_X, val_Y, val_weight = CreateData(
            n_word)
        net.net_opts['l1_learning_rate'] = np.asarray(0.00008,
                                                      theano.config.floatX)
        net.reset_opts['min_lr'] = np.asarray(0.00008,
                                              dtype=theano.config.floatX)
        net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']
        net.InitLR(1000)
        trainer.InitParams(net)
        train_update_rule = trainer.InitUpdateRule(net)
        additional_output = ['input_sen', 'lstm_r', 'softmax']

        net.InitTrainFunction(train_update_rule,
                              [train_X, train_Y[:, :-1, :, :]],
                              train_Y[:, 1:, :, :], additional_output,
                              train_weight)
        net.InitValFunction([val_X, val_Y[:, :-1, :, :]], val_Y[:, 1:, :, :],
                            additional_output, val_weight)

    main_loop = SGDRMainLoop(net, trained_path)
    main_loop.run(net, trainer, e)
# Toy LSTM sanity check on synthetic data.
def train_lstm_test():
    # theano.config.optimizer='fast_compile'

    trainer = Trainer()

    # Setting training params
    trainer.opts['batch_size'] = 100
    trainer.opts['save'] = True
    trainer.opts['save_freq'] = 100
    trainer.opts['num_sample'] = 300000
    trainer.opts['num_epoch'] = 5000
    trainer.opts['train_sentence_length'] = 11
    trainer.opts['test_setence_length'] = 15
    trainer.opts['num_val_sample'] = 1
    trainer.opts['num_test_sample'] = 1
    # Generate data
    num_class = 16
    np.random.seed(13111991)

    x_dim = 32

    train_X, valid_X, test_X, train_Y, valid_Y, test_Y = CreateData(
        x_dim, num_class, trainer)

    # Create a CNN for debugging by fixing a set of real input
    # net = ConvNeuralNet(train_X[1:16,:,:,:].eval())

    # Create a CNN

    net = ShowTellNet()
    net.name = 'lstm_test'
    trained_path = '../../data/trained_model/'
    #trained_path = '/home/kien/data/trained_model/'
    snapshot_list = glob.glob(trained_path + net.name + '*.dat')
    e = -1
    if (len(snapshot_list) == 0):

        net.net_opts['l1_learning_rate'] = np.asarray(
            0.0001, dtype=theano.config.floatX)
        net.reset_opts['min_lr'] = np.asarray(0.00001,
                                              dtype=theano.config.floatX)
        net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']

        net.layer_opts['num_fc_node'] = 32
        net.content['img_emb'] = FCLayer(
            net, net.content['input_img'],
            (1, trainer.opts['train_sentence_length'], x_dim, 1))
        net.content['img_emb_swap'] = SwapDim(net, net.content['img_emb'], 1,
                                              2)
        # Construct the network

        net.layer_opts['num_emb'] = 32
        net.content['word_emb'] = WordEmbLayer(
            net, net.content['input_sen'],
            (trainer.opts['batch_size'],
             trainer.opts['train_sentence_length'] - 1, num_class, 1))

        net.content['cat'] = Concat(net, net.content['img_emb_swap'],
                                    net.content['word_emb'], 1)

        net.layer_opts['num_lstm_node'] = num_class
        net.content['lstm'] = LSTM(net, net.content['cat'],
                                   (trainer.opts['batch_size'],
                                    trainer.opts['train_sentence_length'] - 1,
                                    net.layer_opts['num_emb'], 1))

        net.content['lstm_r'] = LSTMRemove(net, net.content['lstm'], 0)

        #################### DEBUG #######################
        # X = np.reshape(train_X[0:2, :, :, :].eval(), (2, 10, x_dim, 1))
        # Y = np.reshape(train_Y[0:2, :, :, :].eval(), (2, 10, num_class, 1))
        # h_dummy = np.zeros((1, 1, net.layer_opts['num_lstm_node']), dtype=theano.config.floatX)
        # c_dummy = np.zeros((1, 1, net.layer_opts['num_lstm_node']), dtype=theano.config.floatX)
        h_dummy5 = np.zeros((1, 5, net.layer_opts['num_lstm_node']),
                            dtype=theano.config.floatX)
        c_dummy5 = np.zeros((1, 5, net.layer_opts['num_lstm_node']),
                            dtype=theano.config.floatX)
        # cat = net.content['cat'].output.eval({net.input[0]:X , net.input[1]: Y})
        # cat = np.reshape(cat, (2, 11, x_dim))
        # cat0 = np.reshape(cat[1,0,:], (1,1,x_dim))
        # cat1 = np.reshape(cat[1,1,:], (1,1,x_dim))
        # cat2 = np.reshape(cat[1,2,:], (1,1,x_dim))
        #
        # x0 = cat[0,0,:].reshape(1,1,x_dim)
        # x1 = cat[0,1,:].reshape(1,1,x_dim)
        # x2 = cat[0,2,:].reshape(1,1,x_dim)
        # x3 = cat[0,3,:].reshape(1,1,x_dim)
        # Wi = net.content['lstm'].W['i'].eval()
        # Wf = net.content['lstm'].W['f'].eval()
        # Wc = net.content['lstm'].W['c'].eval()
        # Wo = net.content['lstm'].W['o'].eval()
        #
        # Ui = net.content['lstm'].U['i'].eval()
        # Uf = net.content['lstm'].U['f'].eval()
        # Uc = net.content['lstm'].U['c'].eval()
        # Uo = net.content['lstm'].U['o'].eval()
        #
        # bi = net.content['lstm'].b['i'].eval()
        # bf = net.content['lstm'].b['f'].eval()
        # bc = net.content['lstm'].b['c'].eval()
        # bo = net.content['lstm'].b['o'].eval()
        #
        # hm1 = h_dummy
        # cm1 = c_dummy
        #
        # # First iteration
        # i0 = npsigmoid(np.dot(x0, Wi) + np.dot(hm1, Ui) + bi)
        # f0 = npsigmoid(np.dot(x0, Wf) + np.dot(hm1, Uf) + bf)
        # o0 = npsigmoid(np.dot(x0, Wo) + np.dot(hm1, Uo) + bo)
        # c0 = f0*cm1 + i0*np.tanh(np.dot(x0, Wc) + np.dot(hm1, Uc) + bc)
        # h0 = o0*c0
        #
        # # 2nd iteration
        # i1 = npsigmoid(np.dot(x1, Wi) + np.dot(h0, Ui) + bi)
        # f1 = npsigmoid(np.dot(x1, Wf) + np.dot(h0, Uf) + bf)
        # o1 = npsigmoid(np.dot(x1, Wo) + np.dot(h0, Uo) + bo)
        # c1 = f1 * c0 + i1 * np.tanh(np.dot(x1, Wc) + np.dot(h0, Uc) + bc)
        # h1 = o1 * c1
        #
        # # 3rd iteration
        # i2 = npsigmoid(np.dot(x2, Wi) + np.dot(h1, Ui) + bi)
        # f2 = npsigmoid(np.dot(x2, Wf) + np.dot(h1, Uf) + bf)
        # o2 = npsigmoid(np.dot(x2, Wo) + np.dot(h1, Uo) + bo)
        # c2 = f2 * c1 + i2 * np.tanh(np.dot(x2, Wc) + np.dot(h1, Uc) + bc)
        # h2 = o2 * c2
        #
        # # 4th iteration
        # i3 = npsigmoid(np.dot(x3, Wi) + np.dot(h2, Ui) + bi)
        # f3 = npsigmoid(np.dot(x3, Wf) + np.dot(h2, Uf) + bf)
        # o3 = npsigmoid(np.dot(x3, Wo) + np.dot(h2, Uo) + bo)
        # c3 = f3 * c2 + i3 * np.tanh(np.dot(x3, Wc) + np.dot(h2, Uc) + bc)
        # h3 = o3 * c3
        # bp = 1
        #
        #
        # lstm = net.content['lstm'].output.eval({net.input[0]:X, net.input[1]:Y,
        #                                         net.content['lstm'].h_m1_sym: h_dummy2,
        #                                         net.content['lstm'].c_m1_sym: c_dummy2})

        #################### END DEBUG ####################

        net.layer_opts['softmax_norm_dim'] = 2
        net.content['softmax'] = SoftmaxLayer(net, net.content['lstm_r'])

        net.content['cost'] = CategoricalCrossEntropy(net,
                                                      net.content['softmax'])

        # net.simpleprint()

        net.InitLR(0.01)

        # Create params list, grad list, momentum list for the theano function to update
        trainer.InitParams(net)

        # Update rule
        train_update_rule = trainer.InitUpdateRule(net)
        additional_output = ['input_img', 'word_emb', 'softmax']
        # Clip train_Y before

        net.InitTrainFunction(train_update_rule,
                              [train_X, train_Y[:, :-1, :, :]],
                              train_Y[:, 1:, :, :], additional_output)
        net.InitValFunction([valid_X, valid_Y[:, :-1, :, :]],
                            valid_Y[:, 1:, :, :], additional_output)
    else:
        snapshot_list = sorted(snapshot_list)
        print('Loading latest snapshot at %s' % snapshot_list[-1])
        net, trainer, e = LoadList(snapshot_list[-1])

        # trainer = Trainer()

        # Setting training params
        # trainer.opts['batch_size'] = 100
        # trainer.opts['save'] = True
        # trainer.opts['save_freq'] = 50
        # trainer.opts['num_sample'] = 1000
        # trainer.opts['num_epoch'] = 5000
        # trainer.opts['train_sentence_length'] = 10
        # trainer.opts['test_setence_length'] = 15
        # trainer.opts['num_val_sample'] = 1
        # trainer.opts['num_test_sample'] = 1
        #
        #

        net.net_opts['l1_learning_rate'] = np.asarray(
            0.0001, dtype=theano.config.floatX)
        net.reset_opts['min_lr'] = np.asarray(0.00001,
                                              dtype=theano.config.floatX)
        net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']
        net.InitLR(100)
        trainer.InitParams(net)
        # Create params list, grad list, momentum list for the theano function to update
        train_update_rule = trainer.InitUpdateRule(net)
        additional_output = ['input_img', 'word_emb', 'softmax']

        ###########################
        # net = ShowTellNet()
        # net.name = 'lstm_test'
        #
        # net.net_opts['l1_learning_rate'] = np.asarray(0.0001, dtype=theano.config.floatX)
        # net.reset_opts['min_lr'] = np.asarray(0.00001, dtype=theano.config.floatX)
        # net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']
        #
        # net.layer_opts['num_fc_node'] = 16
        # net.content['img_emb'] = FCLayer(net, net.content['input_img'], (1, 10, x_dim, 1))
        # net.content['img_emb_swap'] = SwapDim(net, net.content['img_emb'], 1, 2)
        # # Construct the network
        #
        # net.layer_opts['num_emb'] = 16
        # net.content['word_emb'] = WordEmbLayer(net, net.content['input_sen'],
        #                                        (trainer.opts['batch_size'], trainer.opts['train_sentence_length'],
        #                                         num_class, 1))
        #
        # net.content['cat'] = Concat(net, net.content['img_emb_swap'], net.content['word_emb'], 1)
        #
        # net.layer_opts['num_lstm_node'] = num_class
        # net.content['lstm'] = LSTM(net, net.content['cat'],
        #                            (trainer.opts['batch_size'], trainer.opts['train_sentence_length'],
        #                             net.layer_opts['num_emb'], 1))
        #
        # net.content['lstm_r'] = LSTMRemove(net, net.content['lstm'], 0, 1)
        #
        # net.layer_opts['softmax_norm_dim'] = 2
        # net.content['softmax'] = SoftmaxLayer(net, net.content['lstm_r'])
        #
        # net.content['cost'] = CategoricalCrossEntropy(net, net.content['softmax'])
        # net.InitLR(100)
        # trainer.InitParams(net)
        # train_update_rule = trainer.InitUpdateRule(net)
        # additional_output = ['input_img', 'word_emb', 'softmax']

        ########################

        # Create params list, grad list, momentum list for the theano function to update

        # net.train_function = theano.function(
        #     [net.index],
        #     outputs=[net.content['cost'].output] + [net.output[0][net.index, :, :, :]],
        #     updates=None,
        #     givens={
        #         net.input[0]: train_X[net.index, :, :, :],
        #         net.input[1]: train_Y[net.index, :, :, :],
        #         net.output[0]: train_X[net.index, :, :, :],
        #         net.content['lstm'].h_m1_sym: T.zeros((1, net.index.shape[0], net.content['lstm'].W_shape[1]),
        #                                               dtype=theano.config.floatX),
        #         net.content['lstm'].c_m1_sym: T.zeros((1, net.index.shape[0], net.content['lstm'].W_shape[1]),
        #                                               dtype=theano.config.floatX)
        #
        #     }
        #
        # )
        net.InitTrainFunction(train_update_rule,
                              [train_X, train_Y[:, :-1, :, :]],
                              train_Y[:, 1:, :, :], additional_output)
        net.InitValFunction([valid_X, valid_Y[:, :-1, :, :]],
                            valid_Y[:, 1:, :, :], additional_output)

    main_loop = SGDRMainLoop(net, trained_path)
    main_loop.run(net, trainer, e)
