Example #1
# Imports needed to run these examples; FLAGS, ActionsDataLoader and the model
# classes (UNet, UNetSound, UNetAc, UNetE, Jointmvae, JointTwomvae, JointTwomvae2)
# are assumed to come from the project's own modules.
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim


def plotdecodeimages():

    dataset = FLAGS.train_file.split('/')[-1]
    dataset = dataset.split('.')[0]

    s = FLAGS.init_checkpoint.split('/')[-1]
    name = (s.split('_')[1]).split('.ckpt')[0]

    nameac = '{}_{}_{}'.format(dataset, 'Ac', name)
    nameaudio = '{}_{}_{}'.format(dataset, 'Audio', name)
    nameimages = '{}_{}_{}'.format(dataset, 'Video', name)
    checkpoint_dir = FLAGS.init_checkpoint.split('/')[:-1]
    data_dirac = '/'.join(checkpoint_dir + [nameac])
    data_diraudio = '/'.join(checkpoint_dir + [nameaudio])
    data_dirimages = '/'.join(checkpoint_dir + [nameimages])
    num_classes = FLAGS.num_classes
    temporal_pooling = FLAGS.temporal_pooling

    nr_frames = FLAGS.nr_frames
    random_pick = True

    build_spectrogram = FLAGS.model in ('AudioCoefficients', 'ResNet50', 'HearNet',
                                        'UNet', 'ResNet18_v1')
    normalize = FLAGS.model == 'HearNet'

    # Create data loaders according to the received program arguments
    print('{} - Creating data loaders'.format(datetime.now()))
    # Use all three modalities (presumably 0 = acoustic image, 1 = audio, 2 = video).
    modalities = [0, 1, 2]

    with tf.device('/cpu:0'):
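        # Single-epoch, unshuffled loader so every sample is visited exactly once
        # in a fixed order.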
        train_data = ActionsDataLoader(FLAGS.train_file,
                                       'inference',
                                       batch_size=FLAGS.batch_size,
                                       num_epochs=1,
                                       sample_length=1,
                                       datakind='outdoor',
                                       buffer_size=10,
                                       shuffle=False,
                                       normalize=normalize,
                                       build_spectrogram=build_spectrogram,
                                       correspondence=0,
                                       random_pick=random_pick,
                                       modalities=modalities,
                                       nr_frames=FLAGS.nr_frames)
    data_size = train_data.num_samples
    # Build model
    print('{} - Building model'.format(datetime.now()))

    with tf.device('/gpu:0'):

        # One U-Net style autoencoder per modality: video frames, spectrograms and
        # 12-channel acoustic images.
        modelimages = UNet(input_shape=[224, 298, 3])
        modelaudio = UNetSound(input_shape=[99, 257, 1])
        modelac = UNetAc(input_shape=[36, 48, 12])
        # model = UNetE(input_shape=[36, 48, 1])
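    # Feedable iterator: the string-handle placeholder lets the same graph read
    # batches from the training iterator created below.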
    handle = tf.placeholder(tf.string, shape=())
    iterator = tf.data.Iterator.from_string_handle(
        handle, train_data.data.output_types, train_data.data.output_shapes)
    train_iterat = train_data.data.make_initializable_iterator()
    next_batch = iterator.get_next()

    # Unpack the batch: next_batch[0] is the 36x48x12 acoustic image (its first
    # channel is treated as a log-energy map), next_batch[1] the 99x257
    # spectrogram and next_batch[2] a 224x298 RGB frame.
    logenergy = tf.slice(next_batch[0], [0, 0, 0, 0, 0], [-1, 1, 36, 48, 1])
    logenergy = tf.reshape(logenergy, shape=[-1, 36, 48, 1])
    mfcc = tf.reshape(next_batch[1], shape=[-1, 99, 257, 1])
    images = tf.reshape(next_batch[2], shape=[-1, 224, 298, 3])
    acoustic = tf.reshape(next_batch[0], shape=[-1, 36, 48, 12])

    # Min-max normalise the log-energy map per sample (not used further in this
    # function, which only extracts features from the three full modalities).
    logenergy = logenergy - tf.reduce_min(
        logenergy, axis=[1, 2], keep_dims=True)
    logenergy = logenergy / tf.reduce_max(
        logenergy, axis=[1, 2], keep_dims=True)

    # mfcc = mfcc - tf.reduce_min(mfcc, axis=[1, 2], keep_dims=True)
    # mfcc = mfcc / tf.reduce_max(mfcc, axis=[1, 2], keep_dims=True)

    if FLAGS.datatype == 'music':
        num_actions = 9
        num_locations = 11  # maximum number of videos for a class
    else:  # self.datakind == 'outdoor':
        num_actions = 10
        num_locations = 61
    num_embedding = 128
    labels = tf.reshape(next_batch[3], shape=[-1, num_actions])
    scenario = tf.reshape(next_batch[4], shape=[-1, num_locations])

    modelac._build_model(acoustic)
    modelaudio._build_model(mfcc)
    modelimages._build_model(images)
    # samples = tf.random_normal([tf.shape(model.variance)[0], tf.shape(model.variance)[1]], 0, 1,
    #                            dtype=tf.float32)
    # guessed_z = model.mean + (model.variance * samples)
    # Per-modality encoder feature maps, dumped to .npy files below.
    extractedac = modelac.network['features']
    extractedaudio = modelaudio.network['features']
    extractedvideo = modelimages.network['features']
    var_listac = slim.get_variables(modelac.scope + '/')
    var_listaudio = slim.get_variables(modelaudio.scope + '/')
    var_listimages = slim.get_variables(modelimages.scope + '/')

    # makedirs creates all intermediate directories (mkdir would create only one)
    for directory in (data_dirac, data_diraudio, data_dirimages):
        if os.path.exists(directory):
            print("Features already computed!")
        else:
            os.makedirs(directory)

    total_size = 0
    batch_count = 0

    # Pre-allocate one row per sample; the spatial sizes match each encoder's
    # feature-map resolution.
    dataset_list_featuresac = np.zeros([data_size, 9, 12, num_embedding],
                                       dtype=float)
    dataset_list_featuresaudio = np.zeros([data_size, 6, 16, num_embedding],
                                          dtype=float)
    dataset_list_featuresimages = np.zeros([data_size, 14, 18, num_embedding],
                                           dtype=float)
    dataset_labels = np.zeros([data_size, num_actions], dtype=int)
    dataset_scenario = np.zeros([data_size, num_locations], dtype=int)
    print('{} - Starting'.format(datetime.now()))

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          gpu_options=tf.GPUOptions(
                                              allow_growth=True))) as session:
        train_handle = session.run(train_iterat.string_handle())
        # Restore all three encoder/decoder variable scopes from a single checkpoint.
        saver = tf.train.Saver(var_list=var_listac + var_listaudio +
                               var_listimages)
        saver.restore(session, FLAGS.init_checkpoint)
        print('{} - Done'.format(datetime.now()))
        #variables_in_checkpoint = tf.train.list_variables('path.ckpt')
        session.run(train_iterat.initializer)
        while True:
            try:
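                # Inference pass: dropout disabled (keep_prob=1) and the networks
                # in evaluation mode (is_training=0).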
                labels_data, scenario_data, featuresac, featuresaudio, featuresimages = session.run(
                    [
                        labels, scenario, extractedac, extractedaudio,
                        extractedvideo
                    ],
                    feed_dict={
                        handle: train_handle,
                        modelac.network['keep_prob']: 1.0,
                        modelac.network['is_training']: 0,
                        modelaudio.network['keep_prob']: 1.0,
                        modelaudio.network['is_training']: 0,
                        modelimages.network['keep_prob']: 1.0,
                        modelimages.network['is_training']: 0
                    })
                batchnum = labels_data.shape[0]
                # copy block of data
                dataset_list_featuresimages[total_size:total_size +
                                            batchnum, :] = featuresimages
                dataset_list_featuresaudio[total_size:total_size +
                                           batchnum, :] = featuresaudio
                dataset_list_featuresac[total_size:total_size +
                                        batchnum, :] = featuresac
                dataset_labels[total_size:total_size +
                               batchnum, :] = labels_data
                dataset_scenario[total_size:total_size +
                                 batchnum, :] = scenario_data
                # increase number of data
                total_size += batchnum
                end_time = datetime.now()
                print('{} samples'.format(total_size))
            except tf.errors.OutOfRangeError:
                break
            batch_count += 1
    print('Expected {} samples'.format(data_size))
    print('{} - Completed, got {} samples'.format(datetime.now(), total_size))
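    # Dump the extracted features together with the action-label and scenario
    # one-hot arrays, one folder per modality.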
    np.save('{}/{}_data.npy'.format(data_dirac, dataset),
            dataset_list_featuresac)
    np.save('{}/{}_labels.npy'.format(data_dirac, dataset), dataset_labels)
    np.save('{}/{}_scenario.npy'.format(data_dirac, dataset), dataset_scenario)

    np.save('{}/{}_data.npy'.format(data_diraudio, dataset),
            dataset_list_featuresaudio)
    np.save('{}/{}_labels.npy'.format(data_diraudio, dataset), dataset_labels)
    np.save('{}/{}_scenario.npy'.format(data_diraudio, dataset),
            dataset_scenario)

    np.save('{}/{}_data.npy'.format(data_dirimages, dataset),
            dataset_list_featuresimages)
    np.save('{}/{}_labels.npy'.format(data_dirimages, dataset), dataset_labels)
    np.save('{}/{}_scenario.npy'.format(data_dirimages, dataset),
            dataset_scenario)

Example #2

def plotdecodeimages():

    dataset = FLAGS.train_file.split('/')[-1]
    dataset = dataset.split('.')[0]

    s = FLAGS.init_checkpoint.split('/')[-1]
    name = (s.split('_')[1]).split('.ckpt')[0]

    name = '{}_{}_{}_{}'.format(FLAGS.model, dataset, 'Ac', name)
    data_dir = '/'.join(FLAGS.init_checkpoint.split('/')[:-1] + [name])
    random_pick = True

    build_spectrogram = FLAGS.model in ('AudioCoefficients', 'ResNet50', 'HearNet',
                                        'UNet', 'ResNet18_v1')
    normalize = FLAGS.model == 'HearNet'

    # Create data loaders according to the received program arguments
    print('{} - Creating data loaders'.format(datetime.now()))
    # Use all three modalities (presumably 0 = acoustic image, 1 = audio, 2 = video).
    modalities = [0, 1, 2]

    with tf.device('/cpu:0'):
        train_data = ActionsDataLoader(FLAGS.train_file,
                                       'inference',
                                       batch_size=FLAGS.batch_size,
                                       num_epochs=1,
                                       sample_length=1,
                                       datakind='outdoor',
                                       buffer_size=10,
                                       shuffle=False,
                                       normalize=normalize,
                                       build_spectrogram=build_spectrogram,
                                       correspondence=0,
                                       random_pick=random_pick,
                                       modalities=modalities,
                                       nr_frames=FLAGS.nr_frames)
    data_size = train_data.num_samples
    # Build model
    print('{} - Building model'.format(datetime.now()))

    with tf.device('/gpu:0'):

        modelimages = UNet(input_shape=[224, 298, 3])
        modelaudio = UNetSound(input_shape=[99, 257, 1])
        modelac = UNetAc(input_shape=[36, 48, 12])
        # Joint "associator" that fuses the per-modality feature maps: two-modality
        # variants for the fusion and audio-video-only settings, a three-modality
        # one otherwise.
        if FLAGS.fusion:
            model_associator = JointTwomvae2()
        elif FLAGS.onlyaudiovideo:
            model_associator = JointTwomvae()
        else:
            model_associator = Jointmvae()
    handle = tf.placeholder(tf.string, shape=())
    iterator = tf.data.Iterator.from_string_handle(
        handle, train_data.data.output_types, train_data.data.output_shapes)
    train_iterat = train_data.data.make_initializable_iterator()
    next_batch = iterator.get_next()

    mfcc = tf.reshape(next_batch[1], shape=[-1, 99, 257, 1])
    mfcc = tf.image.resize_bilinear(mfcc, [193, 257], align_corners=False)
    video = tf.reshape(next_batch[2], shape=[-1, 224, 298, 3])
    acoustic = tf.reshape(next_batch[0], shape=[-1, 36, 48, 12])

    # mfcc = mfcc - tf.reduce_min(mfcc, axis=[1, 2], keep_dims=True)
    # mfcc = mfcc / tf.reduce_max(mfcc, axis=[1, 2], keep_dims=True)

    if FLAGS.datatype == 'music':
        num_actions = 9
        num_locations = 11  # maximum number of videos for a class
    else:  # self.datakind == 'outdoor':
        num_actions = 10
        num_locations = 61
    num_embedding = 128
    labels = tf.reshape(next_batch[3], shape=[-1, num_actions])
    scenario = tf.reshape(next_batch[4], shape=[-1, num_locations])
    output = modelac._build_network(acoustic)
    outputvideo = modelimages._build_network(video)
    outputaudio = modelaudio._build_network(mfcc)
    # fuse feature maps and get new feature maps for 3 mod
    if FLAGS.fusion or FLAGS.onlyaudiovideo:
        model_associator._build_model(outputvideo, outputaudio)
    else:
        model_associator._build_model(output, outputvideo, outputaudio)

    # Decode the fused features back into each modality; only the acoustic decoder
    # is needed in the audio-video-only setting.
    modelac._build_model(model_associator.outputac)
    if not FLAGS.onlyaudiovideo:
        modelaudio._build_model(model_associator.outputaudio)
        modelimages._build_model(model_associator.outputvideo)

    var_listac = slim.get_variables(modelac.scope + '/')
    var_listaudio = slim.get_variables(modelaudio.scope + '/')
    var_listimages = slim.get_variables(modelimages.scope + '/')
    var_listassociator = slim.get_variables(model_associator.scope + '/')

    if os.path.exists(data_dir):
        print("Features already computed!")
    else:
        os.makedirs(
            data_dir
        )  # mkdir creates one directory, makedirs all intermediate directories
    num = 0
    total_size = 0
    batch_count = 0

    print('{} - Starting'.format(datetime.now()))

    namesimage = ['Acoustic image', 'Reconstructed']

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          gpu_options=tf.GPUOptions(
                                              allow_growth=True))) as session:
        train_handle = session.run(train_iterat.string_handle())
        saver = tf.train.Saver(var_list=var_listac + var_listaudio +
                               var_listimages + var_listassociator)
        saver.restore(session, FLAGS.init_checkpoint)
        print('{} - Done'.format(datetime.now()))
        #variables_in_checkpoint = tf.train.list_variables('path.ckpt')
        session.run(train_iterat.initializer)
        while True:
            try:
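                # Inference pass: dropout disabled and all networks in evaluation mode.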
                data, reconstructed = session.run(
                    [acoustic, modelac.output],
                    feed_dict={
                        handle: train_handle,
                        modelac.keep_prob: 1.0,
                        modelac.is_training: 0,
                        modelaudio.keep_prob: 1.0,
                        modelaudio.is_training: 0,
                        modelimages.keep_prob: 1.0,
                        modelimages.is_training: 0
                    })
                batchnum = reconstructed.shape[0]
                # copy block of data
                # increase number of data
                total_size += batchnum
                print('{} samples'.format(total_size))
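                # Plot original vs. reconstructed acoustic images; the 12 channels are
                # shown as four RGB triplets (channels 0-2, 3-5, 6-8 and 9-11).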
                for h in range(np.shape(reconstructed)[0]):
                    # original and reconstructed
                    fig, axs = plt.subplots(4, 2, figsize=(6, 2.9 * 4))
                    plt.tight_layout(pad=1.0)
                    fig.suptitle('Reconstructed image')
                    imagesvideo = np.stack((data, reconstructed), 0)
                    for i in range(2):
                        for j in range(4):
                            axs[j, i].imshow(imagesvideo[i, h, :, :,
                                                         j * 3:(j + 1) * 3])
                            axs[j, i].axis('off')
                            axs[j, i].set_title('{}'.format(namesimage[i]))
                    outImage_path = '{}/{}_images_{}.png'.format(
                        data_dir, dataset, num)
                    plt.savefig(outImage_path)
                    plt.clf()
                    num = num + 1
            except tf.errors.OutOfRangeError:
                break
            batch_count += 1
    print('Expected {} samples'.format(data_size))
    print('{} - Completed, got {} samples'.format(datetime.now(), total_size))

Example #3

def plotdecodeimages():

    dataset = FLAGS.train_file.split('/')[-1]
    dataset = dataset.split('.')[0]

    s = FLAGS.init_checkpoint.split('/')[-1]
    name = (s.split('_')[1]).split('.ckpt')[0]

    name = '{}_{}_{}_{}'.format(FLAGS.model, dataset, FLAGS.encoder_type, name)
    data_dir = '/'.join(FLAGS.init_checkpoint.split('/')[:-1] + [name])
    num_classes = FLAGS.num_classes
    temporal_pooling = FLAGS.temporal_pooling

    nr_frames = FLAGS.nr_frames
    random_pick = True

    build_spectrogram = FLAGS.model in ('AudioCoefficients', 'ResNet50', 'HearNet',
                                        'UNet', 'ResNet18_v1')
    normalize = FLAGS.model == 'HearNet'

    # Create data loaders according to the received program arguments
    print('{} - Creating data loaders'.format(datetime.now()))
    # Use all three modalities (presumably 0 = acoustic image, 1 = audio, 2 = video).
    modalities = [0, 1, 2]

    with tf.device('/cpu:0'):
        train_data = ActionsDataLoader(FLAGS.train_file,
                                       'inference',
                                       batch_size=FLAGS.batch_size,
                                       num_epochs=1,
                                       sample_length=1,
                                       datakind='outdoor',
                                       buffer_size=10,
                                       shuffle=False,
                                       normalize=normalize,
                                       build_spectrogram=build_spectrogram,
                                       correspondence=0,
                                       random_pick=random_pick,
                                       modalities=modalities,
                                       nr_frames=FLAGS.nr_frames)

    # Build model
    print('{} - Building model'.format(datetime.now()))

    with tf.device('/gpu:0'):

        # Pick the single-modality autoencoder matching the requested encoder type;
        # anything other than Video/Audio/Ac falls back to the energy-only UNetE.
        if FLAGS.encoder_type == 'Video':
            model = UNet(input_shape=[224, 298, 3])
        elif FLAGS.encoder_type == 'Audio':
            model = UNetSound(input_shape=[99, 257, 1])
        elif FLAGS.encoder_type == 'Ac':
            model = UNetAc(input_shape=[36, 48, 12])
        else:
            model = UNetE(input_shape=[36, 48, 1])
    handle = tf.placeholder(tf.string, shape=())
    iterator = tf.data.Iterator.from_string_handle(
        handle, train_data.data.output_types, train_data.data.output_shapes)
    train_iterat = train_data.data.make_initializable_iterator()
    next_batch = iterator.get_next()

    logenergy = tf.slice(next_batch[0], [0, 0, 0, 0, 0], [-1, 1, 36, 48, 1])
    logenergy = tf.reshape(logenergy, shape=[-1, 36, 48, 1])
    mfcc = tf.reshape(next_batch[1], shape=[-1, 99, 257, 1])
    mfcc = tf.image.resize_bilinear(mfcc, [193, 257], align_corners=False)
    images = tf.reshape(next_batch[2], shape=[-1, 224, 298, 3])
    acoustic = tf.reshape(next_batch[0], shape=[-1, 36, 48, 12])

    logenergy = logenergy - tf.reduce_min(
        logenergy, axis=[1, 2], keep_dims=True)
    logenergy = logenergy / tf.reduce_max(
        logenergy, axis=[1, 2], keep_dims=True)

    # mfcc = mfcc - tf.reduce_min(mfcc, axis=[1, 2], keep_dims=True)
    # mfcc = mfcc / tf.reduce_max(mfcc, axis=[1, 2], keep_dims=True)

    if FLAGS.encoder_type == 'Video':
        considered_modality = images
    elif FLAGS.encoder_type == 'Audio':
        considered_modality = mfcc
    elif FLAGS.encoder_type == 'Ac':
        considered_modality = acoustic
    else:
        considered_modality = logenergy

    model._build_model(considered_modality)
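    # The autoencoder reconstructs its own input; model.output is plotted next to
    # the original modality below.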
    output = model.output
    var_list2 = slim.get_variables(model.scope + '/')

    if os.path.exists(data_dir):
        print("Features already computed!")
    else:
        os.makedirs(
            data_dir
        )  # mkdir creates one directory, makedirs all intermediate directories

    total_size = 0
    batch_count = 0
    num = 0
    print('{} - Starting'.format(datetime.now()))

    if FLAGS.encoder_type == 'Video':
        namesimage = ['RGB', 'Reconstructed']
    elif FLAGS.encoder_type == 'Audio':
        namesimage = ['Spectrogram', 'Reconstructed']
    elif FLAGS.encoder_type == 'Ac':
        namesimage = ['Acoustic image', 'Reconstructed']
    else:
        namesimage = ['Energy', 'Reconstructed']

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          gpu_options=tf.GPUOptions(
                                              allow_growth=True))) as session:
        train_handle = session.run(train_iterat.string_handle())
        # Initialize student model
        if FLAGS.init_checkpoint is None:
            print('{} - Initializing student model'.format(datetime.now()))
            model.init_model(session, FLAGS.init_checkpoint)
            print('{} - Done'.format(datetime.now()))
        else:
            print('{} - Restoring student model'.format(datetime.now()))
            saver = tf.train.Saver(var_list=var_list2)
            saver.restore(session, FLAGS.init_checkpoint)
            print('{} - Done'.format(datetime.now()))
            #variables_in_checkpoint = tf.train.list_variables('path.ckpt')
        session.run(train_iterat.initializer)
        # The three branches differ only in how the reconstruction is plotted:
        # single-channel images for Audio/Energy, RGB frames for Video, and four
        # RGB triplets for the 12-channel acoustic images.
        if FLAGS.encoder_type in ('Audio', 'Energy'):
            while True:
                try:
                    data, reconstructed = session.run(
                        [considered_modality, output],
                        feed_dict={
                            handle: train_handle,
                            model.network['keep_prob']: 1.0,
                            model.network['is_training']: 0
                        })
                    total_size += reconstructed.shape[0]

                    for h in range(np.shape(reconstructed)[0]):
                        # original and reconstructed
                        fig, axs = plt.subplots(1, 2, figsize=(6, 2.9))
                        plt.tight_layout(pad=1.0)
                        fig.suptitle('Reconstructed image')
                        imagesvideo = np.stack((data, reconstructed), 0)
                        for i in range(2):
                            axs[i].imshow(imagesvideo[i, h, :, :, 0])
                            axs[i].axis('off')
                            axs[i].set_title('{}'.format(namesimage[i]))
                        outImage_path = '{}/{}_images_{}.png'.format(
                            data_dir, dataset, num)
                        plt.savefig(outImage_path)
                        plt.clf()
                        num = num + 1
                    print(total_size)
                except tf.errors.OutOfRangeError:
                    break
                batch_count += 1
            print('{} - Completed, got {} samples'.format(
                datetime.now(), total_size))
        elif FLAGS.encoder_type == 'Video':
            while True:
                try:
                    data, reconstructed = session.run(
                        [considered_modality, output],
                        feed_dict={
                            handle: train_handle,
                            model.network['keep_prob']: 1.0,
                            model.network['is_training']: 0
                        })
                    total_size += reconstructed.shape[0]

                    for h in range(np.shape(reconstructed)[0]):
                        # original and reconstructed
                        fig, axs = plt.subplots(1, 2, figsize=(6, 2.9))
                        plt.tight_layout(pad=1.0)
                        fig.suptitle('Reconstructed image')
                        imagesvideo = np.stack((data, reconstructed), 0)
                        for i in range(2):
                            axs[i].imshow(imagesvideo[i, h, :, :, :])
                            axs[i].axis('off')
                            axs[i].set_title('{}'.format(namesimage[i]))
                        outImage_path = '{}/{}_images_{}.png'.format(
                            data_dir, dataset, num)
                        plt.savefig(outImage_path)
                        plt.clf()
                        num = num + 1
                    print(total_size)
                except tf.errors.OutOfRangeError:
                    break
                batch_count += 1
            print('{} - Completed, got {} samples'.format(
                datetime.now(), total_size))
        else:
            while True:
                try:
                    data, reconstructed = session.run(
                        [considered_modality, output],
                        feed_dict={
                            handle: train_handle,
                            model.network['keep_prob']: 1.0,
                            model.network['is_training']: 0
                        })
                    total_size += reconstructed.shape[0]

                    for h in range(np.shape(reconstructed)[0]):
                        # original and reconstructed
                        fig, axs = plt.subplots(4, 2, figsize=(6, 2.9 * 4))
                        plt.tight_layout(pad=1.0)
                        fig.suptitle('Reconstructed image')
                        imagesvideo = np.stack((data, reconstructed), 0)
                        for i in range(2):
                            for j in range(4):
                                axs[j, i].imshow(imagesvideo[i, h, :, :,
                                                             j * 3:(j + 1) * 3])
                                axs[j, i].axis('off')
                                axs[j, i].set_title('{}'.format(namesimage[i]))
                        outImage_path = '{}/{}_images_{}.png'.format(
                            data_dir, dataset, num)
                        plt.savefig(outImage_path)
                        plt.clf()
                        num = num + 1
                    print(total_size)
                except tf.errors.OutOfRangeError:
                    break
                batch_count += 1
            print('{} - Completed, got {} samples'.format(
                datetime.now(), total_size))

Example #4

def plotdecodeimages():
    encoder_type = FLAGS.encoder_type
    dataset = FLAGS.train_file.split('/')[-1]
    dataset = dataset.split('.')[0]

    s = FLAGS.init_checkpoint.split('/')[-1]
    name = (s.split('_')[1]).split('.ckpt')[0]
    if FLAGS.fusion:
        name2 = '{}_Ac{}_{}'.format(dataset, 'VideoAudio', name)
    else:
        name2 = '{}_Ac{}_{}'.format(dataset, encoder_type, name)
    data_dir = '/'.join(FLAGS.init_checkpoint.split('/')[:-1] + [name2])

    random_pick = True

    build_spectrogram = True
    normalize = False

    # Create data loaders according to the received program arguments
    print('{} - Creating data loaders'.format(datetime.now()))
    # Use all three modalities (presumably 0 = acoustic image, 1 = audio, 2 = video).
    modalities = [0, 1, 2]

    with tf.device('/cpu:0'):
        train_data = ActionsDataLoader(FLAGS.train_file,
                                       'inference',
                                       batch_size=FLAGS.batch_size,
                                       num_epochs=1,
                                       sample_length=1,
                                       datakind='outdoor',
                                       buffer_size=10,
                                       shuffle=False,
                                       normalize=normalize,
                                       build_spectrogram=build_spectrogram,
                                       correspondence=0,
                                       random_pick=random_pick,
                                       modalities=modalities,
                                       nr_frames=FLAGS.nr_frames)
    data_size = train_data.num_samples
    # Build model
    print('{} - Building model'.format(datetime.now()))
    handle = tf.placeholder(tf.string, shape=())
    iterator = tf.data.Iterator.from_string_handle(
        handle, train_data.data.output_types, train_data.data.output_shapes)
    train_iterat = train_data.data.make_initializable_iterator()
    next_batch = iterator.get_next()

    # logenergy = tf.slice(next_batch[0], [0, 0, 0, 0, 0], [-1, 1, 36, 48, 1])
    # logenergy = tf.reshape(logenergy, shape=[-1, 36, 48, 1])
    mfcc = tf.reshape(next_batch[1], shape=[-1, 99, 257, 1])
    images = tf.reshape(next_batch[2], shape=[-1, 224, 298, 3])
    acoustic = tf.reshape(next_batch[0], shape=[-1, 36, 48, 12])

    # logenergy = logenergy - tf.reduce_min(logenergy, axis=[1, 2], keep_dims=True)
    # logenergy = logenergy / tf.reduce_max(logenergy, axis=[1, 2], keep_dims=True)

    # mfcc = mfcc - tf.reduce_min(mfcc, axis=[1, 2], keep_dims=True)
    # mfcc = mfcc / tf.reduce_max(mfcc, axis=[1, 2], keep_dims=True)

    if FLAGS.datatype == 'music':
        num_actions = 9
        num_locations = 11  # maximum number of videos for a class
    else:  # self.datakind == 'outdoor':
        num_actions = 10
        num_locations = 61
    num_embedding = 128
    labels = tf.reshape(next_batch[3], shape=[-1, num_actions])
    scenario = tf.reshape(next_batch[4], shape=[-1, num_locations])

    with tf.device('/gpu:0'):
        modelac = UNetAc(input_shape=[36, 48, 12])
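        # Fusion: infer the latent code from both the video and the audio encoder;
        # otherwise use only the encoder selected by FLAGS.encoder_type.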
        if FLAGS.fusion:
            modelimages = UNet(input_shape=[224, 298, 3])
            modelimages._build_model(images)
            modelaudio = UNetSound(input_shape=[99, 257, 1])
            modelaudio._build_model(mfcc)
            meanimages = modelimages.mean
            varianceimages = modelimages.variance
            meanaudio = modelaudio.mean
            varianceaudio = modelaudio.variance
            # Reparameterisation trick: sample the latent code by summing the two
            # encoders' means and scaling standard-normal noise by the summed
            # variances.
            samples = tf.random_normal(
                [tf.shape(varianceimages)[0],
                 tf.shape(varianceimages)[1]],
                0,
                1,
                dtype=tf.float32)
            z = meanimages + meanaudio + (
                (varianceaudio + varianceimages) * samples)
            var_list = slim.get_variables(modelaudio.scope +
                                          '/') + slim.get_variables(
                                              modelimages.scope + '/')
        else:
            if FLAGS.encoder_type == 'Video':
                model = UNet(input_shape=[224, 298, 3])
                model._build_model(images)
            elif FLAGS.encoder_type == 'Audio':
                model = UNetSound(input_shape=[99, 257, 1])
                model._build_model(mfcc)
            mean = model.mean
            variance = model.variance
            # Reparameterisation trick: z = mean + variance * eps with eps ~ N(0, 1).
            samples = tf.random_normal(
                [tf.shape(variance)[0],
                 tf.shape(variance)[1]],
                0,
                1,
                dtype=tf.float32)
            z = mean + (variance * samples)
            var_list = slim.get_variables(model.scope + '/')

        # Reconstruct the acoustic image conditioned on the latent code z inferred
        # from the other modality / modalities.
        modelac._build_model(acoustic, z)
        output = modelac.output
        var_listac = slim.get_variables(modelac.scope + '/')

    if os.path.exists(data_dir):
        print("Features already computed!")
    else:
        os.makedirs(
            data_dir
        )  # mkdir creates one directory, makedirs all intermediate directories

    total_size = 0
    batch_count = 0
    num = 0
    print('{} - Starting'.format(datetime.now()))

    namesimage = ['Acoustic image', 'Reconstructed']

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          gpu_options=tf.GPUOptions(
                                              allow_growth=True))) as session:
        train_handle = session.run(train_iterat.string_handle())
        saver = tf.train.Saver(var_list=var_listac + var_list)
        saver.restore(session, FLAGS.init_checkpoint)
        print('{} - Done'.format(datetime.now()))
        #variables_in_checkpoint = tf.train.list_variables('path.ckpt')
        session.run(train_iterat.initializer)
        if FLAGS.fusion:
            while True:
                try:
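                    # Inference pass: dropout disabled and all networks in evaluation mode.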
                    data, reconstructed = session.run(
                        [acoustic, output],
                        feed_dict={
                            handle: train_handle,
                            modelac.network['keep_prob']: 1.0,
                            modelac.network['is_training']: 0,
                            modelaudio.network['keep_prob']: 1.0,
                            modelaudio.network['is_training']: 0,
                            modelimages.network['keep_prob']: 1.0,
                            modelimages.network['is_training']: 0
                        })
                    total_size += reconstructed.shape[0]

                    for h in range(np.shape(reconstructed)[0]):
                        # original and reconstructed
                        fig, axs = plt.subplots(4, 2, figsize=(6, 2.9 * 4))
                        plt.tight_layout(pad=1.0)
                        fig.suptitle('Reconstructed image')
                        imagesvideo = np.stack((data, reconstructed), 0)
                        for i in range(2):
                            for j in range(4):
                                axs[j, i].imshow(imagesvideo[i, h, :, :,
                                                             j * 3:(j + 1) * 3])
                                axs[j, i].axis('off')
                                axs[j, i].set_title('{}'.format(namesimage[i]))
                        outImage_path = '{}/{}_images_{}.png'.format(
                            data_dir, dataset, num)
                        plt.savefig(outImage_path)
                        plt.clf()
                        num = num + 1
                    print('{} samples'.format(total_size))
                except tf.errors.OutOfRangeError:
                    break
                batch_count += 1
        else:
            while True:
                try:
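                    # Inference pass for the single-encoder setting (the chosen encoder
                    # plus the acoustic decoder).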
                    data, reconstructed = session.run(
                        [acoustic, output],
                        feed_dict={
                            handle: train_handle,
                            modelac.network['keep_prob']: 1.0,
                            modelac.network['is_training']: 0,
                            model.network['keep_prob']: 1.0,
                            model.network['is_training']: 0
                        })
                    total_size += reconstructed.shape[0]

                    for h in range(np.shape(reconstructed)[0]):
                        # original and reconstructed
                        fig, axs = plt.subplots(4, 2, figsize=(6, 2.9 * 4))
                        plt.tight_layout(pad=1.0)
                        fig.suptitle('Reconstructed image')
                        imagesvideo = np.stack((data, reconstructed), 0)
                        for i in range(2):
                            for j in range(4):
                                axs[j, i].imshow(imagesvideo[i, h, :, :,
                                                             j * 3:(j + 1) * 3])
                                axs[j, i].axis('off')
                                axs[j, i].set_title('{}'.format(namesimage[i]))
                        outImage_path = '{}/{}_images_{}.png'.format(
                            data_dir, dataset, num)
                        plt.savefig(outImage_path)
                        plt.clf()
                        num = num + 1
                    print('{} samples'.format(total_size))
                except tf.errors.OutOfRangeError:
                    break
                batch_count += 1

    print('Expected {} samples'.format(data_size))
    print('{} - Completed, got {} samples'.format(datetime.now(), total_size))