Example #1
    def prepare_inputs(self, inputs):
        """Prepares inputs on device to be fed to model in eval mode."""

        params = self.params.eval.input
        video_shape = processing.get_video_shape(params)
        audio_shape = processing.get_audio_shape(params, REF_FPS, REF_SR)

        # Substitute an all-zeros placeholder for any modality missing from
        # the inputs, so the model always receives a full input dict.
        if FeatureNames.VISION in inputs:
            images = inputs[FeatureNames.VISION]
        else:
            images = tf.zeros([1] + video_shape, dtype=tf.float32)

        if FeatureNames.AUDIO_MEL in inputs or FeatureNames.AUDIO in inputs:
            if params.raw_audio:
                audio = inputs[FeatureNames.AUDIO]
            else:
                audio = inputs[FeatureNames.AUDIO_MEL]
        else:
            audio = tf.zeros([1] + audio_shape, dtype=tf.float32)

        if FeatureNames.TEXT_INDEX in inputs:
            words = inputs[FeatureNames.TEXT_INDEX]
        else:
            words = tf.zeros([1, params.max_num_words], dtype=tf.int32)

        # Fold any leading clip dimensions into the batch dimension.
        audio = tf.reshape(audio, [-1] + audio_shape)
        words = tf.reshape(words, [-1, words.shape.as_list()[-1]])

        labels_onehot = inputs.get(FeatureNames.LABEL_INDEX)

        labels = {'one_hot': labels_onehot}

        inputs = {'video': images, 'audio': audio, 'text': words}

        return inputs, labels
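
The only subtle step above is the flattening: audio and text may arrive with
extra leading clip dimensions, and both are folded into the batch axis before
reaching the model. A minimal, self-contained sketch of that reshape pattern
(all shapes are hypothetical):

    import tensorflow as tf

    # Hypothetical layout: a batch of 2 examples, each with 4 sampled clips
    # of 16 tokens; the model expects a flat [batch * clips, tokens] input.
    words = tf.zeros([2, 4, 16], dtype=tf.int32)

    # Same pattern as prepare_inputs: collapse all leading dimensions into
    # the batch dimension, keeping only the last (token) axis.
    words = tf.reshape(words, [-1, words.shape.as_list()[-1]])

    print(words.shape)  # (8, 16)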
Example #2
    def prepare_eval_inputs(self, inputs):
        """Prepares inputs on device to be fed to model in eval mode."""
        params = self.params.eval.input
        images = inputs[FeatureNames.VISION]
        labels_onehot = inputs[FeatureNames.LABEL_INDEX]

        if params.linearize_vision:
            # Restore the image layout, folding any clip dimension into the
            # batch dimension.
            if params.name in dataloaders.VID_CLS_DS:
                img_shape = processing.get_video_shape(
                    params, is_space_to_depth=params.space_to_depth)
            else:
                img_shape = [1, params.frame_size, params.frame_size, 3]
            images = tf.reshape(images, [-1] + img_shape)

        if params.name in dataloaders.IMG_CLS_DS:
            # Image datasets yield a single frame per example; replicate it
            # along the temporal axis to fill one temporal patch.
            num_replica = self.params.model_config.temporal_patch_size
            images = tf.tile(images, [1, num_replica, 1, 1, 1])

        labels = {'one_hot': labels_onehot}

        inputs = {'images': images}

        return inputs, labels
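
For image datasets the loader yields a single frame per example, and that
frame is tiled along the temporal axis so the video backbone still sees a
full temporal patch. A small runnable sketch of the tiling step, assuming a
hypothetical patch size of 4:

    import tensorflow as tf

    # Hypothetical image-classification batch: 8 examples, one frame each.
    images = tf.zeros([8, 1, 224, 224, 3])

    # As in prepare_eval_inputs: replicate the lone frame along time so it
    # fills one temporal patch (temporal_patch_size assumed to be 4).
    temporal_patch_size = 4
    images = tf.tile(images, [1, temporal_patch_size, 1, 1, 1])

    print(images.shape)  # (8, 4, 224, 224, 3)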
Example #3
    def prepare_inputs(self, inputs):
        """Prepares inputs on device to be fed to model in train mode."""

        params = self.params.train.input
        images = inputs[FeatureNames.VISION]
        if params.linearize_vision:
            # Restore the [T, H, W, C] video layout, folding any clip
            # dimension into the batch dimension.
            vid_shape = processing.get_video_shape(
                params, is_space_to_depth=params.space_to_depth)
            images = tf.reshape(images, [-1] + vid_shape)

        if params.raw_audio:
            audio = inputs[FeatureNames.AUDIO]
        else:
            audio = inputs[FeatureNames.AUDIO_MEL]
        words = inputs[FeatureNames.TEXT_INDEX]
        words = tf.reshape(words, [-1, words.shape.as_list()[-1]])

        # Pass the audio/text masks through as labels for use by the loss.
        audio_mask = inputs[FeatureNames.AUDIO_MASK]
        text_mask = inputs[FeatureNames.TEXT_MASK]

        labels = {
            FeatureNames.AUDIO_MASK: audio_mask,
            FeatureNames.TEXT_MASK: text_mask
        }

        inputs = {'video': images, 'audio': audio, 'text': words}
        return inputs, labels
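
With linearize_vision set, each example arrives as a flat pixel buffer that
may span several clips; the reshape restores the [T, H, W, C] layout while
folding the clip dimension into the batch. A toy sketch with made-up shapes:

    import tensorflow as tf

    # Hypothetical flat batch: 2 examples, each carrying 2 clips of 4 frames
    # at 8x8x3, linearized into a single vector per example.
    images = tf.zeros([2, 2 * 4 * 8 * 8 * 3])

    # Stand-in for the shape processing.get_video_shape would return.
    vid_shape = [4, 8, 8, 3]

    # As in prepare_inputs: restore [T, H, W, C] and fold clips into batch.
    images = tf.reshape(images, [-1] + vid_shape)

    print(images.shape)  # (4, 4, 8, 8, 3)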
Example #4
    def construct_model(self, params):
        """Build models for train/eval."""

        if params.mode == 'train':
            input_params = params.train.input
        else:
            input_params = params.eval.input
        space_to_depth = input_params.space_to_depth

        video_shape = processing.get_video_shape(
            input_params, is_space_to_depth=space_to_depth)
        audio_shape = processing.get_audio_shape(input_params, REF_FPS, REF_SR)
        text_shape = (input_params.max_num_words,)

        inputs = {
            'video': tf.keras.Input(shape=video_shape),
            'audio': tf.keras.Input(shape=audio_shape),
            'text': tf.keras.Input(shape=text_shape),
        }

        model = model_factory.build_model(params.model_config)
        outputs = model(inputs, None)
        keras_model = tf.keras.Model(inputs=inputs, outputs=outputs)
        keras_model.loss_fn = model.loss_fn

        # Restore the word embeddings.
        self.restore_text_embeddings(keras_model, params)

        logging.info('Number of parameters in model: %f M.',
                     keras_model.count_params() / 10.**6)

        learning_rate = schedules.get_learning_rate(
            params.train.optimizer.learning_rate)
        keras_model.optimizer = optimizers.get_optimizer(
            learning_rate, params.train.optimizer)
        return keras_model
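
The construction pattern above (symbolic tf.keras.Input tensors keyed by
modality, an inner model applied to the dict, and a functional tf.keras.Model
wrapping the result) can be reproduced in isolation. A minimal sketch with a
toy stand-in for model_factory.build_model; shapes and layers are invented:

    import tensorflow as tf

    # Symbolic inputs keyed by modality, with made-up shapes.
    inputs = {
        'video': tf.keras.Input(shape=(4, 8, 8, 3)),
        'audio': tf.keras.Input(shape=(16, 64)),
        'text': tf.keras.Input(shape=(12,)),
    }

    # Toy stand-in for the real model: pool each modality, concatenate,
    # and classify.
    features = tf.keras.layers.Concatenate()([
        tf.keras.layers.GlobalAveragePooling3D()(inputs['video']),
        tf.keras.layers.GlobalAveragePooling1D()(inputs['audio']),
        tf.keras.layers.Flatten()(inputs['text']),
    ])
    outputs = tf.keras.layers.Dense(10)(features)

    keras_model = tf.keras.Model(inputs=inputs, outputs=outputs)
    print('Number of parameters in model: %f M.'
          % (keras_model.count_params() / 10.**6))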
Example #5
    def construct_model(self, params):
        """Build models for train/eval."""

        num_test_samples = 1
        if params.mode == 'train':
            input_params = params.train.input
        elif params.mode == 'eval':
            input_params = params.eval.input
        else:
            raise ValueError('Invalid mode!')

        ds_name = input_params.name
        is_vid_cls = ds_name in dataloaders.VID_CLS_DS
        is_img_cls = ds_name in dataloaders.IMG_CLS_DS
        is_aud_cls = ds_name in dataloaders.AUD_CLS_DS

        if params.mode == 'eval' and not is_img_cls:
            # Evaluate over several temporal windows; multi-crop adds three
            # crops per window for visual inputs.
            num_test_samples = input_params.num_windows_test
            if input_params.multi_crop and not is_aud_cls:
                num_test_samples *= 3

        if is_aud_cls:
            input_shape = processing.get_audio_shape(input_params, REF_FPS,
                                                     REF_SR)
        elif is_vid_cls:
            input_shape = processing.get_video_shape(
                input_params, is_space_to_depth=input_params.space_to_depth)
        elif is_img_cls:
            input_shape = processing.get_video_shape(input_params)
        else:
            raise ValueError('Unsupported dataset: %s' % ds_name)

        if is_img_cls:
            # The single input frame is tiled along time at data-prep time,
            # so the model expects a full temporal patch.
            input_shape[0] = params.model_config.temporal_patch_size

        num_classes = dataloaders.CLS_DS[ds_name]['num_classes']

        model_kwargs = {
            'num_classes': num_classes,
            'num_test_samples': num_test_samples
        }
        if is_aud_cls:
            inputs = {'audio': tf.keras.Input(shape=input_shape)}
            model_factory = aud_factory
        else:
            inputs = {'images': tf.keras.Input(shape=input_shape)}
            model_factory = vid_factory

        model = model_factory.build_model(params=params.model_config,
                                          override_params=model_kwargs,
                                          mode='predict')
        outputs = model(inputs, None)
        keras_model = tf.keras.Model(inputs=inputs, outputs=outputs)
        keras_model.loss_fn = model.loss_fn

        if params.mode == 'train':
            self.partial_restore(params, keras_model)

        logging.info('Number of parameters in model: %f M.',
                     keras_model.count_params() / 10.**6)

        learning_rate = schedules.get_learning_rate(
            params.train.optimizer.learning_rate)
        keras_model.optimizer = optimizers.get_optimizer(
            learning_rate, params.train.optimizer)
        return keras_model
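
The eval-time bookkeeping for num_test_samples is the easiest part to get
wrong: non-image datasets are evaluated over several temporal windows, and
multi-crop triples the count for visual inputs (one sample per crop). A
hypothetical standalone version of that logic, with invented argument names:

    def num_test_samples(mode, is_img_cls, is_aud_cls, num_windows_test,
                         multi_crop):
        """Mirrors the sample-count logic in construct_model above."""
        samples = 1
        if mode == 'eval' and not is_img_cls:
            samples = num_windows_test
            if multi_crop and not is_aud_cls:
                samples *= 3  # three crops per temporal window
        return samples

    # A video dataset evaluated with 4 windows and multi-crop enabled:
    assert num_test_samples('eval', False, False, 4, True) == 12
    # Audio datasets never use spatial crops:
    assert num_test_samples('eval', False, True, 4, True) == 4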