we use a pre-trained [DenseNet121](http://openaccess.thecvf.com/content_cvpr_2017/papers/Huang_Densely_Connected_Convolutional_CVPR_2017_paper.pdf) for feature extraction.
* We directly pad shorter videos to length `MAX_SEQ_LENGTH`.

First, let's load up the [DataFrames](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html).
"""

# Paths are relative: both CSV files are read from the current working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

# One shared layer instance, reused by every call to `crop_center`.
center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)


def crop_center(frame):
    """Center-crop a single frame with `center_crop_layer`.

    A leading axis is added before the layer is applied, the result is
    converted with `.numpy()`, and singleton axes are removed again.

    Args:
        frame: A single frame; presumably a (height, width, channels)
            array -- TODO confirm against the caller.

    Returns:
        The cropped frame after `.numpy().squeeze()`.
    """
    # `frame[None, ...]` prepends a length-1 axis so the layer sees a batch.
    cropped = center_crop_layer(frame[None, ...])
    # NOTE(review): `squeeze()` without an axis drops *every* length-1 axis,
    # not only the batch axis added above -- confirm this is intended.
    cropped = cropped.numpy().squeeze()
    return cropped


# Following method is modified from this tutorial:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
def load_video(path, max_frames=0):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
The idea of FixRes is to first train a model on a smaller resolution dataset and then fine-tune it on a larger resolution dataset. This simple yet effective recipe leads to non-trivial performance improvements. Please refer to the [original paper](https://arxiv.org/abs/1906.06423) for results.
"""

# Reference: https://github.com/facebookresearch/FixRes/blob/main/transforms_v2.py.

batch_size = 128
auto = tf.data.AUTOTUNE  # short alias for tf.data.AUTOTUNE
smaller_size = 128  # the smaller of the two image resolutions
bigger_size = 224  # the larger of the two image resolutions

# Scales `bigger_size` by the ratio of the two resolutions and truncates to an
# int: int((224 / 128) * 224) == 392 with the values above.
size_for_resizing = int((bigger_size / smaller_size) * bigger_size)
# Square center crop of side `bigger_size`.
central_crop_layer = layers.CenterCrop(bigger_size, bigger_size)


def preprocess_initial(train, image_size):
    """Initial preprocessing function for training on smaller resolution.

    For training, do random_horizontal_flip -> random_crop.
    For validation, just resize.
    No color-jittering has been used.
    """

    def _pp(image, label, train):
        if train:
            channels = image.shape[-1]
            begin, size, _ = tf.image.sample_distorted_bounding_box(
                tf.shape(image),
                tf.zeros([0, 0, 4], tf.float32),