def load_i3d_model(num_classes, eval_type='rgb', scope='RGB',
                   spatial_squeeze=True, final_endpoint='Logits'):
    with tf.variable_scope('RGB'):
        rgb_model = i3d.InceptionI3d(
            400, spatial_squeeze=spatial_squeeze, final_endpoint=final_endpoint)
        adversarial_inputs_rgb = tf.nn.tanh(rgb_input + adv_flag * (mask_rgb * eps_rgb))
        # adversarial_inputs_rgb = tf.nn.tanh(rgb_input + adv_flag * (eps_rgb))
        rgb_logits, _ = rgb_model(
            adversarial_inputs_rgb, is_training=False, dropout_keep_prob=1.0)
        rgb_logits_dropout = tf.nn.dropout(rgb_logits, 1)
        rgb_logits = tf.layers.dense(
            rgb_logits_dropout, NUM_CLASSES, use_bias=True)

    with tf.variable_scope(scope):
        i3d_model = i3d.InceptionI3d(
            num_classes, spatial_squeeze=spatial_squeeze,
            final_endpoint=final_endpoint)
        dummy_input = tf.placeholder(
            tf.float32,
            shape=(None, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 3))
        i3d_model(dummy_input, is_training=False, dropout_keep_prob=1.0)
    return i3d_model

def init_model(self):
    _IMAGE_SIZE = 224
    _SAMPLE_VIDEO_FRAMES = 79
    NUM_CLASSES = 101
    classes = open('classes.txt').read().splitlines()

    # RGB SETUP
    self.rgb_input = tf.placeholder(
        tf.float32, shape=(1, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 3))
    with tf.variable_scope('RGB'):
        rgb_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True,
                                     final_endpoint='Logits')
        self.rgb_logits, _ = rgb_model(self.rgb_input, is_training=False,
                                       dropout_keep_prob=1.0)
    rgb_variable_map = {}
    for variable in tf.global_variables():
        if variable.name.split('/')[0] == 'RGB':
            rgb_variable_map[variable.name.replace(':0', '').replace(
                'Conv3d', 'Conv2d').replace('conv_3d/w', 'weights').replace(
                'conv_3d/b', 'biases').replace('RGB/inception_i3d', 'InceptionV1').replace(
                'batch_norm', 'BatchNorm')] = variable
    self.rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)

    # FLOW SETUP
    self.flow_input = tf.placeholder(
        tf.float32, shape=(1, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 2))
    with tf.variable_scope('Flow'):
        flow_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True,
                                      final_endpoint='Logits')
        self.flow_logits, _ = flow_model(self.flow_input, is_training=False,
                                         dropout_keep_prob=1.0)
    flow_variable_map = {}
    for variable in tf.global_variables():
        if variable.name.split('/')[0] == 'Flow':
            flow_variable_map[variable.name.replace(':0', '').replace(
                'Conv3d', 'Conv2d').replace('conv_3d/w', 'weights').replace(
                'conv_3d/b', 'biases').replace('Flow/inception_i3d', 'InceptionV1').replace(
                'batch_norm', 'BatchNorm')] = variable
    self.flow_saver = tf.train.Saver(var_list=flow_variable_map, reshape=True)

    self.rgb_predictions = tf.nn.softmax(self.rgb_logits)
    self.flow_predictions = tf.nn.softmax(self.flow_logits)

def train():
    file_names, labels = read_csv(csv_path)
    print(f"read video {train_path + file_names[0]}")
    file_names = list(map(lambda filename: train_path + filename, file_names))
    print(f"============================= fileNames: {file_names[0]} =========")

    dataset = build_dataset(file_names, labels)
    iterator = dataset.make_one_shot_iterator()
    X, Y = iterator.get_next()
    print(f"X: {X}, Y: {Y}")

    model = i3d.InceptionI3d(num_classes=num_classes, final_endpoint='Logits')
    logits, _ = model(X, is_training=True)

    learning_rate = 0.01
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(n_epochs):
            for i in range(len(file_names)):
                _, loss = sess.run([optimizer, cost])
        print("optimization finished")

def get_preds_tensor(input_mode='rgb', n_frames=16, batch_size=10):
    """Returns the predictions tensor, input placeholder and saver object.

    :param input_mode: One of 'rgb', 'flow', 'two_stream'
    """
    if input_mode == 'rgb':
        rgb_variable_map = {}
        input_fr_rgb = tf.placeholder(
            tf.float32, shape=[batch_size, n_frames, _IMAGE_SIZE, _IMAGE_SIZE, 3],
            name="Input_Video_Placeholder")
        with tf.variable_scope('RGB'):
            # Building I3D for RGB-only input
            rgb_model = i3d.InceptionI3d(_NUM_CLASSES, spatial_squeeze=True,
                                         final_endpoint='Logits')
            rgb_logits, _ = rgb_model(input_fr_rgb, is_training=False,
                                      dropout_keep_prob=1.0)
        print(len(tf.global_variables()))
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'RGB':
                rgb_variable_map[variable.name.replace(':0', '')] = variable
        print(len(rgb_variable_map))
        rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)
        model_predictions = tf.nn.softmax(rgb_logits)
        top_classes = tf.argmax(model_predictions, axis=1)
        return top_classes, model_predictions, input_fr_rgb, rgb_saver

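# --- Usage sketch (added): how the tensors returned by get_preds_tensor above
# might be wired up for inference. This is a minimal sketch; it assumes the
# module above provides get_preds_tensor, _IMAGE_SIZE (224) and _NUM_CLASSES,
# and that a matching I3D RGB checkpoint exists locally. The checkpoint path
# below is hypothetical, not taken from the snippet.
import numpy as np
import tensorflow as tf

top_classes, model_predictions, rgb_input, rgb_saver = get_preds_tensor(
    input_mode='rgb', n_frames=16, batch_size=1)
with tf.Session() as sess:
    rgb_saver.restore(sess, 'checkpoints/rgb_imagenet/model.ckpt')  # hypothetical path
    dummy_clip = np.zeros((1, 16, 224, 224, 3), dtype=np.float32)
    cls, probs = sess.run([top_classes, model_predictions],
                          feed_dict={rgb_input: dummy_clip})
    print(cls, probs.max())
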
def model_visual_features(rgb_array):
    i3d_model = i3d.InceptionI3d(num_classes=_NUM_CLASSES,
                                 final_endpoint='Predictions')
    inp = tf.placeholder(tf.float32,
                         [None, _FRAMES, _IMAGE_SIZE[0], _IMAGE_SIZE[1], 3])
    predictions, end_points = i3d_model(inp, is_training=True,
                                        dropout_keep_prob=0.5)
    init_op = tf.global_variables_initializer()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # sample_input = np.zeros((5, 64, _IMAGE_SIZE[0], _IMAGE_SIZE[1], 3))
    sample_input = rgb_array
    with tf.Session(config=config) as sess:
        sess.run(init_op)
        out_predictions, out_logits = sess.run(
            [predictions, end_points['Logits']], {inp: sample_input})
    tf.reset_default_graph()
    return out_logits

def inference(rgb_inputs, flow_inputs):
    with tf.variable_scope('RGB'):
        rgb_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True,
                                     final_endpoint='Logits')
        rgb_logits, _ = rgb_model(rgb_inputs, is_training=True,
                                  dropout_keep_prob=DROPOUT_KEEP_PROB)
    with tf.variable_scope('Flow'):
        flow_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True,
                                      final_endpoint='Logits')
        flow_logits, _ = flow_model(flow_inputs, is_training=True,
                                    dropout_keep_prob=DROPOUT_KEEP_PROB)
    return rgb_logits, flow_logits

def testInitErrors(self):
    # Invalid `final_endpoint` string.
    with self.assertRaises(ValueError):
        _ = i3d.InceptionI3d(num_classes=_NUM_CLASSES,
                             final_endpoint='Conv3d_1a_8x8')

    # Dropout keep probability must be in (0, 1].
    i3d_model = i3d.InceptionI3d(num_classes=_NUM_CLASSES)
    inp = tf.placeholder(tf.float32, [None, 64, _IMAGE_SIZE, _IMAGE_SIZE, 3])
    with self.assertRaises(ValueError):
        _, _ = i3d_model(inp, is_training=False, dropout_keep_prob=0)

    # Height and width dimensions of the input should be _IMAGE_SIZE.
    i3d_model = i3d.InceptionI3d(num_classes=_NUM_CLASSES)
    inp = tf.placeholder(tf.float32, [None, 64, 10, 10, 3])
    with self.assertRaises(ValueError):
        _, _ = i3d_model(inp, is_training=False, dropout_keep_prob=0.5)

def get_model(streamType, numSeg):
    if streamType == 'rgb':
        # RGB input has 3 channels.
        rgb_input = tf.placeholder(
            tf.float32, shape=(numSeg, SAMPLE_VIDEO_FRAMES, IMAGE_SIZE, IMAGE_SIZE, 3))
        with tf.variable_scope('RGB'):
            rgb_model = i3d.InceptionI3d(
                NUM_CLASSES, spatial_squeeze=True, final_endpoint='Logits')
            rgb_logits, _ = rgb_model(
                rgb_input, is_training=False, dropout_keep_prob=1.0)
        rgb_variable_map = {}
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'RGB':
                rgb_variable_map[variable.name.replace(':0', '')] = variable
        saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)
    elif streamType == 'flow':
        # Flow input has only 2 channels.
        flow_input = tf.placeholder(
            tf.float32, shape=(numSeg, SAMPLE_VIDEO_FRAMES, IMAGE_SIZE, IMAGE_SIZE, 2))
        with tf.variable_scope('Flow'):
            flow_model = i3d.InceptionI3d(
                NUM_CLASSES, spatial_squeeze=True, final_endpoint='Logits')
            flow_logits, _ = flow_model(
                flow_input, is_training=False, dropout_keep_prob=1.0)
        flow_variable_map = {}
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'Flow':
                flow_variable_map[variable.name.replace(':0', '')] = variable
        saver = tf.train.Saver(var_list=flow_variable_map, reshape=True)

    if streamType == 'rgb':
        model_logits = rgb_logits
        inputs = rgb_input
    elif streamType == 'flow':
        model_logits = flow_logits
        inputs = flow_input
    return saver, inputs, model_logits

def load_i3d_model(num_classes, eval_type='rgb', scope='RGB',
                   spatial_squeeze=True, final_endpoint='Logits'):
    with tf.variable_scope(scope):
        i3d_model = i3d.InceptionI3d(
            num_classes, spatial_squeeze=spatial_squeeze,
            final_endpoint=final_endpoint)
        dummy_input = tf.placeholder(
            tf.float32,
            shape=(None, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 3))
        i3d_model(dummy_input, is_training=False, dropout_keep_prob=1.0)
    return i3d_model

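# --- Restore sketch (added): a minimal, hedged example of connecting the model
# returned by load_i3d_model above to pretrained weights, following the
# variable-map + tf.train.Saver pattern used throughout this file. The class
# count and checkpoint path are assumptions, not taken from the snippet.
import tensorflow as tf

i3d_model = load_i3d_model(num_classes=400, scope='RGB')
rgb_variable_map = {
    v.name.replace(':0', ''): v
    for v in tf.global_variables()
    if v.name.split('/')[0] == 'RGB'
}
saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)
with tf.Session() as sess:
    saver.restore(sess, 'checkpoints/rgb_imagenet/model.ckpt')  # hypothetical path
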
def short_video_extraction(vname, n_frames, frame_path, feat_path):
    batch_frames = 64
    # loading net
    rgb_input = tf.placeholder(tf.float32, shape=(1, batch_frames, 224, 224, 3))
    with tf.variable_scope('RGB'):
        net = i3d.InceptionI3d(600, spatial_squeeze=True, final_endpoint='Logits')
        _, end_points = net(rgb_input, is_training=False, dropout_keep_prob=1.0)
    end_feature = end_points['avg_pool3d']
    sess = tf.Session()

    rgb_variable_map = {}
    for variable in tf.global_variables():
        if variable.name.split('/')[0] == 'RGB':
            rgb_variable_map[variable.name.replace(
                ':0', '')[len('RGB/inception_i3d/'):]] = variable
    saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)
    saver.restore(sess, I3D_CKPT_PATH)

    features = []
    for batch_i in range(math.ceil(n_frames / batch_frames)):
        input_blob = []
        for idx in range(batch_frames):
            idx = (batch_i * batch_frames + idx) % n_frames + 1
            image = Image.open(os.path.join(frame_path, '%06d.jpg' % idx))
            image = image.resize((224, 224))
            image = np.array(image, dtype='float32')
            image[:, :, :] -= 127.5
            image[:, :, :] /= 127.5
            input_blob.append(image)
        input_blob = np.array([input_blob], dtype='float32')
        clip_feature = sess.run(end_feature, feed_dict={rgb_input: input_blob})
        clip_feature = np.reshape(clip_feature, (-1, clip_feature.shape[-1]))
        print(batch_i, clip_feature.shape)
        features.append(clip_feature)

    if len(features) > 1:
        features = np.concatenate(features, axis=0)
    else:
        features = features[0]
    features = features[:n_frames // 8]
    print('Saving features for video: %s ...' % vname)
    np.save(feat_path, features)

def i3d_loss(rgb, label, gpu_idx):
    """Builds an I3D model and computes the loss."""
    with tf.variable_scope('RGB'):
        rgb_model = i3d.InceptionI3d(
            config.num_classes, spatial_squeeze=True, final_endpoint='Logits',
            freeze_before_logits=FLAGS.freeze_up_to_logits)
        rgb_logits = rgb_model(rgb, is_training=True, dropout_keep_prob=1.0)[0]
    rgb_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=label, logits=rgb_logits, name='cross_entropy_rgb')
    loss_rgb = tf.reduce_mean(rgb_loss, name='rgb_ce')
    summary_loss = tf.summary.scalar('rgb_loss_%d' % gpu_idx, loss_rgb)
    return loss_rgb, summary_loss

def get_model_loss(input_x, input_y, train_flag, dropout_flag, scope,
                   reuse=None, num_class=NUM_CLASS, top_k=TOP_K):
    with tf.variable_scope('RGB', reuse=reuse):
        model = i3d.InceptionI3d(num_classes=400, spatial_squeeze=True,
                                 final_endpoint='Logits')
        logits, _ = model(inputs=input_x, is_training=train_flag,
                          dropout_keep_prob=dropout_flag)
    logits_dropout = tf.nn.dropout(logits, keep_prob=dropout_flag)
    out = tf.layers.dense(logits_dropout, num_class, activation=None, use_bias=True)
    is_in_top_K = tf.cast(
        tf.nn.in_top_k(predictions=out, targets=input_y, k=top_k), tf.float32)

    # prepare l2 loss
    regularization_loss = 0.
    for var in tf.global_variables():
        var_name_type = var.name.split('/')[-1][:-2]
        if var_name_type == 'w' or var_name_type == 'kernel':
            regularization_loss += tf.nn.l2_loss(var)
    regularization_loss = tf.identity(regularization_loss,
                                      name='regularization_loss')

    loss_cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=input_y, logits=out, name='cross_entropy'))
    total_loss = tf.add(loss_cross_entropy, L2_PARAM * regularization_loss,
                        name='total_loss')
    tf.summary.scalar('{}/loss_ratio'.format(scope),
                      regularization_loss / loss_cross_entropy)
    return total_loss, loss_cross_entropy, is_in_top_K

def get_preds_loss(ground_truth, input_mode='rgb', n_frames=16,
                   num_classes=_NUM_CLASSES, batch_size=10,
                   dropout_keep_prob=0.6):
    """Returns the predictions tensor, loss, input placeholder and saver object.

    :param ground_truth: Tensor to hold ground truth
    :param input_mode: One of 'rgb', 'flow', 'two_stream'
    """
    rgb_variable_map = {}
    input_fr_rgb = tf.placeholder(
        tf.float32, shape=[batch_size, n_frames, _IMAGE_SIZE, _IMAGE_SIZE, 3],
        name='Input_Video_Placeholder')
    with tf.variable_scope('RGB'):
        # Building I3D for RGB-only input
        rgb_model = i3d.InceptionI3d(spatial_squeeze=True,
                                     final_endpoint='Mixed_5c')
        rgb_mixed_5c, _ = rgb_model(input_fr_rgb, is_training=False,
                                    dropout_keep_prob=1.0)
    with tf.variable_scope('Logits_Mice'):
        net = tf.nn.avg_pool3d(rgb_mixed_5c, ksize=[1, 2, 7, 7, 1],
                               strides=[1, 1, 1, 1, 1], padding='VALID')
        net = tf.nn.dropout(net, dropout_keep_prob)
        logits = conv3d(name='Logits', input=net,
                        shape=[1, 1, 1, 1024, num_classes])
        logits = tf.squeeze(logits, [2, 3], name='SpatialSqueeze')
        averaged_logits = tf.reduce_mean(logits, axis=1)
    for variable in tf.global_variables():
        if variable.name.split('/')[0] == 'RGB' and 'Logits' not in variable.name:
            rgb_variable_map[variable.name.replace(':0', '')] = variable
    rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)
    model_predictions = tf.nn.softmax(averaged_logits)
    top_classes = tf.argmax(model_predictions, axis=1)
    loss = get_loss(model_predictions, ground_truth)
    return model_predictions, loss, top_classes, input_fr_rgb, rgb_saver

def testModelShapesWithSqueeze(self):
    """Test shapes after running some fake data through the model."""
    i3d_model = i3d.InceptionI3d(num_classes=_NUM_CLASSES,
                                 final_endpoint='Predictions')
    inp = tf.placeholder(tf.float32, [None, 64, _IMAGE_SIZE, _IMAGE_SIZE, 3])
    predictions, end_points = i3d_model(inp, is_training=True,
                                        dropout_keep_prob=0.5)
    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        sample_input = np.zeros((5, 64, _IMAGE_SIZE, _IMAGE_SIZE, 3))
        out_predictions, out_logits = sess.run(
            [predictions, end_points['Logits']], {inp: sample_input})
        self.assertEqual(out_predictions.shape, (5, _NUM_CLASSES))
        self.assertEqual(out_logits.shape, (5, _NUM_CLASSES))

def _build_stream(stream_name, is_training):
    dims = 3 if stream_name == 'RGB' else 2
    input_shape = (_BATCH_SIZE, NUM_FRAMES, IMAGE_SIZE, IMAGE_SIZE, dims)
    inp = tf.placeholder(tf.float32, shape=input_shape)
    with tf.variable_scope(stream_name):
        model = i3d.InceptionI3d(spatial_squeeze=True, final_endpoint='Mixed_5c')
        # No training here: don't backpropagate through the main model, and no dropout.
        mixed_5c, _ = model(inp, is_training=False, dropout_keep_prob=1.0)
    var_map = {}
    for var in tf.global_variables():
        if var.name.split('/')[0] == stream_name:
            var_map[var.name.replace(':0', '')] = var
    saver = tf.train.Saver(var_list=var_map, reshape=True)

    with tf.variable_scope(stream_name):
        net = tf.nn.avg_pool3d(mixed_5c, ksize=[1, 2, 7, 7, 1],
                               strides=[1, 1, 1, 1, 1], padding=snt.VALID)
        logit_fn = i3d.Unit3D(output_channels=NUM_CLASSES,
                              kernel_shape=[1, 1, 1],
                              activation_fn=None,
                              use_batch_norm=False,
                              use_bias=True,
                              regularizers={'w': tf.nn.l2_loss},
                              name='Conv3d_0c_1x1')
        logits = logit_fn(net, is_training=is_training)
        logits = tf.squeeze(logits, [2, 3], name='SpatialSqueeze')
        logits = tf.reduce_mean(logits, axis=1)

    custom_vars = {}
    for var in tf.global_variables():
        name = var.name.replace(':0', '')
        if var.name.split('/')[0] == stream_name and name not in var_map.keys():
            custom_vars[name] = var
            with tf.name_scope(name):
                _variable_summaries(var)
    return (inp, logits, saver, custom_vars)

def load_model_and_test(train_generator, validation_generator, msasl_classes,
                        rgb_input):
    '''
    Restores the RGB checkpoint and evaluates both the train_generator's data
    and the validation set.
    '''
    with tf.compat.v1.variable_scope('RGB'):
        rgb_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True,
                                     final_endpoint='Logits')
        rgb_logits, _ = rgb_model(rgb_input, is_training=False,
                                  dropout_keep_prob=DROPOUT_KEEP_PROB)

    # The variable map is used to tell the saver which layers' weights to restore
    # (the weights of the layers are all stored in tf variables).
    rgb_variable_map = {}
    for variable in tf.compat.v1.global_variables():
        if variable.name.split('/')[0] == 'RGB':
            rgb_variable_map[variable.name.replace(':0', '')] = variable
            # rgb_variable_map[variable.name.replace(':0', '')[len('RGB/inception_i3d'):]] = variable

    # We remove the logits layers from the variable map. We don't want to include
    # these weights as we have a different number of classes.
    # layers = rgb_variable_map.keys()
    # layers_to_not_load = [layer for layer in layers if 'Logits' in layer]
    # unloaded_layers_to_init = {}
    # for layer in layers_to_not_load:
    #     unloaded_layers_to_init[layer] = rgb_variable_map.pop(layer)

    rgb_saver = tf.compat.v1.train.Saver(var_list=rgb_variable_map, reshape=True)

    with tf.compat.v1.Session() as sess:
        rgb_saver.restore(sess, WEIGHTS_PATH)
        tf.compat.v1.logging.info('RGB checkpoint restored')
        validate(sess, train_generator, rgb_model, rgb_input, 'Train',
                 msasl_classes)
        validate(sess, validation_generator, rgb_model, rgb_input, 'Validation',
                 msasl_classes)

def testModelShapesWithoutSqueeze(self):
    """Test that turning off `spatial_squeeze` changes the output shape.

    Also try setting different values for `dropout_keep_prob` and snt.BatchNorm
    `is_training`.
    """
    i3d_model = i3d.InceptionI3d(num_classes=_NUM_CLASSES,
                                 spatial_squeeze=False,
                                 final_endpoint='Predictions')
    inp = tf.placeholder(tf.float32, [None, 64, _IMAGE_SIZE, _IMAGE_SIZE, 3])
    predictions, end_points = i3d_model(inp, is_training=False,
                                        dropout_keep_prob=1.0)
    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        sample_input = np.zeros((5, 64, _IMAGE_SIZE, _IMAGE_SIZE, 3))
        out_predictions, out_logits = sess.run(
            [predictions, end_points['Logits']], {inp: sample_input})
        self.assertEqual(out_predictions.shape, (5, 1, 1, _NUM_CLASSES))
        self.assertEqual(out_logits.shape, (5, 1, 1, _NUM_CLASSES))

def main(dataset, mode, split):
    assert mode in ['rgb', 'flow', 'mixed']
    log_dir = os.path.join(_LOG_ROOT, 'ensemble-%s-%s-%d' % (dataset, mode, split))
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    logging.basicConfig(level=logging.INFO,
                        filename=os.path.join(log_dir, 'log-%s-%d' % (mode, split) + '.txt'),
                        filemode='w', format='%(message)s')

    label_map = get_label_map(os.path.join('./data', dataset, 'label_map.txt'))
    _, test_info_rgb, class_num, _ = load_info(
        dataset, root=_DATA_ROOT[dataset], mode='rgb', split=split)
    _, test_info_flow, _, _ = load_info(
        dataset, root=_DATA_ROOT[dataset], mode='flow', split=split)

    label_holder = tf.placeholder(tf.int32, [None])
    if mode in ['rgb', 'mixed']:
        rgb_data = ActionDataset(dataset, class_num, test_info_rgb,
                                 'frame{:06d}{:s}.jpg', mode='rgb')
        rgb_holder = tf.placeholder(
            tf.float32, [None, None, _FRAME_SIZE, _FRAME_SIZE, _CHANNEL['rgb']])
        info_rgb, _ = rgb_data.gen_test_list()
    if mode in ['flow', 'mixed']:
        flow_data = ActionDataset(dataset, class_num, test_info_flow,
                                  'frame{:06d}{:s}.jpg', mode='flow')
        flow_holder = tf.placeholder(
            tf.float32, [None, None, _FRAME_SIZE, _FRAME_SIZE, _CHANNEL['flow']])
        info_flow, _ = flow_data.gen_test_list(mode='flow')
    # print(info_rgb)

    # insert the model
    if mode in ['rgb', 'mixed']:
        with tf.variable_scope(_SCOPE['rgb']):
            rgb_model = i3d.InceptionI3d(400, spatial_squeeze=True,
                                         final_endpoint='Logits')
            rgb_logits, _ = rgb_model(rgb_holder, is_training=False,
                                      dropout_keep_prob=1)
            rgb_logits_dropout = tf.nn.dropout(rgb_logits, 1)
            rgb_fc_out = tf.layers.dense(rgb_logits_dropout, _CLASS_NUM[dataset],
                                         tf.nn.relu, use_bias=True)
            rgb_top_1_op = tf.nn.in_top_k(rgb_fc_out, label_holder, 1)
    if mode in ['flow', 'mixed']:
        with tf.variable_scope(_SCOPE['flow']):
            flow_model = i3d.InceptionI3d(400, spatial_squeeze=True,
                                          final_endpoint='Logits')
            flow_logits, _ = flow_model(flow_holder, is_training=False,
                                        dropout_keep_prob=1)
            flow_logits_dropout = tf.nn.dropout(flow_logits, 1)
            flow_fc_out = tf.layers.dense(flow_logits_dropout, _CLASS_NUM[dataset],
                                          use_bias=True)
            flow_top_1_op = tf.nn.in_top_k(flow_fc_out, label_holder, 1)

    # construct two separate variable maps and savers (rgb_saver, flow_saver)
    variable_map = {}
    if mode in ['rgb', 'mixed']:
        for variable in tf.global_variables():
            tmp = variable.name.split('/')
            if tmp[0] == _SCOPE['rgb']:
                variable_map[variable.name.replace(':0', '')] = variable
        rgb_saver = tf.train.Saver(var_list=variable_map)
    variable_map = {}
    if mode in ['flow', 'mixed']:
        for variable in tf.global_variables():
            tmp = variable.name.split('/')
            if tmp[0] == _SCOPE['flow']:
                variable_map[variable.name.replace(':0', '')] = variable
        flow_saver = tf.train.Saver(var_list=variable_map, reshape=True)

    # Edited version by AlexHu
    if mode == 'rgb':
        fc_out = rgb_fc_out
        softmax = tf.nn.softmax(fc_out)
    if mode == 'flow':
        fc_out = flow_fc_out
        softmax = tf.nn.softmax(fc_out)
    if mode == 'mixed':
        fc_out = _MIX_WEIGHT_OF_RGB * rgb_fc_out + _MIX_WEIGHT_OF_FLOW * flow_fc_out
        softmax = tf.nn.softmax(fc_out)
    top_k_op = tf.nn.in_top_k(softmax, label_holder, 1)

    # GPU config
    # config = tf.ConfigProto()
    # config.gpu_options.per_process_gpu_memory_fraction = 0.6
    # sess = tf.Session(config=config)

    # start a new session and restore the fine-tuned model
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    if mode in ['rgb', 'mixed']:
        rgb_saver.restore(sess, _CHECKPOINT_PATHS_RGB[int(split) - 1])
    if mode in ['flow', 'mixed']:
        flow_saver.restore(sess, _CHECKPOINT_PATHS_FLOW[int(split) - 1])

    if mode in ['rgb', 'mixed']:
        # Start queue
        rgb_queue = FeedQueue(queue_size=_QUEUE_SIZE)
        rgb_queue.start_queue(rgb_data.get_video, args=info_rgb,
                              process_num=_QUEUE_PROCESS_NUM)
    if mode in ['flow', 'mixed']:
        flow_queue = FeedQueue(queue_size=_QUEUE_SIZE)
        flow_queue.start_queue(flow_data.get_video, args=info_flow,
                               process_num=_QUEUE_PROCESS_NUM)

    # Here we start the test procedure
    print('----Here we start!----')
    print('Output writes to ' + log_dir)
    true_count = 0
    if mode in ['rgb', 'mixed']:
        video_size = len(info_rgb)
    if mode in ['flow', 'mixed']:
        video_size = len(info_flow)
    error_record = open(os.path.join(log_dir, 'error_record_' + mode + '.txt'), 'w')
    rgb_fc_data = np.zeros((video_size, _CLASS_NUM[dataset]))
    flow_fc_data = np.zeros((video_size, _CLASS_NUM[dataset]))
    label_data = np.zeros((video_size, 1))

    # just load 1 video for test; this place needs to be improved
    y_pred = []
    y_true = []
    for i in range(video_size):
        # print(i)
        if mode in ['rgb', 'mixed']:
            rgb_clip, label, info = rgb_queue.feed_me()
            rgb_clip = rgb_clip / 255
            input_rgb = rgb_clip[np.newaxis, :, :, :, :]
            video_name = rgb_data.videos[info[0]].name
        if mode in ['flow', 'mixed']:
            flow_clip, label, info = flow_queue.feed_me()
            flow_clip = 2 * (flow_clip / 255) - 1
            input_flow = flow_clip[np.newaxis, :, :, :, :]
            video_name = flow_data.videos[info[0]].name
        input_label = np.array([label]).reshape(-1)
        # print('input_rgb.shape:', input_rgb.shape)
        # print('input_flow.shape:', input_flow.shape)
        # print('input_label.shape:', input_label.shape)

        # Extract features from rgb and flow
        if mode in ['rgb']:
            top_1, predictions, curr_rgb_fc_data = sess.run(
                [top_k_op, fc_out, rgb_fc_out],
                feed_dict={rgb_holder: input_rgb, label_holder: input_label})
        if mode in ['flow']:
            top_1, predictions, curr_flow_fc_data = sess.run(
                [top_k_op, fc_out, flow_fc_out],
                feed_dict={flow_holder: input_flow, label_holder: input_label})
        if mode in ['mixed']:
            top_1, predictions, curr_rgb_fc_data, curr_flow_fc_data = sess.run(
                [top_k_op, fc_out, rgb_fc_out, flow_fc_out],
                feed_dict={rgb_holder: input_rgb, flow_holder: input_flow,
                           label_holder: input_label})
        if mode in ['rgb', 'mixed']:
            rgb_fc_data[i, :] = curr_rgb_fc_data
        if mode in ['flow', 'mixed']:
            flow_fc_data[i, :] = curr_flow_fc_data
        label_data[i, :] = label

        tmp = np.sum(top_1)
        true_count += tmp
        print('Video %d - frame %d-%d: %d, accuracy: %.4f (%d/%d) , name: %s' %
              (info[0], info[2], info[2] + info[1], tmp, true_count / video_size,
               true_count, video_size, video_name))
        logging.info('Video%d-frame%d-%d: %d, accuracy: %.4f (%d/%d) , name:%s' %
                     (info[0], info[2], info[2] + info[1], tmp,
                      true_count / video_size, true_count, video_size, video_name))
        # self_added
        # print(predictions[0, np.argmax(predictions, axis=1)[0]])
        # print(trans_label(np.argmax(predictions, axis=1)[0], label_map))
        # print(np.argmax(label))
        # print(trans_label(np.argmax(label), label_map))
        y_true.append(trans_label(np.int64(input_label[0]), label_map))
        answer = np.argmax(predictions, axis=1)[0]
        y_pred.append(trans_label(answer, label_map))
        if tmp == 0:
            wrong_answer = np.argmax(predictions, axis=1)[0]
            # print(label_map[wrong_answer])
            # Attention: the graph outputs are converted into numpy arrays
            print('---->answer: %s, probability: %.2f' %
                  (trans_label(wrong_answer, label_map), predictions[0, wrong_answer]))
            # print(predictions)
            logging.info('---->answer: %s, probability: %.2f' %
                         (trans_label(wrong_answer, label_map),
                          predictions[0, wrong_answer]))
            error_record.write(
                'video: %s, frame: %d-%d, answer: %s, true: %s, '
                'probability: answer-%.2f true-%.2f\n' %
                (video_name, info[2], info[2] + info[1],
                 trans_label(wrong_answer, label_map),
                 trans_label(np.int64(input_label[0]), label_map),
                 predictions[0, wrong_answer],
                 predictions[0, np.int64(input_label[0])]))

    error_record.close()
    accuracy = true_count / video_size
    print('test accuracy: %.4f' % accuracy)
    logging.info('test accuracy: %.4f' % accuracy)
    if mode in ['rgb', 'mixed']:
        np.save(os.path.join(log_dir, 'obj_{}_rgb_fc_{}.npy').format(dataset, accuracy),
                rgb_fc_data)
    if mode in ['flow', 'mixed']:
        np.save(os.path.join(log_dir, 'obj_{}_flow_fc_{}.npy').format(dataset, accuracy),
                flow_fc_data)
    np.save(os.path.join(log_dir, 'obj_{}_label.npy').format(dataset), label_data)

    if mode in ['rgb', 'mixed']:
        rgb_queue.close_queue()
    if mode in ['flow', 'mixed']:
        flow_queue.close_queue()
    sess.close()

    # print(y_pred)
    # print(y_true)
    cf_matrix = confusion_matrix(y_true, y_pred, labels=label_map)
    print(cf_matrix)
    np.save(os.path.join(log_dir, 'cf_matrix_{}.npy'.format(mode)), cf_matrix)

def main():
    # manually shuffle the train txt file because tf.data.shuffle is soooo slow!
    shuffle_and_overwrite(os.path.join(TXT_DATA_DIR, TRAIN_FILE))

    # dataset loading using the tf.data module
    with tf.device('/cpu:0'):
        train_dataset = tf.data.Dataset.from_tensor_slices(
            [os.path.join(TXT_DATA_DIR, TRAIN_FILE)])
        train_dataset = train_dataset.apply(
            tf.contrib.data.parallel_interleave(
                lambda x: tf.data.TextLineDataset(x).map(
                    lambda x: tf.string_split([x], delimiter=' ').values),
                cycle_length=NUM_THREADS, block_length=1))
        train_dataset = train_dataset.apply(
            tf.contrib.data.map_and_batch(
                lambda x: tuple(tf.py_func(get_data_func_train, [x, IMAGE_SIZE],
                                           [tf.float32, tf.int64])),
                batch_size=BATCH_SIZE, num_parallel_batches=NUM_THREADS))
        train_dataset = train_dataset.prefetch(PREFETCH_BUFFER)

        val_dataset = tf.data.Dataset.from_tensor_slices(
            [os.path.join(TXT_DATA_DIR, VAL_FILE)])
        val_dataset = val_dataset.shuffle(VAL_LEN)
        val_dataset = val_dataset.apply(
            tf.contrib.data.parallel_interleave(
                lambda x: tf.data.TextLineDataset(x).map(
                    lambda x: tf.string_split([x], delimiter=' ').values),
                cycle_length=NUM_THREADS, block_length=1))
        val_dataset = val_dataset.apply(
            tf.contrib.data.map_and_batch(
                lambda x: tuple(tf.py_func(get_data_func_val, [x, IMAGE_SIZE],
                                           [tf.float32, tf.int64])),
                batch_size=BATCH_SIZE, num_parallel_batches=NUM_THREADS))
        val_dataset = val_dataset.prefetch(PREFETCH_BUFFER)

        train_iterator = train_dataset.make_initializable_iterator()
        val_iterator = val_dataset.make_initializable_iterator()
        train_handle = train_iterator.string_handle()
        val_handle = val_iterator.string_handle()

        handle_flag = tf.placeholder(tf.string, [], name='iterator_handle_flag')
        dataset_iterator = tf.data.Iterator.from_string_handle(
            handle_flag, train_dataset.output_types, train_dataset.output_shapes)
        batch_vid, batch_label = dataset_iterator.get_next()
        batch_vid.set_shape([None, None, IMAGE_SIZE, IMAGE_SIZE, 3])

    train_flag = tf.placeholder(dtype=tf.bool, name='train_flag')
    dropout_flag = tf.placeholder(dtype=tf.float32, name='dropout_flag')

    # define model here
    with tf.variable_scope('RGB'):
        model = i3d.InceptionI3d(num_classes=400, spatial_squeeze=True,
                                 final_endpoint='Logits')
        logits, _ = model(inputs=batch_vid, is_training=train_flag,
                          dropout_keep_prob=dropout_flag)
    logits_dropout = tf.nn.dropout(logits, keep_prob=dropout_flag)
    out = tf.layers.dense(logits_dropout, NUM_CLASS, activation=None, use_bias=True)
    is_in_top_K = tf.nn.in_top_k(predictions=out, targets=batch_label, k=TOP_K)

    # maintain a variable map to restore from the ckpt
    variable_map = {}
    for var in tf.global_variables():
        var_name_split = var.name.split('/')
        if var_name_split[1] == 'inception_i3d' and 'dense' not in var_name_split[1]:
            variable_map[var.name[:-2]] = var
        if var_name_split[-1][:-2] == 'w' or var_name_split[-1][:-2] == 'kernel':
            tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                                 tf.nn.l2_loss(var))
    # optional: print to check the variable names
    # pprint(variable_map)

    regularization_loss = tf.losses.get_regularization_loss(
        name='regularization_loss')  # sum of l2 losses
    loss_cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=batch_label, logits=out, name='cross_entropy'))
    total_loss = tf.add(loss_cross_entropy, L2_PARAM * regularization_loss,
                        name='total_loss')

    tf.summary.scalar('batch_statistics/total_loss', total_loss)
    tf.summary.scalar('batch_statistics/cross_entropy_loss', loss_cross_entropy)
    tf.summary.scalar('batch_statistics/l2_loss', regularization_loss)
    tf.summary.scalar('batch_statistics/loss_ratio',
                      regularization_loss / loss_cross_entropy)

    saver_to_restore = tf.train.Saver(var_list=variable_map, reshape=True)
    batch_num = int(TRAIN_LEN / BATCH_SIZE)
    global_step = tf.Variable(GLOBAL_STEP_INIT, trainable=False,
                              collections=[tf.GraphKeys.LOCAL_VARIABLES])
    learning_rate = config_learning_rate(global_step, batch_num)
    tf.summary.scalar('learning_rate', learning_rate)

    # set dependencies for BN ops
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = config_optimizer(OPTIMIZER, learning_rate, OPT_EPSILON)
        train_op = optimizer.minimize(total_loss, global_step=global_step)

    # NOTE: if you don't want to save the params of the optimizer into the
    # checkpoint, you can place this line before the `update_ops` line
    saver_to_save = tf.train.Saver(max_to_keep=40)

    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(),
                  tf.local_variables_initializer()])
        train_handle_value, val_handle_value = sess.run([train_handle, val_handle])
        sess.run(train_iterator.initializer)
        saver_to_restore.restore(sess, CHECKPOINT_PATH)
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(
            os.path.join(TENSORBOARD_LOG_DIR, 'train'), sess.graph)

        sys.stdout.write('\n----------- start to train -----------\n')
        intermediate_train_info = [0., 0.]
        for epoch in range(EPOCH_NUM):
            epoch_acc, epoch_loss = 0., 0.
            pbar = tqdm(total=batch_num, desc='Epoch {}'.format(epoch),
                        unit=' batch (batch_size: {})'.format(BATCH_SIZE))
            for i in range(batch_num):
                _, _loss_cross_entropy, _is_in_top_K, summary, _global_step, lr = sess.run(
                    [train_op, loss_cross_entropy, is_in_top_K, merged,
                     global_step, learning_rate],
                    feed_dict={train_flag: True,
                               dropout_flag: DROPOUT_KEEP_PRAM,
                               handle_flag: train_handle_value})
                train_writer.add_summary(summary, global_step=_global_step)
                intermediate_train_info[0] += np.sum(_is_in_top_K)
                intermediate_train_info[1] += _loss_cross_entropy
                epoch_acc += np.sum(_is_in_top_K)
                epoch_loss += _loss_cross_entropy

                # intermediate evaluation on the training dataset
                if _global_step % SHOW_TRAIN_INFO_FREQ == 0:
                    intermediate_train_acc = float(intermediate_train_info[0]) / (
                        SHOW_TRAIN_INFO_FREQ * BATCH_SIZE)
                    intermediate_train_loss = (
                        intermediate_train_info[1] / SHOW_TRAIN_INFO_FREQ)
                    step_log_info = ('Epoch:{}, global_step:{}, step_train_acc:{:.4f}, '
                                     'step_train_loss:{:4f}, lr:{:.7g}'.format(
                                         epoch, _global_step, intermediate_train_acc,
                                         intermediate_train_loss, lr))
                    sys.stdout.write('\n' + step_log_info + '\n')
                    sys.stdout.flush()
                    logging.info(step_log_info)
                    train_writer.add_summary(
                        make_summary('accumulated_statistics/train_acc',
                                     intermediate_train_acc),
                        global_step=_global_step)
                    train_writer.add_summary(
                        make_summary('accumulated_statistics/train_loss',
                                     intermediate_train_loss),
                        global_step=_global_step)
                    intermediate_train_info = [0., 0.]

                # save a checkpoint
                if _global_step % SAVE_FREQ == 0:
                    if intermediate_train_acc >= 0.8:
                        saver_to_save.save(
                            sess, SAVE_DIR + '/model_step_{}_lr_{:.7g}'.format(
                                _global_step, lr))
                pbar.update(1)
            pbar.close()

            # start to validate on the validation dataset
            sess.run(val_iterator.initializer)
            iter_num = int(np.ceil(float(VAL_LEN) / BATCH_SIZE))
            correct_cnt, loss_cnt = 0, 0
            pbar = tqdm(total=iter_num, desc='EVAL train_epoch:{}'.format(epoch),
                        unit=' batch(batch_size={})'.format(BATCH_SIZE))
            for _ in range(iter_num):
                _is_in_top_K, _loss_cross_entropy = sess.run(
                    [is_in_top_K, loss_cross_entropy],
                    feed_dict={handle_flag: val_handle_value,
                               train_flag: False,
                               dropout_flag: 1.0})
                correct_cnt += np.sum(_is_in_top_K)
                loss_cnt += _loss_cross_entropy
                pbar.update(1)
            pbar.close()
            val_acc = float(correct_cnt) / VAL_LEN
            val_loss = float(loss_cnt) / iter_num
            log_info = ('==>> Epoch:{}, global_step:{}, val_acc:{:.4f}, '
                        'val_loss:{:4f}, lr:{:.7g}'.format(
                            epoch, _global_step, val_acc, val_loss, lr))
            logging.info(log_info)
            sys.stdout.write('\n' + log_info + '\n')
            sys.stdout.flush()

            # manually shuffle the data with python for better performance
            shuffle_and_overwrite(os.path.join(TXT_DATA_DIR, TRAIN_FILE))
            sess.run(train_iterator.initializer)

            epoch_acc = float(epoch_acc) / TRAIN_LEN
            epoch_loss = float(epoch_loss) / batch_num
            log_info = ('==========Epoch:{}, whole_train_acc:{:.4f}, '
                        'whole_train_loss:{:4f}, lr:{:.7g}=========='.format(
                            epoch, epoch_acc, epoch_loss, lr))
            logging.info(log_info)
            sys.stdout.write('\n' + log_info + '\n')
            sys.stdout.flush()
        train_writer.close()

def feature_extractor():
    # loading net
    net = i3d.InceptionI3d(400, spatial_squeeze=True, final_endpoint='Logits')
    rgb_input = tf.placeholder(
        tf.float32,
        shape=(batch_size, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 3))
    _, end_points = net(rgb_input, is_training=False, dropout_keep_prob=1.0)
    end_feature = end_points['avg_pool3d']
    sess = tf.Session()

    rgb_variable_map = {}
    for variable in tf.global_variables():
        rgb_variable_map[variable.name.replace(
            ':0', '')[len('inception_i3d/'):]] = variable
    saver = tf.train.Saver(var_list=rgb_variable_map)
    saver.restore(sess, _CHECKPOINT_PATHS['rgb_imagenet'])

    video_list = open(VIDEO_PATH_FILE).readlines()
    video_list = [name.strip() for name in video_list]
    print('video_list', video_list)

    if not os.path.isdir(OUTPUT_FEAT_DIR):
        os.mkdir(OUTPUT_FEAT_DIR)

    print('Total number of videos: %d' % len(video_list))

    for cnt, video_name in enumerate(video_list):
        video_path = os.path.join(VIDEO_DIR, video_name)
        feat_path = os.path.join(OUTPUT_FEAT_DIR, video_name + '.npy')

        if os.path.exists(feat_path):
            print('Feature file for video %s already exists.' % video_name)
            continue

        print('video_path', video_path)
        n_frame = len([ff for ff in os.listdir(video_path) if ff.endswith('.jpg')])
        print('Total frames: %d' % n_frame)

        features = []
        n_feat = int(n_frame // 8)
        n_batch = n_feat // batch_size + 1
        print('n_frame: %d; n_feat: %d' % (n_frame, n_feat))
        print('n_batch: %d' % n_batch)

        for i in range(n_batch):
            input_blobs = []
            for j in range(batch_size):
                input_blob = []
                for k in range(L):
                    idx = i * batch_size * L + j * L + k
                    idx = int(idx)
                    idx = idx % n_frame + 1
                    image = Image.open(os.path.join(video_path, '%d.jpg' % idx))
                    image = image.resize((resize_w, resize_h))
                    image = np.array(image, dtype='float32')
                    '''
                    image[:, :, 0] -= 104.
                    image[:, :, 1] -= 117.
                    image[:, :, 2] -= 123.
                    '''
                    image[:, :, :] -= 127.5
                    image[:, :, :] /= 127.5
                    input_blob.append(image)
                input_blob = np.array(input_blob, dtype='float32')
                input_blobs.append(input_blob)
            input_blobs = np.array(input_blobs, dtype='float32')
            clip_feature = sess.run(end_feature, feed_dict={rgb_input: input_blobs})
            clip_feature = np.reshape(clip_feature, (-1, clip_feature.shape[-1]))
            features.append(clip_feature)

        features = np.concatenate(features, axis=0)
        # 16 frames per feature (a 64-frame snippet corresponds to 8 features in I3D)
        features = features[:n_feat:2]

        feat_path = os.path.join(OUTPUT_FEAT_DIR, video_name + '.npy')
        print('Saving features and probs for video: %s ...' % video_name)
        np.save(feat_path, features)
        print('%d: %s has been processed...' % (cnt, video_name))

def main(unused_argv):
    tf.logging.set_verbosity(tf.logging.INFO)
    eval_type = FLAGS.eval_type
    imagenet_pretrained = FLAGS.imagenet_pretrained

    if eval_type not in ['rgb', 'flow', 'joint']:
        raise ValueError('Bad `eval_type`, must be one of rgb, flow, joint')

    kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH)]

    if eval_type in ['rgb', 'joint']:
        # RGB input has 3 channels.
        rgb_input = tf.placeholder(
            tf.float32, shape=(1, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 3))
        with tf.variable_scope('RGB'):
            rgb_model = i3d.InceptionI3d(_NUM_CLASSES, spatial_squeeze=True,
                                         final_endpoint='Logits')
            rgb_logits, _ = rgb_model(rgb_input, is_training=False,
                                      dropout_keep_prob=1.0)
        rgb_variable_map = {}
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'RGB':
                rgb_variable_map[variable.name.replace(':0', '')] = variable
        rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)

    if eval_type in ['flow', 'joint']:
        # Flow input has only 2 channels.
        flow_input = tf.placeholder(
            tf.float32, shape=(1, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 2))
        with tf.variable_scope('Flow'):
            flow_model = i3d.InceptionI3d(_NUM_CLASSES, spatial_squeeze=True,
                                          final_endpoint='Logits')
            flow_logits, _ = flow_model(flow_input, is_training=False,
                                        dropout_keep_prob=1.0)
        flow_variable_map = {}
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'Flow':
                flow_variable_map[variable.name.replace(':0', '')] = variable
        flow_saver = tf.train.Saver(var_list=flow_variable_map, reshape=True)

    if eval_type == 'rgb':
        model_logits = rgb_logits
    elif eval_type == 'flow':
        model_logits = flow_logits
    else:
        model_logits = rgb_logits + flow_logits
    model_predictions = tf.nn.softmax(model_logits)

    with tf.Session() as sess:
        feed_dict = {}
        if eval_type in ['rgb', 'joint']:
            if imagenet_pretrained:
                rgb_saver.restore(sess, _CHECKPOINT_PATHS['rgb_imagenet'])
            else:
                rgb_saver.restore(sess, _CHECKPOINT_PATHS['rgb'])
            tf.logging.info('RGB checkpoint restored')
            rgb_sample = np.load(_SAMPLE_PATHS['rgb'])
            tf.logging.info('RGB data loaded, shape=%s', str(rgb_sample.shape))
            feed_dict[rgb_input] = rgb_sample

        if eval_type in ['flow', 'joint']:
            if imagenet_pretrained:
                flow_saver.restore(sess, _CHECKPOINT_PATHS['flow_imagenet'])
            else:
                flow_saver.restore(sess, _CHECKPOINT_PATHS['flow'])
            tf.logging.info('Flow checkpoint restored')
            flow_sample = np.load(_SAMPLE_PATHS['flow'])
            tf.logging.info('Flow data loaded, shape=%s', str(flow_sample.shape))
            feed_dict[flow_input] = flow_sample

        out_logits, out_predictions = sess.run(
            [model_logits, model_predictions], feed_dict=feed_dict)

        out_logits = out_logits[0]
        out_predictions = out_predictions[0]
        sorted_indices = np.argsort(out_predictions)[::-1]

        print('Norm of logits: %f' % np.linalg.norm(out_logits))
        print('\nTop classes and probabilities')
        for index in sorted_indices[:20]:
            print(out_predictions[index], out_logits[index],
                  kinetics_classes[index])

def run(max_steps=64e3, mode='rgb', root='', split='', batch_size=1, save_dir=''):
    # tf.logging.set_verbosity(tf.logging.INFO)
    eval_type = mode
    imagenet_pretrained = False

    NUM_CLASSES = 400
    if eval_type == 'rgb600':
        NUM_CLASSES = 600

    if eval_type not in ['rgb', 'rgb600', 'flow', 'joint']:
        raise ValueError('Bad `eval_type`, must be one of rgb, rgb600, flow, joint')

    if eval_type == 'rgb600':
        kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH_600)]
    else:
        kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH)]

    if eval_type in ['rgb', 'rgb600', 'joint']:
        # RGB input has 3 channels.
        rgb_input = tf.placeholder(
            tf.float32, shape=(1, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 3))
        with tf.variable_scope('RGB'):
            rgb_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True,
                                         final_endpoint='Mixed_5c')
            rgb_logits, _ = rgb_model(rgb_input, is_training=False,
                                      dropout_keep_prob=1.0)
        rgb_variable_map = {}
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'RGB':
                if eval_type == 'rgb600':
                    rgb_variable_map[variable.name.replace(
                        ':0', '')[len('RGB/inception_i3d/'):]] = variable
                else:
                    rgb_variable_map[variable.name.replace(':0', '')] = variable
        rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)

    if eval_type in ['flow', 'joint']:
        # Flow input has only 2 channels.
        flow_input = tf.placeholder(
            tf.float32, shape=(None, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 2))
        with tf.variable_scope('Flow'):
            flow_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True,
                                          final_endpoint='Mixed_5c')
            flow_logits, _ = flow_model(flow_input, is_training=False,
                                        dropout_keep_prob=1.0)
        flow_variable_map = {}
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'Flow':
                flow_variable_map[variable.name.replace(':0', '')] = variable
        flow_saver = tf.train.Saver(var_list=flow_variable_map, reshape=True)

    if eval_type == 'rgb' or eval_type == 'rgb600':
        model_logits = rgb_logits
    elif eval_type == 'flow':
        model_logits = flow_logits
    else:
        model_logits = rgb_logits + flow_logits
    # model_predictions = tf.nn.softmax(model_logits)

    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    dataset = Dataset(split, 'training', root, mode, test_transforms,
                      save_dir=save_dir)

    with tf.Session() as sess:
        feed_dict = {}
        while True:
            inputs, labels, name = dataset.next_batch()
            if name == '0':
                break
            i = 0
            for input in inputs:
                i += 1
                c, t, h, w = input.shape
                if eval_type in ['rgb', 'rgb600', 'joint']:
                    if imagenet_pretrained:
                        rgb_saver.restore(sess, _CHECKPOINT_PATHS['rgb_imagenet'])
                    else:
                        rgb_saver.restore(sess, _CHECKPOINT_PATHS[eval_type])
                    # tf.logging.info('RGB checkpoint restored')
                    rgb_sample = input[np.newaxis, :]
                    # tf.logging.info('RGB data loaded, shape=%s', str(rgb_sample.shape))
                    feed_dict[rgb_input] = rgb_sample
                if eval_type in ['flow', 'joint']:
                    if imagenet_pretrained:
                        flow_saver.restore(sess, _CHECKPOINT_PATHS['flow_imagenet'])
                    else:
                        flow_saver.restore(sess, _CHECKPOINT_PATHS['flow'])
                    # tf.logging.info('Flow checkpoint restored')
                    flow_sample = input[np.newaxis, :]
                    # tf.logging.info('Flow data loaded, shape=%s', str(flow_sample.shape))
                    feed_dict[flow_input] = flow_sample

                out_logits = sess.run([model_logits], feed_dict=feed_dict)
                out_logits = out_logits[0]
                new_path = os.path.join(save_dir, name, mode)
                if not os.path.exists(new_path):
                    os.makedirs(new_path)
                np.save(os.path.join(new_path, str(i)), out_logits.reshape(1024))

def main(unused_argv):
    tf.logging.set_verbosity(tf.logging.INFO)
    eval_type = FLAGS.eval_type
    imagenet_pretrained = FLAGS.imagenet_pretrained
    final_endpoint = FLAGS.final_endpoint

    NUM_CLASSES = 400
    if eval_type == 'rgb600':
        NUM_CLASSES = 600

    if eval_type not in ['rgb', 'rgb600', 'flow', 'joint']:
        raise ValueError('Bad `eval_type`, must be one of rgb, rgb600, flow, joint')
    if final_endpoint not in ['Mixed_4f', 'Logits', 'Predictions']:
        raise ValueError(
            'Bad `final_endpoint`, must be one of Mixed_4f, Logits, Predictions')

    if eval_type == 'rgb600':
        kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH_600)]
    else:
        kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH)]

    if eval_type in ['rgb', 'rgb600', 'joint']:
        # RGB input has 3 channels.
        rgb_input = tf.placeholder(
            tf.float32, shape=(1, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 3))
        with tf.variable_scope('RGB'):
            rgb_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True,
                                         final_endpoint=final_endpoint)
            rgb_output, _ = rgb_model(rgb_input, is_training=False,
                                      dropout_keep_prob=1.0)
        rgb_variable_map = {}
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'RGB':
                if eval_type == 'rgb600':
                    rgb_variable_map[variable.name.replace(
                        ':0', '')[len('RGB/inception_i3d/'):]] = variable
                else:
                    rgb_variable_map[variable.name.replace(':0', '')] = variable
        rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)

    if eval_type in ['flow', 'joint']:
        # Flow input has only 2 channels.
        flow_input = tf.placeholder(
            tf.float32, shape=(1, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 2))
        with tf.variable_scope('Flow'):
            flow_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True,
                                          final_endpoint=final_endpoint)
            flow_output, _ = flow_model(flow_input, is_training=False,
                                        dropout_keep_prob=1.0)
        flow_variable_map = {}
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'Flow':
                flow_variable_map[variable.name.replace(':0', '')] = variable
        flow_saver = tf.train.Saver(var_list=flow_variable_map, reshape=True)

    if eval_type == 'rgb' or eval_type == 'rgb600':
        model_output = rgb_output
    elif eval_type == 'flow':
        model_output = flow_output
    else:
        model_output = rgb_output + flow_output

    with tf.Session() as sess:
        feed_dict = {}
        if eval_type in ['rgb', 'rgb600', 'joint']:
            if imagenet_pretrained:
                rgb_saver.restore(sess, _CHECKPOINT_PATHS['rgb_imagenet'])
            else:
                rgb_saver.restore(sess, _CHECKPOINT_PATHS[eval_type])
            tf.logging.info('RGB checkpoint restored')
            rgb_sample = np.load(_PATHS['rgb'])
            rgb_sample = rgb_sample[np.newaxis, ...]
            tf.logging.info('RGB data loaded, shape=%s', str(rgb_sample.shape))
            feed_dict[rgb_input] = rgb_sample
        if eval_type in ['flow', 'joint']:
            if imagenet_pretrained:
                flow_saver.restore(sess, _CHECKPOINT_PATHS['flow_imagenet'])
            else:
                flow_saver.restore(sess, _CHECKPOINT_PATHS['flow'])
            tf.logging.info('Flow checkpoint restored')
            flow_sample = np.load(_PATHS['flow'])
            flow_sample = flow_sample[np.newaxis, ...]
            tf.logging.info('Flow data loaded, shape=%s', str(flow_sample.shape))
            feed_dict[flow_input] = flow_sample

        output = sess.run(model_output, feed_dict=feed_dict)
        np.save(_SAVE_PATH, output)

def train_from_kinetics_weights(train_generator, validation_generator,
                                msasl_classes, rgb_input):
    '''
    Trains for EPOCHS on the train_generator's data and tests the validation set.
    '''
    with tf.compat.v1.variable_scope('RGB'):
        rgb_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True,
                                     final_endpoint='Logits')
        rgb_logits, _ = rgb_model(rgb_input, is_training=False,
                                  dropout_keep_prob=DROPOUT_KEEP_PROB)

    # The variable map is used to tell the saver which layers' weights to restore
    # (the weights of the layers are all stored in tf variables).
    rgb_variable_map = {}
    for variable in tf.compat.v1.global_variables():
        if variable.name.split('/')[0] == 'RGB':
            rgb_variable_map[variable.name.replace(':0', '')] = variable
            # rgb_variable_map[variable.name.replace(':0', '')[len('RGB/inception_i3d'):]] = variable

    # We remove the logits layers from the variable map. We don't want to include
    # these weights as we have a different number of classes.
    layers = rgb_variable_map.keys()
    layers_to_not_load = [layer for layer in layers if 'Logits' in layer]
    unloaded_layers_to_init = {}
    for layer in layers_to_not_load:
        unloaded_layers_to_init[layer] = rgb_variable_map.pop(layer)

    rgb_saver = tf.compat.v1.train.Saver(var_list=rgb_variable_map, reshape=True)

    model_logits = rgb_logits
    model_predictions = tf.nn.softmax(model_logits)

    with tf.compat.v1.Session() as sess:
        feed_dict = {}
        # Restore all the layers but the logits from the shared weights.
        rgb_saver.restore(sess, CHECKPOINT_PATHS['rgb'])
        tf.compat.v1.logging.info('RGB checkpoint restored')

        # Initialize the logits (final layer). Not sure exactly how they will be
        # initialized.
        # TODO: Look into how the logits will be initialized. Could try different
        # approaches.
        sess.run(tf.compat.v1.variables_initializer(
            list(unloaded_layers_to_init.values())))

        # Preparing a new saver on all the layers to save weights as we train.
        # TODO: Use this saver to save in training.
        rgb_variable_map.update(unloaded_layers_to_init)
        rgb_saver = tf.compat.v1.train.Saver(var_list=rgb_variable_map,
                                             reshape=True)

        # input and output shapes
        rgb_logits, _ = rgb_model(rgb_input, is_training=True,
                                  dropout_keep_prob=DROPOUT_KEEP_PROB)
        rgb_labels = tf.compat.v1.placeholder(tf.float32, [None, NUM_CLASSES])

        # TODO: Try with a more reasonable learning rate.
        # global_step and decayed_lr can be used to decay the learning rate
        # exponentially.
        global_step = tf.compat.v1.placeholder(tf.int32)
        decayed_lr = tf.compat.v1.train.exponential_decay(
            learning_rate=ADAM_INIT_LR, global_step=global_step,
            decay_steps=5, decay_rate=0.95)

        # Loss and optimizer to use.
        # TODO: Try with different loss functions and optimizers.
        # loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=rgb_logits, labels=rgb_labels)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=rgb_logits,
                                                       labels=rgb_labels))
        # optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)
        optimizer = tf.compat.v1.train.MomentumOptimizer(
            learning_rate=LEARNING_RATE, momentum=MOMENTUM)
        # optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=decayed_lr, epsilon=ADAM_EPS)
        minimize = optimizer.minimize(loss)
        sess.run(tf.compat.v1.variables_initializer(optimizer.variables()))

        # One step (mini-batch) of training.
        def step(samples, labels, i):
            """Performs one optimizer step on a single mini-batch."""
            feed_dict[rgb_input] = samples
            feed_dict[rgb_labels] = labels
            feed_dict[global_step] = i
            result = sess.run([loss, minimize], feed_dict=feed_dict)
            return result

        # TODO: Should this be a running loss and need to be fixed?
        # One epoch of training.
        def epoch(data_generator, i):
            for images, labels in tqdm(data_generator, desc='EPOCH' + str(i)):
                result = step(images, labels, i)
            data_generator.on_epoch_end()
            print("Loss " + str(result[0]))
            return result[0]

        # val_accuracy_prior = 0
        # summary_saver keeps track of epoch_num, loss, training acc., validation acc.
        summary_saver = []
        for i in range(EPOCHS):
            epoch_loss = epoch(train_generator, i)
            summary_saver.append([i + 1, epoch_loss])
            if i % 10 == 0:
                train_accuracy = validate(sess, train_generator, rgb_model,
                                          rgb_input, 'Train')
                summary_saver[i].append(train_accuracy)
            # evaluate the validation set every epoch
            val_accuracy = validate(sess, validation_generator, rgb_model,
                                    rgb_input, 'Validation')
            summary_saver[i].append(val_accuracy)
            rgb_saver.save(sess,
                           NEW_CHECKPOINT_PATHS + NAME + 'acc' + ('%.5f' % val_accuracy),
                           global_step=i)
            if i % 5 == 0:
                print_train_summary(summary_saver)

def main(unused_argv):
    tf.logging.set_verbosity(tf.logging.INFO)
    eval_type = FLAGS.eval_type
    imagenet_pretrained = FLAGS.imagenet_pretrained

    NUM_CLASSES = 400
    if eval_type == 'rgb600':
        NUM_CLASSES = 600

    if eval_type not in ['rgb', 'rgb600', 'flow', 'joint']:
        raise ValueError('Bad `eval_type`, must be one of rgb, rgb600, flow, joint')

    if eval_type == 'rgb600':
        kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH_600)]
    else:
        kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH)]

    if eval_type in ['rgb', 'rgb600', 'joint']:
        # RGB input has 3 channels.
        rgb_input = tf.placeholder(
            tf.float32, shape=(1, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 3))
        with tf.variable_scope('RGB'):
            rgb_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True,
                                         final_endpoint='Logits')
            rgb_logits, _ = rgb_model(rgb_input, is_training=False,
                                      dropout_keep_prob=1.0)
        rgb_variable_map = {}
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'RGB':
                if eval_type == 'rgb600':
                    rgb_variable_map[variable.name.replace(
                        ':0', '')[len('RGB/inception_i3d/'):]] = variable
                else:
                    rgb_variable_map[variable.name.replace(':0', '')] = variable
        rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)

    if eval_type in ['flow', 'joint']:
        # Flow input has only 2 channels.
        flow_input = tf.placeholder(
            tf.float32, shape=(1, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 2))
        with tf.variable_scope('Flow'):
            flow_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True,
                                          final_endpoint='Logits')
            flow_logits, _ = flow_model(flow_input, is_training=False,
                                        dropout_keep_prob=1.0)
        flow_variable_map = {}
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'Flow':
                flow_variable_map[variable.name.replace(':0', '')] = variable
        flow_saver = tf.train.Saver(var_list=flow_variable_map, reshape=True)

    if eval_type == 'rgb' or eval_type == 'rgb600':
        model_logits = rgb_logits
    elif eval_type == 'flow':
        model_logits = flow_logits
    else:
        model_logits = rgb_logits + flow_logits
    model_predictions = tf.nn.softmax(model_logits)

    with tf.Session() as sess:
        feed_dict = {}
        if eval_type in ['rgb', 'rgb600', 'joint']:
            if imagenet_pretrained:
                rgb_saver.restore(sess, _CHECKPOINT_PATHS['rgb_imagenet'])
            else:
                rgb_saver.restore(sess, _CHECKPOINT_PATHS[eval_type])
            tf.logging.info('RGB checkpoint restored')
            rgb_sample = np.load(_SAMPLE_PATHS['rgb'])
            tf.logging.info('RGB data loaded, shape=%s', str(rgb_sample.shape))
            feed_dict[rgb_input] = rgb_sample
        if eval_type in ['flow', 'joint']:
            if imagenet_pretrained:
                flow_saver.restore(sess, _CHECKPOINT_PATHS['flow_imagenet'])
            else:
                flow_saver.restore(sess, _CHECKPOINT_PATHS['flow'])
            tf.logging.info('Flow checkpoint restored')
            flow_sample = np.load(_SAMPLE_PATHS['flow'])
            tf.logging.info('Flow data loaded, shape=%s', str(flow_sample.shape))
            feed_dict[flow_input] = flow_sample

        out_logits, out_predictions = sess.run(
            [model_logits, model_predictions], feed_dict=feed_dict)

        out_logits = out_logits[0]
        out_predictions = out_predictions[0]
        sorted_indices = np.argsort(out_predictions)[::-1]

        print('Norm of logits: %f' % np.linalg.norm(out_logits))
        print('\nTop classes and probabilities')
        for index in sorted_indices[:20]:
            print(out_predictions[index], out_logits[index],
                  kinetics_classes[index])

        frozen_model_name = "frozen_model.pb"
        print('\nFreezing and Exporting Model as: %s' % frozen_model_name)
        frozen_graph = tf.graph_util.convert_variables_to_constants(
            sess, tf.get_default_graph().as_graph_def(), ["Softmax"])
        with tf.io.gfile.GFile(frozen_model_name, "wb") as f:
            f.write(frozen_graph.SerializeToString())
        print("%d ops in the final graph." % len(frozen_graph.node))

REGULARIZATION_RATE = 0.0001
TRAINING_STEPS = 97000
MODEL_SAVE_PATH = "/home/yzy_17/workspace/kinetics-i3d-master/data/checkpoints/rgb_imagenet"
MODEL_NAME = "../data/checkpoints/rgb_imagenet/model.ckpt"  # !!myn

with open('ucf_rgb_list.json', 'r') as f:
    video_lists = json.load(f)
n_classes = len(video_lists.keys())

# Import the original model and run the forward pass.
rgb_input = tf.placeholder(tf.float32, shape=(1, None, IMAGE_SIZE, IMAGE_SIZE, 3))
with tf.variable_scope('RGB'):
    rgb_model = i3d.InceptionI3d(n_classes, spatial_squeeze=True,
                                 final_endpoint='Mixed_5c')
    bottleneck, _ = rgb_model(rgb_input, is_training=False, dropout_keep_prob=1.0)

variable_map = {}
for variable in tf.global_variables():
    if variable.name.split('/')[0] == 'RGB':
        variable_map[variable.name.replace(':0', '')] = variable
saver1 = tf.train.Saver(var_list=variable_map, reshape=True)

# 'jieduan' (截断) = truncate: stop gradients from flowing back past this tensor.
with tf.variable_scope("RGB", reuse=True):
    w_jieduan = tf.get_variable(
        'inception_i3d/Mixed_4e/Branch_3/Conv3d_0b_1x1/batch_norm/beta')
output_conv_sg = tf.stop_gradient(w_jieduan)

tf.flags.DEFINE_boolean('imagenet_pretrained', True, '')

tf.logging.set_verbosity(tf.logging.INFO)
eval_type = FLAGS.eval_type
imagenet_pretrained = FLAGS.imagenet_pretrained

# Placeholders for input images and labels.
labels_placeholder = tf.placeholder(tf.float32, [_BATCH_SIZE, NUM_CLASSES])
keep_prob = tf.placeholder(tf.float32)

# Build the proposal network with I3D.
# Proposal RGB input has 3 channels.
proposal_input = tf.placeholder(
    tf.float32,
    shape=(_BATCH_SIZE, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 3))
with tf.variable_scope('RGB'):
    proposal_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True,
                                      final_endpoint='Logits')
    proposal_logits, _ = proposal_model(proposal_input, is_training=True,
                                        dropout_keep_prob=keep_prob)

proposal_variable_map = {}
for variable in tf.global_variables():
    if variable.name.split('/')[0] == 'RGB':
        proposal_variable_map[variable.name.replace(':0', '')] = variable
proposal_saver_savedata = tf.train.Saver(var_list=proposal_variable_map,
                                         reshape=True)

model_logits = proposal_logits

# TensorFlow output definition
model_predictions = tf.nn.softmax(model_logits)
output_class = tf.argmax(model_predictions, 1)

def main(dataset='ucf101', mode='rgb', split=1): assert mode in ['rgb', 'flow'], 'Only RGB data and flow data is supported' log_dir = os.path.join(_LOG_ROOT, 'finetune-%s-%s-%d' % (dataset, mode, split)) if not os.path.exists(log_dir): os.makedirs(log_dir) logging.basicConfig(level=logging.INFO, filename=os.path.join(log_dir, 'log.txt'), filemode='w', format='%(message)s') ## Data Preload ### train_info, test_info = split_data( os.path.join('./data', dataset, mode + '.csv'), os.path.join('./data', dataset, 'testlist%02d' % split + '.txt')) # os.path.join('/data1/yunfeng/i3d_test/data', dataset, mode+'.txt'), # os.path.join('/data1/yunfeng/i3d_test/data', dataset, 'testlist%02d' % split+'.txt')) train_data = Action_Dataset(dataset, mode, train_info) test_data = Action_Dataset(dataset, mode, test_info) num_train_sample = len(train_info) # Every element in train_info is shown as below: # ['v_ApplyEyeMakeup_g08_c01', # '/data4/zhouhao/dataset/ucf101/jpegs_256/v_ApplyEyeMakeup_g08_c01', # '121', '0'] train_info_tensor = tf.constant(train_info) test_info_tensor = tf.constant(test_info) # Dataset building # Phase 1 Trainning # one element in this dataset is (train_info list) train_info_dataset = tf.data.Dataset.from_tensor_slices( (train_info_tensor)) # one element in this dataset is (single image_postprocess, single label) # one element in this dataset is (batch image_postprocess, batch label) train_info_dataset = train_info_dataset.shuffle( buffer_size=num_train_sample) train_dataset = train_info_dataset.map( lambda x: _get_data_label_from_info(x, dataset, mode), num_parallel_calls=_NUM_PARALLEL_CALLS) train_dataset = train_dataset.repeat().batch(_BATCH_SIZE) train_dataset = train_dataset.prefetch(buffer_size=_PREFETCH_BUFFER_SIZE) # Phase 2 Testing # one element in this dataset is (train_info list) test_info_dataset = tf.data.Dataset.from_tensor_slices((test_info_tensor)) # one element in this dataset is (single image_postprocess, single label) test_dataset = test_info_dataset.map( lambda x: _get_data_label_from_info(x, dataset, mode), num_parallel_calls=_NUM_PARALLEL_CALLS) # one element in this dataset is (batch image_postprocess, batch label) test_dataset = test_dataset.batch(1).repeat() test_dataset = test_dataset.prefetch(buffer_size=_PREFETCH_BUFFER_SIZE) # iterator = dataset.make_one_shot_iterator() # clip_holder, label_holder = iterator.get_next() iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes) train_init_op = iterator.make_initializer(train_dataset) test_init_op = iterator.make_initializer(test_dataset) clip_holder, label_holder = iterator.get_next() clip_holder = tf.squeeze(clip_holder, [1]) label_holder = tf.squeeze(label_holder, [1]) clip_holder.set_shape( [None, None, _FRAME_SIZE, _FRAME_SIZE, _CHANNEL[mode]]) dropout_holder = tf.placeholder(tf.float32) is_train_holder = tf.placeholder(tf.bool) # inference module # Inference Module with tf.variable_scope(_SCOPE[train_data.mode]): # insert i3d model model = i3d.InceptionI3d(400, spatial_squeeze=True, final_endpoint='Logits') # the line below outputs the final results with logits # __call__ uses _template, and _template uses _build when defined logits, _ = model(clip_holder, is_training=is_train_holder, dropout_keep_prob=dropout_holder) logits_dropout = tf.nn.dropout(logits, dropout_holder) # To change 400 classes to the ucf101 or hdmb classes fc_out = tf.layers.dense(logits_dropout, _CLASS_NUM[dataset], use_bias=True) # compute the top-k results for the whole batch size 
    is_in_top_1_op = tf.nn.in_top_k(fc_out, label_holder, 1)

    # Loss calculation, including the L2-norm of the weights.
    variable_map = {}
    train_var = []
    for variable in tf.global_variables():
        tmp = variable.name.split('/')
        if tmp[0] == _SCOPE[train_data.mode] and 'dense' not in tmp[1]:
            variable_map[variable.name.replace(':0', '')] = variable
        if tmp[-1] == 'w:0' or tmp[-1] == 'kernel:0':
            weight_l2 = tf.nn.l2_loss(variable)
            tf.add_to_collection('weight_l2', weight_l2)
    loss_weight = tf.add_n(tf.get_collection('weight_l2'), 'loss_weight')
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label_holder,
                                                       logits=fc_out))
    total_loss = loss + _WEIGHT_OF_LOSS_WEIGHT * loss_weight
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('loss_weight', loss_weight)
    tf.summary.scalar('total_loss', total_loss)

    # Import the pre-trained model.
    saver = tf.train.Saver(var_list=variable_map, reshape=True)
    saver2 = tf.train.Saver(max_to_keep=_SAVER_MAX_TO_KEEP)

    # Specific hyperparameters
    # Number of training steps (batches) per epoch.
    per_epoch_step = int(np.ceil(train_data.size / _BATCH_SIZE))
    # Total number of training steps.
    global_step = _GLOBAL_EPOCH * per_epoch_step
    # Global step counter.
    global_index = tf.Variable(0, trainable=False)

    # Set the learning rate schedule by hand; an automatic schedule also works.
    # boundaries = [10000, 20000, 30000, 40000, 50000]
    # values = [_LEARNING_RATE, 0.0008, 0.0005, 0.0003, 0.0001, 5e-5]
    # learning_rate = tf.train.piecewise_constant(
    #     global_index, boundaries, values)
    STEP_SIZE = per_epoch_step * _STEP_SIZE_FACTOR
    learning_rate = clr.clr(_BASE_LR, _MAX_LR, STEP_SIZE, global_index)
    # learning_rate = tf.train.exponential_decay(
    #     learning_rate=0.000001, global_step=global_index,
    #     decay_steps=1, decay_rate=1.001)
    tf.summary.scalar('learning_rate', learning_rate)

    # Optimizer set-up.
    # Batch norm requires running its update ops alongside the train op.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.MomentumOptimizer(learning_rate, _MOMENTUM).minimize(
            total_loss, global_step=global_index)

    sess = tf.Session()
    merged_summary = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(log_dir, sess.graph)
    sess.run(tf.global_variables_initializer())
    sess.run(train_init_op)
    saver.restore(sess, _CHECKPOINT_PATHS[train_data.mode + '_imagenet'])

    print('----Here we start!----')
    print('Output writes to ' + log_dir)
    # logging.info('----Here we start!----')
    step = 0
    true_count = 0   # correct predictions within one epoch
    tmp_count = 0    # correct predictions within the last _OUTPUT_STEP batches
    accuracy_tmp = 0
    epoch_completed = 0
    while step <= global_step:
        step += 1
        # start_time = time.time()
        _, loss_now, loss_plus, is_in_top_1, summary = sess.run(
            [optimizer, total_loss, loss_weight, is_in_top_1_op, merged_summary],
            feed_dict={dropout_holder: _DROPOUT,
                       is_train_holder: True})
        # duration = time.time() - start_time
        tmp = np.sum(is_in_top_1)
        true_count += tmp
        tmp_count += tmp
        train_writer.add_summary(summary, step)
        # Responsible for printing intermediate results.
        '''if step % _OUTPUT_STEP == 0:
            accuracy = tmp_count / (_OUTPUT_STEP * _BATCH_SIZE)
            print('step: %-4d, loss: %-.4f, accuracy: %.3f (%.2f sec/batch)' %
                  (step, loss_now, accuracy, float(duration)))
            logging.info('step: %-4d, loss: %-.4f, accuracy: %.3f (%.2f sec/batch)' %
                         (step, loss_now, accuracy, float(duration)))
            tmp_count = 0'''
        if step % per_epoch_step == 0:
            epoch_completed += 1
            accuracy = true_count / (per_epoch_step * _BATCH_SIZE)
            print('Epoch%d, train accuracy: %.3f' % (epoch_completed, accuracy))
            logging.info('Epoch%d, train accuracy: %.3f' %
                         (epoch_completed, accuracy))
            true_count = 0
            if (step == global_step) or (step % (2 * STEP_SIZE) == 0):
                sess.run(test_init_op)
                true_count = 0
                # Start the test process.
                print(test_data.size)
                for i in range(test_data.size):
                    # print(i, true_count)
                    is_in_top_1 = sess.run(is_in_top_1_op,
                                           feed_dict={dropout_holder: 1,
                                                      is_train_holder: False})
                    true_count += np.sum(is_in_top_1)
                accuracy = true_count / test_data.size
                true_count = 0
                # Ensure every test procedure uses the same test size.
                test_data.index_in_epoch = 0
                print('Epoch%d, test accuracy: %.3f' % (epoch_completed, accuracy))
                logging.info('Epoch%d, test accuracy: %.3f' %
                             (epoch_completed, accuracy))
                # Save the parameters after each evaluation on the test set.
                if epoch_completed > 0:
                    saver2.save(sess,
                                os.path.join(log_dir,
                                             test_data.name + '_' + train_data.mode),
                                epoch_completed)
                sess.run(train_init_op)
    train_writer.close()
    sess.close()
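clr.clr above comes from a helper module that is not shown in this document. Below is a minimal sketch of a triangular cyclical learning rate (Smith, 2017) with the same signature; whether the author's clr module uses exactly this shape is an assumption.

def clr(base_lr, max_lr, step_size, global_step):
    # Hypothetical stand-in for the clr module used above: the learning rate
    # oscillates between base_lr and max_lr, one full cycle every
    # 2 * step_size steps (triangular policy).
    step = tf.cast(global_step, tf.float32)
    step_size = tf.cast(step_size, tf.float32)
    cycle = tf.floor(1.0 + step / (2.0 * step_size))
    x = tf.abs(step / step_size - 2.0 * cycle + 1.0)
    return base_lr + (max_lr - base_lr) * tf.maximum(0.0, 1.0 - x)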
import os
import time

import h5py
import tensorflow as tf

import i3d

_START_TIME = time.time()

log_file = open('preprocess_output_flow.txt', 'w')
flow_data_dir = './flow_data/'
all_video_names = os.listdir(flow_data_dir)
total_vid_cnt = len(all_video_names)

# HDF5 file that will hold one feature array per video.
f = h5py.File('thumos14_i3d_features_flow.hdf5', 'w')

with tf.variable_scope('Flow'):
    # Flow input has 2 channels (x and y displacement).
    flow_input = tf.placeholder(tf.float32, shape=(1, None, 224, 224, 2))
    flow_model = i3d.InceptionI3d(final_endpoint='Avg_pool_3d')
    flow_out, _ = flow_model(flow_input, is_training=False, dropout_keep_prob=1.0)
    flow_out = tf.squeeze(flow_out)
flow_saver = tf.train.Saver(reshape=True)

with tf.Session() as sess:
    flow_saver.restore(sess, './i3d-ucf101-rgb-flow-model/flow.ckpt')
    for vid_index, video_name in enumerate(all_video_names):
        start_time = time.time()
        flow_x_dir = os.path.join(flow_data_dir, video_name, 'flow_x')
        flow_y_dir = os.path.join(flow_data_dir, video_name, 'flow_y')
        flow_x_imgs = os.listdir(flow_x_dir)
        flow_x_imgs = sorted(flow_x_imgs,
                             key=lambda x: int(x.split('.')[0].split('_')[-1]))
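        # --- Hypothetical continuation: the original snippet is truncated above. ---
        # A hedged sketch of how the rest of the loop body might stack the
        # flow_x/flow_y frames into a (1, T, 224, 224, 2) batch in [-1, 1], run the
        # Flow stream, and write the pooled features to the HDF5 file. The use of
        # cv2, the rescaling convention, and the dataset layout are assumptions.
        import cv2          # would normally live at the top of the file
        import numpy as np  # would normally live at the top of the file
        flow_stack = []
        for img_name in flow_x_imgs:
            fx = cv2.imread(os.path.join(flow_x_dir, img_name), cv2.IMREAD_GRAYSCALE)
            fy = cv2.imread(os.path.join(flow_y_dir, img_name), cv2.IMREAD_GRAYSCALE)
            fx = cv2.resize(fx, (224, 224)).astype(np.float32) / 127.5 - 1.0
            fy = cv2.resize(fy, (224, 224)).astype(np.float32) / 127.5 - 1.0
            flow_stack.append(np.stack([fx, fy], axis=-1))
        flow_batch = np.expand_dims(np.stack(flow_stack, axis=0), axis=0)
        features = sess.run(flow_out, feed_dict={flow_input: flow_batch})
        f.create_dataset(video_name, data=features)
        log_file.write('%d/%d %s %.1fs\n' % (vid_index + 1, total_vid_cnt,
                                             video_name, time.time() - start_time))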
def main(unused_argv):
    tf.logging.set_verbosity(tf.logging.INFO)
    eval_type = FLAGS.eval_type
    imagenet_pretrained = FLAGS.imagenet_pretrained

    NUM_CLASSES = 400
    if eval_type == 'rgb600':
        NUM_CLASSES = 600

    if eval_type not in ['rgb', 'rgb600', 'flow', 'joint']:
        raise ValueError('Bad `eval_type`, must be one of rgb, rgb600, flow, joint')

    if eval_type == 'rgb600':
        kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH_600)]
    else:
        kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH)]

    if eval_type in ['rgb', 'rgb600', 'joint']:
        # RGB input has 3 channels.
        rgb_input = tf.placeholder(
            tf.float32,
            shape=(1, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 3))
        with tf.variable_scope('RGB'):
            rgb_model = i3d.InceptionI3d(
                NUM_CLASSES, spatial_squeeze=True, final_endpoint='Logits')
            rgb_logits, _ = rgb_model(
                rgb_input, is_training=False, dropout_keep_prob=1.0)
        rgb_variable_map = {}
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'RGB':
                if eval_type == 'rgb600':
                    rgb_variable_map[variable.name.replace(':0', '')[len('RGB/inception_i3d/'):]] = variable
                else:
                    rgb_variable_map[variable.name.replace(':0', '')] = variable
        rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)

    if eval_type in ['flow', 'joint']:
        # Flow input has only 2 channels.
        flow_input = tf.placeholder(
            tf.float32,
            shape=(1, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 2))
        with tf.variable_scope('Flow'):
            flow_model = i3d.InceptionI3d(
                NUM_CLASSES, spatial_squeeze=True, final_endpoint='Logits')
            flow_logits, _ = flow_model(
                flow_input, is_training=False, dropout_keep_prob=1.0)
        flow_variable_map = {}
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'Flow':
                flow_variable_map[variable.name.replace(':0', '')] = variable
        flow_saver = tf.train.Saver(var_list=flow_variable_map, reshape=True)

    if eval_type == 'rgb' or eval_type == 'rgb600':
        model_logits = rgb_logits
    elif eval_type == 'flow':
        model_logits = flow_logits
    else:
        model_logits = rgb_logits + flow_logits
    model_predictions = tf.nn.softmax(model_logits)

    with tf.Session() as sess:
        feed_dict = {}
        if eval_type in ['rgb', 'rgb600', 'joint']:
            if imagenet_pretrained:
                rgb_saver.restore(sess, _CHECKPOINT_PATHS['rgb_imagenet'])
            else:
                rgb_saver.restore(sess, _CHECKPOINT_PATHS[eval_type])
            tf.logging.info('RGB checkpoint restored')

        sample_pool = os.listdir(_SAMPLE_ROOT_)
        sample_pool.sort()
        results = []
        for vid_name in tqdm(sample_pool):
            rgb_sample = load_data(os.path.join(_SAMPLE_ROOT_, vid_name))
            # tf.logging.info('RGB data loaded, shape=%s', str(rgb_sample.shape))
            feed_dict[rgb_input] = rgb_sample

            # if eval_type in ['flow', 'joint']:
            #     if imagenet_pretrained:
            #         flow_saver.restore(sess, _CHECKPOINT_PATHS['flow_imagenet'])
            #     else:
            #         flow_saver.restore(sess, _CHECKPOINT_PATHS['flow'])
            #     tf.logging.info('Flow checkpoint restored')
            #     flow_sample = np.load(_SAMPLE_PATHS['flow'])
            #     tf.logging.info('Flow data loaded, shape=%s', str(flow_sample.shape))
            #     feed_dict[flow_input] = flow_sample

            out_logits, out_predictions = sess.run(
                [model_logits, model_predictions], feed_dict=feed_dict)
            out_logits = out_logits[0]
            out_predictions = out_predictions[0]
            results.append(out_predictions)

        plot_results(results, ap_num=4, full_range=360, grad=15,
                     variable='ForTest', class_id=260)
        gt_labels = [260] * len(results)
        top1, top5 = top_k_accuracy(results, gt_labels, k=(1, 5))
        print("Top-1 Accuracy = {:.02f}".format(top1 * 100))
        print("Top-5 Accuracy = {:.02f}".format(top5 * 100))
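top_k_accuracy is called above but not defined in this snippet. A minimal NumPy sketch consistent with the call top_k_accuracy(results, gt_labels, k=(1, 5)) is shown below; the author's actual implementation may differ.

import numpy as np

def top_k_accuracy(scores, labels, k=(1, 5)):
    # Hypothetical helper: for each requested k, the fraction of samples whose
    # ground-truth label is among the k highest-scoring classes.
    accuracies = []
    for kk in k:
        hits = 0
        for score, label in zip(scores, labels):
            if label in np.argsort(score)[-kk:]:
                hits += 1
        accuracies.append(hits / len(labels))
    return accuracies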