def optimizing(optimizer, final_loss, clip_gradient_norm, global_step, prefix,
               scope):
  """Builds train ops, optionally accumulating gradients over several batches
  to simulate a larger batch than memory could hold."""
  tvs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
  print("tvs", tvs)

  if FLAGS.accumulate_gradients:
    assert FLAGS.apply_every_n_batches > 0, \
        "apply_every_n_batches should be > 0"
    scale = 1.0 / FLAGS.apply_every_n_batches

    # One non-trainable accumulator per trainable variable.
    accum_vars = [
        tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False)
        for tv in tvs
    ]
    init_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]
    gvs = optimizer.compute_gradients(final_loss, tvs)
    accum_ops = [accum_vars[i].assign_add(gv[0]) for i, gv in enumerate(gvs)]

    if clip_gradient_norm > 0:
      with tf.name_scope('clip_grads'):
        clipped_accum_vars = utils.clip_variable_norms(
            accum_vars, max_norm=clip_gradient_norm, scale=scale)
      apply_op = optimizer.apply_gradients(
          [(clipped_accum_vars[i], gv[1]) for i, gv in enumerate(gvs)],
          global_step=global_step)
    else:
      apply_op = optimizer.apply_gradients(
          [(accum_vars[i] * scale, gv[1]) for i, gv in enumerate(gvs)],
          global_step=global_step)

    tf.get_collection_ref(prefix + "_train/init_ops").extend(init_ops)
    tf.get_collection_ref(prefix + "_train/accum_ops").extend(accum_ops)
    tf.add_to_collection(prefix + "_train/apply_op", apply_op)
  else:
    # The original way: apply the gradients every batch.
    gradients = optimizer.compute_gradients(
        final_loss, colocate_gradients_with_ops=False, var_list=tvs)
    print(gradients)
    if clip_gradient_norm > 0:
      with tf.name_scope('clip_grads'):
        gradients = utils.clip_gradient_norms(gradients, clip_gradient_norm)
    train_op = optimizer.apply_gradients(gradients, global_step=global_step)
    tf.add_to_collection(prefix + "_train/train_op", train_op)
  return None
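# A minimal driver sketch for the accumulate_gradients path above, assuming
# prefix="model", an active tf.Session `sess`, and the collection names used
# in optimizing(); this is illustrative, not part of the original file.
#
#   init_ops = tf.get_collection("model_train/init_ops")
#   accum_ops = tf.get_collection("model_train/accum_ops")
#   apply_op = tf.get_collection("model_train/apply_op")[0]
#
#   sess.run(init_ops)                           # zero the accumulators
#   for _ in range(FLAGS.apply_every_n_batches):
#     sess.run(accum_ops)                        # add one micro-batch's grads
#   sess.run(apply_op)                           # apply the scaled sum once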
def build_graph(reader, model, train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(), batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0, regularization_penalty=1,
                num_readers=1, num_epochs=None):
  """Creates the Tensorflow graph.

  This will only be called once in the life of a training model, because
  after the graph is created the model will be restored from a meta graph
  file rather than being recreated.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
      from BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should
      inherit from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
      compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
      unlimited number of passes.
  """
  global_step = tf.Variable(0, trainable=False, name="global_step")

  local_device_protos = device_lib.list_local_devices()
  gpus = [x.name for x in local_device_protos if x.device_type == 'GPU']
  gpus = gpus[:FLAGS.num_gpu]
  num_gpus = len(gpus)

  if num_gpus > 0:
    logging.info("Using the following GPUs to train: " + str(gpus))
    num_towers = num_gpus
    device_string = '/gpu:%d'
  else:
    logging.info("No GPUs found. Training on CPU.")
    num_towers = 1
    device_string = '/cpu:%d'

  learning_rate = tf.train.exponential_decay(
      base_learning_rate,
      global_step * batch_size * num_towers,
      learning_rate_decay_examples,
      learning_rate_decay,
      staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)

  optimizer = optimizer_class(learning_rate)
  unused_video_id, model_input_raw, labels_batch, num_frames = (
      get_input_data_tensors(reader, train_data_pattern,
                             batch_size=batch_size * num_towers,
                             num_readers=num_readers,
                             num_epochs=num_epochs))
  tf.summary.histogram("model/input_raw", model_input_raw)

  feature_dim = len(model_input_raw.get_shape()) - 1
  model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

  tower_inputs = tf.split(model_input, num_towers)
  tower_labels = tf.split(labels_batch, num_towers)
  tower_num_frames = tf.split(num_frames, num_towers)
  tower_gradients = []
  tower_predictions = []
  tower_label_losses = []
  tower_reg_losses = []
  for i in range(num_towers):
    # For some reason these 'with' statements can't be combined onto the same
    # line. They have to be nested.
    with tf.device(device_string % i):
      with tf.variable_scope("tower", reuse=True if i > 0 else None):
        with slim.arg_scope([slim.model_variable, slim.variable],
                            device="/cpu:0" if num_gpus != 1 else "/gpu:0"):
          result = model.create_model(tower_inputs[i],
                                      num_frames=tower_num_frames[i],
                                      vocab_size=reader.num_classes,
                                      labels=tower_labels[i])
          for variable in slim.get_model_variables():
            tf.summary.histogram(variable.op.name, variable)

          predictions = result["predictions"]
          tower_predictions.append(predictions)

          if "loss" in result.keys():
            label_loss = result["loss"]
          else:
            label_loss = label_loss_fn.calculate_loss(predictions,
                                                      tower_labels[i])

          if "regularization_loss" in result.keys():
            reg_loss = result["regularization_loss"]
          else:
            reg_loss = tf.constant(0.0)
          reg_losses = tf.losses.get_regularization_losses()
          if reg_losses:
            reg_loss += tf.add_n(reg_losses)
          tower_reg_losses.append(reg_loss)

          # Adds update_ops (e.g., moving average updates in batch
          # normalization) as a dependency to the train_op.
          update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
          if "update_ops" in result.keys():
            update_ops += result["update_ops"]
          if update_ops:
            with tf.control_dependencies(update_ops):
              barrier = tf.no_op(name="gradient_barrier")
              with tf.control_dependencies([barrier]):
                label_loss = tf.identity(label_loss)
          tower_label_losses.append(label_loss)

          # Incorporate the L2 weight penalties etc.
          final_loss = regularization_penalty * reg_loss + label_loss
          gradients = optimizer.compute_gradients(
              final_loss, colocate_gradients_with_ops=False)
          tower_gradients.append(gradients)

  label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
  tf.summary.scalar("label_loss", label_loss)
  if regularization_penalty != 0:
    reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
    tf.summary.scalar("reg_loss", reg_loss)
  merged_gradients = utils.combine_gradients(tower_gradients)
  if clip_gradient_norm > 0:
    with tf.name_scope('clip_grads'):
      merged_gradients = utils.clip_gradient_norms(merged_gradients,
                                                   clip_gradient_norm)
  train_op = optimizer.apply_gradients(merged_gradients,
                                       global_step=global_step)

  tf.add_to_collection("global_step", global_step)
  tf.add_to_collection("loss", label_loss)
  tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
  tf.add_to_collection("input_batch_raw", model_input_raw)
  tf.add_to_collection("input_batch", model_input)
  tf.add_to_collection("num_frames", num_frames)
  tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
  tf.add_to_collection("train_op", train_op)
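# For reference, a minimal sketch of what utils.clip_gradient_norms is
# assumed to do here: clip each gradient's L2 norm independently over a
# list of (gradient, variable) pairs. The real helper lives in utils.py and
# may differ in detail.
def clip_gradient_norms_sketch(gradients_to_variables, max_norm):
  clipped = []
  for grad, var in gradients_to_variables:
    if grad is not None:
      if isinstance(grad, tf.IndexedSlices):
        # Sparse gradients: clip the values, keep the indices.
        tmp = tf.clip_by_norm(grad.values, max_norm)
        grad = tf.IndexedSlices(tmp, grad.indices, grad.dense_shape)
      else:
        grad = tf.clip_by_norm(grad, max_norm)
    clipped.append((grad, var))
  return clipped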
def build_graph(all_readers, all_train_data_patterns, input_reader,
                input_data_pattern, model,
                label_loss_fn=losses.CrossEntropyLoss(), batch_size=256,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0, regularization_penalty=1,
                num_epochs=None):
  """Creates the Tensorflow graph.

  This will only be called once in the life of a training model, because
  after the graph is created the model will be restored from a meta graph
  file rather than being recreated.

  Args:
    all_readers: The data file readers. Every element in it should inherit
      from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
      from BaseModel.
    all_train_data_patterns: glob paths to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should
      inherit from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
      compared to the label loss.
    num_epochs: How many passes to make over the data. 'None' means an
      unlimited number of passes.
  """
  global_step = tf.Variable(0, trainable=False, name="global_step")
  learning_rate = tf.train.exponential_decay(base_learning_rate,
                                             global_step * batch_size,
                                             learning_rate_decay_examples,
                                             learning_rate_decay,
                                             staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)

  original_input = None
  if input_data_pattern is not None:
    original_video_id, original_input, unused_labels_batch, unused_num_frames = (
        get_input_data_tensors(input_reader, input_data_pattern,
                               batch_size=batch_size,
                               num_epochs=num_epochs))

  optimizer = optimizer_class(learning_rate)

  model_input_raw_tensors = []
  labels_batch_tensor = None
  for reader, data_pattern in zip(all_readers, all_train_data_patterns):
    video_id, model_input_raw, labels_batch, unused_num_frames = (
        get_input_data_tensors(reader, data_pattern,
                               batch_size=batch_size,
                               num_epochs=num_epochs))
    if labels_batch_tensor is None:
      labels_batch_tensor = labels_batch
    model_input_raw_tensors.append(tf.expand_dims(model_input_raw, axis=2))

    if original_input is not None:
      # Sanity check that all readers are aligned on the same videos.
      id_match = tf.ones_like(original_video_id, dtype=tf.float32)
      id_match = id_match * tf.cast(tf.equal(original_video_id, video_id),
                                    dtype=tf.float32)
      tf.summary.scalar("model/id_match", tf.reduce_mean(id_match))

  model_input = tf.concat(model_input_raw_tensors, axis=2)
  labels_batch = labels_batch_tensor
  tf.summary.histogram("model/input", model_input)

  with tf.name_scope("model"):
    if FLAGS.noise_level > 0:
      noise_level_tensor = tf.placeholder_with_default(0.0, shape=[],
                                                       name="noise_level")
    else:
      noise_level_tensor = None

    if FLAGS.dropout:
      keep_prob_tensor = tf.placeholder_with_default(1.0, shape=[],
                                                     name="keep_prob")
      result = model.create_model(model_input,
                                  labels=labels_batch,
                                  vocab_size=reader.num_classes,
                                  original_input=original_input,
                                  dropout=FLAGS.dropout,
                                  keep_prob=keep_prob_tensor,
                                  noise_level=noise_level_tensor)
    else:
      result = model.create_model(model_input,
                                  labels=labels_batch,
                                  vocab_size=reader.num_classes,
                                  original_input=original_input,
                                  noise_level=noise_level_tensor)

    for variable in slim.get_model_variables():
      tf.summary.histogram(variable.op.name, variable)

    predictions = result["predictions"]

    if "loss" in result.keys():
      label_loss = result["loss"]
    else:
      if FLAGS.reweight:
        video_weights_batch = get_video_weights(video_id)
      else:
        video_weights_batch = None

      if FLAGS.multitask:
        print("using multitask loss")
        support_predictions = result["support_predictions"]
        tf.summary.histogram("model/support_predictions", support_predictions)
        print("support_predictions", support_predictions)
        label_loss = label_loss_fn.calculate_loss(
            predictions, support_predictions, labels_batch,
            weights=video_weights_batch)
      else:
        print("using original loss")
        label_loss = label_loss_fn.calculate_loss(
            predictions, labels_batch, weights=video_weights_batch)

    tf.summary.histogram("model/predictions", predictions)
    tf.summary.scalar("label_loss", label_loss)

    if "regularization_loss" in result.keys():
      reg_loss = result["regularization_loss"]
    else:
      reg_loss = tf.constant(0.0)
    reg_losses = tf.losses.get_regularization_losses()
    if reg_losses:
      reg_loss += tf.add_n(reg_losses)
    if regularization_penalty != 0:
      tf.summary.scalar("reg_loss", reg_loss)

    # Adds update_ops (e.g., moving average updates in batch normalization)
    # as a dependency to the train_op.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if "update_ops" in result.keys():
      update_ops += result["update_ops"]
    if update_ops:
      with tf.control_dependencies(update_ops):
        barrier = tf.no_op(name="gradient_barrier")
        with tf.control_dependencies([barrier]):
          label_loss = tf.identity(label_loss)

    # Incorporate the L2 weight penalties etc.
    final_loss = regularization_penalty * reg_loss + label_loss

    if FLAGS.training:
      gradients = optimizer.compute_gradients(
          final_loss, colocate_gradients_with_ops=False)
      if clip_gradient_norm > 0:
        with tf.name_scope('clip_grads'):
          gradients = utils.clip_gradient_norms(gradients,
                                                clip_gradient_norm)
      train_op = optimizer.apply_gradients(gradients,
                                           global_step=global_step)
    else:
      train_op = tf.no_op()

  tf.add_to_collection("global_step", global_step)
  tf.add_to_collection("loss", label_loss)
  tf.add_to_collection("predictions", predictions)
  tf.add_to_collection("input_batch_raw", model_input)
  tf.add_to_collection("input_batch", model_input)
  tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
  tf.add_to_collection("train_op", train_op)
  if FLAGS.dropout:
    tf.add_to_collection("keep_prob", keep_prob_tensor)
  if FLAGS.noise_level > 0:
    tf.add_to_collection("noise_level", noise_level_tensor)
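# Shape check for the multi-model input assembly above: each reader yields
# [batch, num_features]; expand_dims(..., axis=2) makes that
# [batch, num_features, 1], and the concat stacks the models on the last
# axis. A toy numpy illustration (dimensions are made up, not from the
# original file):
#
#   import numpy as np
#   a = np.zeros((4, 1024))            # features from reader 1
#   b = np.zeros((4, 1024))            # features from reader 2
#   stacked = np.concatenate([a[..., None], b[..., None]], axis=2)
#   assert stacked.shape == (4, 1024, 2)   # [batch, num_features, num_models]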
def build_graph(reader, model, train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(), batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                transformer_class=feature_transform.DefaultTransformer,
                augmenter_class=data_augmentation.DefaultAugmenter,
                clip_gradient_norm=1.0, regularization_penalty=1,
                num_readers=1, num_epochs=None):
  """Creates the Tensorflow graph.

  This will only be called once in the life of a training model, because
  after the graph is created the model will be restored from a meta graph
  file rather than being recreated.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
      from BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should
      inherit from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
      compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
      unlimited number of passes.
  """
  global_step = tf.Variable(0, trainable=False, name="global_step")
  learning_rate = tf.train.exponential_decay(base_learning_rate,
                                             global_step * batch_size,
                                             learning_rate_decay_examples,
                                             learning_rate_decay,
                                             staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)

  optimizer = optimizer_class(learning_rate)

  if FLAGS.distillation_features:
    video_id, model_input_raw, labels_batch, num_frames, distill_labels_batch = (
        get_input_data_tensors(reader, train_data_pattern,
                               batch_size=batch_size,
                               num_readers=num_readers,
                               num_epochs=num_epochs))
    if FLAGS.distillation_type == 2:
      p = FLAGS.distillation_percent
      print("distillation_percent =", p, "reforming labels")
      # Rescale the distilled scores so their per-example mass is p times
      # the ground-truth mass, add them to the labels, then clip to [0, 1].
      float_labels = tf.cast(labels_batch, dtype=tf.float32)
      sum_float_labels = tf.reduce_sum(float_labels, axis=1, keep_dims=True)
      sum_distill_labels = tf.reduce_sum(distill_labels_batch, axis=1,
                                         keep_dims=True) + 1e-6
      distill_labels_batch = float_labels + distill_labels_batch * (
          sum_float_labels / sum_distill_labels * p)
      distill_labels_batch = tf.clip_by_value(distill_labels_batch,
                                              clip_value_min=0.0,
                                              clip_value_max=1.0)
  else:
    video_id, model_input_raw, labels_batch, num_frames = (
        get_input_data_tensors(reader, train_data_pattern,
                               batch_size=batch_size,
                               num_readers=num_readers,
                               num_epochs=num_epochs))

  # Data augmentation; will not persist into inference.
  data_augmenter = augmenter_class()
  model_input_raw, labels_batch, num_frames = data_augmenter.augment(
      model_input_raw, num_frames=num_frames, labels_batch=labels_batch)

  tf.summary.histogram("model/input_raw", model_input_raw)

  feature_transformer = transformer_class()
  model_input, num_frames = feature_transformer.transform(
      model_input_raw, num_frames=num_frames)

  tf.summary.histogram("model/input", model_input)

  with tf.name_scope("model"):
    if FLAGS.noise_level > 0:
      noise_level_tensor = tf.placeholder_with_default(0.0, shape=[],
                                                       name="noise_level")
    else:
      noise_level_tensor = None

    if FLAGS.distillation_as_input:
      distillation_predictions = distill_labels_batch
    else:
      distillation_predictions = None

    if FLAGS.dropout:
      keep_prob_tensor = tf.placeholder_with_default(1.0, shape=[],
                                                     name="keep_prob")
      result = model.create_model(
          model_input,
          num_frames=num_frames,
          vocab_size=reader.num_classes,
          labels=labels_batch,
          dropout=FLAGS.dropout,
          keep_prob=keep_prob_tensor,
          distillation_predictions=distillation_predictions,
          noise_level=noise_level_tensor)
    else:
      result = model.create_model(
          model_input,
          num_frames=num_frames,
          vocab_size=reader.num_classes,
          labels=labels_batch,
          distillation_predictions=distillation_predictions,
          noise_level=noise_level_tensor)

    for variable in slim.get_model_variables():
      tf.summary.histogram(variable.op.name, variable)
    print("result", result)

    predictions = result["predictions"]

    if "loss" in result.keys():
      label_loss = result["loss"]
    else:
      video_weights_batch = None
      if FLAGS.reweight:
        video_weights_batch = get_video_weights(video_id)
      if FLAGS.distillation_as_boosting:
        video_weights_batch = get_weights_by_predictions(
            labels_batch, distillation_predictions)

      if FLAGS.multitask:
        support_predictions = result["support_predictions"]
        tf.summary.histogram("model/support_predictions",
                             support_predictions)
        print("support_predictions", support_predictions)
        if FLAGS.distillation_features and FLAGS.distillation_type == 1:
          p = FLAGS.distillation_percent
          print("distillation_percent =", p)
          if p <= 0:
            label_loss = label_loss_fn.calculate_loss(
                predictions, support_predictions, labels_batch,
                weights=video_weights_batch)
          elif p >= 1:
            label_loss = label_loss_fn.calculate_loss(
                predictions, support_predictions, distill_labels_batch,
                weights=video_weights_batch)
          else:
            # Blend the ground-truth and distilled targets.
            label_loss = label_loss_fn.calculate_loss(
                predictions, support_predictions, labels_batch,
                weights=video_weights_batch) * (1.0 - p) \
                + label_loss_fn.calculate_loss(
                    predictions, support_predictions, distill_labels_batch,
                    weights=video_weights_batch) * p
        elif FLAGS.distillation_features and FLAGS.distillation_type == 2:
          print("using pure distillation loss")
          label_loss = label_loss_fn.calculate_loss(
              predictions, support_predictions, distill_labels_batch,
              weights=video_weights_batch)
        else:
          print("using original loss")
          label_loss = label_loss_fn.calculate_loss(
              predictions, support_predictions, labels_batch,
              weights=video_weights_batch)
      else:
        if FLAGS.distillation_features and FLAGS.distillation_type == 1:
          p = FLAGS.distillation_percent
          print("distillation_percent =", p)
          if p <= 0:
            label_loss = label_loss_fn.calculate_loss(
                predictions, labels_batch, weights=video_weights_batch)
          elif p >= 1:
            label_loss = label_loss_fn.calculate_loss(
                predictions, distill_labels_batch,
                weights=video_weights_batch)
          else:
            label_loss = label_loss_fn.calculate_loss(
                predictions, labels_batch,
                weights=video_weights_batch) * (1.0 - p) \
                + label_loss_fn.calculate_loss(
                    predictions, distill_labels_batch,
                    weights=video_weights_batch) * p
        elif FLAGS.distillation_features and FLAGS.distillation_type == 2:
          print("using pure distillation loss")
          label_loss = label_loss_fn.calculate_loss(
              predictions, distill_labels_batch,
              weights=video_weights_batch)
        else:
          print("using original loss")
          label_loss = label_loss_fn.calculate_loss(
              predictions, labels_batch, weights=video_weights_batch)

    tf.summary.histogram("model/predictions", predictions)
    tf.summary.scalar("label_loss", label_loss)

    if "regularization_loss" in result.keys():
      reg_loss = result["regularization_loss"]
    else:
      reg_loss = tf.constant(0.0)
    reg_losses = tf.losses.get_regularization_losses()
    if reg_losses:
      reg_loss += tf.add_n(reg_losses)
    if regularization_penalty != 0:
      tf.summary.scalar("reg_loss", reg_loss)

    # Adds update_ops (e.g., moving average updates in batch normalization)
    # as a dependency to the train_op.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if "update_ops" in result.keys():
      update_ops += result["update_ops"]
    if update_ops:
      with tf.control_dependencies(update_ops):
        barrier = tf.no_op(name="gradient_barrier")
        with tf.control_dependencies([barrier]):
          label_loss = tf.identity(label_loss)

    # Incorporate the L2 weight penalties etc.
    final_loss = regularization_penalty * reg_loss + label_loss
    gradients = optimizer.compute_gradients(
        final_loss, colocate_gradients_with_ops=False)
    if clip_gradient_norm > 0:
      with tf.name_scope('clip_grads'):
        gradients = utils.clip_gradient_norms(gradients, clip_gradient_norm)
    train_op = optimizer.apply_gradients(gradients, global_step=global_step)

  tf.add_to_collection("global_step", global_step)
  tf.add_to_collection("loss", label_loss)
  tf.add_to_collection("predictions", predictions)
  tf.add_to_collection("input_batch_raw", model_input_raw)
  tf.add_to_collection("input_batch", model_input)
  tf.add_to_collection("num_frames", num_frames)
  tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
  tf.add_to_collection("train_op", train_op)
  if FLAGS.dropout:
    tf.add_to_collection("keep_prob", keep_prob_tensor)
  if FLAGS.noise_level > 0:
    tf.add_to_collection("noise_level", noise_level_tensor)
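# A small numpy check of the distillation_type == 2 label reforming above
# (values are illustrative, not from the original file):
#
#   import numpy as np
#   p = 0.5
#   labels = np.array([[1., 0., 1., 0.]])
#   distill = np.array([[0.8, 0.4, 0.6, 0.2]])
#   scale = labels.sum(axis=1, keepdims=True) / (
#       distill.sum(axis=1, keepdims=True) + 1e-6)
#   reformed = np.clip(labels + distill * scale * p, 0.0, 1.0)
#   # labels mass = 2, distill mass = 2 -> scale ~= 1, so after clipping
#   # reformed == [[1., 0.2, 1., 0.1]]: positives stay 1, negatives get a
#   # soft score proportional to the teacher's confidence.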
def build_graph(reader, model, train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(), batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.compat.v1.train.AdamOptimizer,
                clip_gradient_norm=1.0, regularization_penalty=1,
                num_readers=1, num_epochs=None):
  """Creates the Tensorflow graph.

  This will only be called once in the life of a training model, because
  after the graph is created the model will be restored from a meta graph
  file rather than being recreated.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
      from BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should
      inherit from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
      compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
      unlimited number of passes.
  """
  import csv

  import numpy as np

  global_step = tf.Variable(0, trainable=False, name="global_step")

  local_device_protos = device_lib.list_local_devices()
  gpus = [x.name for x in local_device_protos if x.device_type == "GPU"]
  gpus = gpus[:FLAGS.num_gpu]
  num_gpus = len(gpus)

  if num_gpus > 0:
    logging.info("Using the following GPUs to train: " + str(gpus))
    num_towers = num_gpus
    device_string = "/gpu:%d"
  else:
    logging.info("No GPUs found. Training on CPU.")
    num_towers = 1
    device_string = "/cpu:%d"

  learning_rate = tf.train.exponential_decay(
      base_learning_rate,
      global_step * batch_size * num_towers,
      learning_rate_decay_examples,
      learning_rate_decay,
      staircase=True)
  tf.summary.scalar("learning_rate", learning_rate)

  optimizer = optimizer_class(learning_rate)
  input_data_dict = get_input_data_tensors(reader, train_data_pattern,
                                           batch_size=batch_size * num_towers,
                                           num_readers=num_readers,
                                           num_epochs=num_epochs)
  print('input_data_dict', input_data_dict)
  model_input_raw = input_data_dict["video_matrix"]
  labels_batch = input_data_dict["labels"]
  num_frames = input_data_dict["num_frames"]
  print("model_input_shape, ", model_input_raw.shape)
  print("labels_batch, ", labels_batch)

  # Build a per-class weight mask: classes whitelisted for the segment task
  # get weight 5.0, all others weight 1.0. The whitelist is read from a
  # local CSV with one class id per row; non-integer rows are skipped.
  # (Disabled alternatives from development: weighting by class counts from
  # classCount.csv as (15 - log(count))**2, or filtering the batch to
  # whitelisted classes via a matmul-based boolean select.)
  whitelisted_cls_mask = np.zeros((3862,), dtype=np.float32)
  with open('segment_label_ids.csv') as f:
    for line in csv.reader(f):
      try:
        cls_id = int(line[0])
        whitelisted_cls_mask[cls_id] = 1.
      except ValueError:
        # Simply skip the non-integer line.
        continue
  whitelisted_cls_mask = whitelisted_cls_mask * 4 + np.ones(
      (3862,), dtype=np.float32)

  tf.summary.histogram("model/input_raw", model_input_raw)

  feature_dim = len(model_input_raw.get_shape()) - 1
  model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

  tower_inputs = tf.split(model_input, num_towers)
  tower_labels = tf.split(labels_batch, num_towers)
  tower_num_frames = tf.split(num_frames, num_towers)
  tower_gradients = []
  tower_predictions = []
  tower_label_losses = []
  tower_reg_losses = []

  for i in range(num_towers):
    # For some reason these 'with' statements can't be combined onto the same
    # line. They have to be nested.
    with tf.device(device_string % i):
      with tf.variable_scope("tower", reuse=True if i > 0 else None):
        with slim.arg_scope([slim.model_variable, slim.variable],
                            device="/cpu:0" if num_gpus != 1 else "/gpu:0"):
          result = model.create_model(tower_inputs[i],
                                      num_frames=tower_num_frames[i],
                                      vocab_size=reader.num_classes,
                                      labels=tower_labels[i])
          for variable in slim.get_model_variables():
            tf.summary.histogram(variable.op.name, variable)

          predictions = result["predictions"]
          tower_predictions.append(predictions)

          if "loss" in result.keys():
            label_loss = result["loss"]
          else:
            label_loss = label_loss_fn.calculate_loss(
                predictions, tower_labels[i],
                label_weights=whitelisted_cls_mask)

          if "aux_predictions" in result.keys():
            for pred in result["aux_predictions"]:
              label_loss += label_loss_fn.calculate_loss(
                  pred, tower_labels[i],
                  label_weights=whitelisted_cls_mask)

          if "regularization_loss" in result.keys():
            reg_loss = result["regularization_loss"]
          else:
            reg_loss = tf.constant(0.0)
          reg_losses = tf.losses.get_regularization_losses()
          if reg_losses:
            reg_loss += tf.add_n(reg_losses)
          tower_reg_losses.append(reg_loss)

          # Adds update_ops (e.g., moving average updates in batch
          # normalization) as a dependency to the train_op.
          update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
          if "update_ops" in result.keys():
            update_ops += result["update_ops"]
          if update_ops:
            with tf.control_dependencies(update_ops):
              barrier = tf.no_op(name="gradient_barrier")
              with tf.control_dependencies([barrier]):
                label_loss = tf.identity(label_loss)
          tower_label_losses.append(label_loss)

          # Incorporate the L2 weight penalties etc.
          final_loss = regularization_penalty * reg_loss + label_loss
          gradients = optimizer.compute_gradients(
              final_loss, colocate_gradients_with_ops=False)
          tower_gradients.append(gradients)

  label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
  tf.summary.scalar("label_loss", label_loss)
  if regularization_penalty != 0:
    reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
    tf.summary.scalar("reg_loss", reg_loss)
  merged_gradients = utils.combine_gradients(tower_gradients)
  if clip_gradient_norm > 0:
    with tf.name_scope("clip_grads"):
      merged_gradients = utils.clip_gradient_norms(merged_gradients,
                                                   clip_gradient_norm)
  train_op = optimizer.apply_gradients(merged_gradients,
                                       global_step=global_step)

  tf.add_to_collection("global_step", global_step)
  tf.add_to_collection("loss", label_loss)
  tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
  tf.add_to_collection("input_batch_raw", model_input_raw)
  tf.add_to_collection("input_batch", model_input)
  tf.add_to_collection("num_frames", num_frames)
  tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
  tf.add_to_collection("train_op", train_op)
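# Hedged sketch of how the label_weights mask above could enter a weighted
# cross entropy; the actual CrossEntropyLoss.calculate_loss signature lives
# in losses.py and may differ. Each class column is scaled by its weight
# before the reduction, so whitelisted segment classes (weight 5.0) count
# five times as much as non-whitelisted ones (weight 1.0).
def weighted_cross_entropy_sketch(predictions, labels, label_weights,
                                  epsilon=1e-6):
  float_labels = tf.cast(labels, tf.float32)
  ce = float_labels * tf.log(predictions + epsilon) + (
      1.0 - float_labels) * tf.log(1.0 - predictions + epsilon)
  ce = ce * label_weights  # broadcast [num_classes] over the batch
  return tf.reduce_mean(tf.reduce_sum(-ce, axis=1))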
def build_graph(reader, model, train_data_pattern, train_data_pattern2,
                train_data_pattern3, eval_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(), batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0, regularization_penalty=1,
                num_readers=1, num_epochs=None, l2_penalty=1e-8,
                gpu_only=1):
  """Creates the Tensorflow graph.

  This will only be called once in the life of a training model, because
  after the graph is created the model will be restored from a meta graph
  file rather than being recreated.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
      from BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should
      inherit from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
      compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
      unlimited number of passes.
  """
  # Data files.
  files1 = gfile.Glob(train_data_pattern)
  files2 = gfile.Glob(train_data_pattern2)
  files3 = gfile.Glob(train_data_pattern3)
  files = files1 + files2 + files3
  if not files:
    raise IOError("Unable to find training files. data_pattern='" +
                  train_data_pattern + "'.")
  logging.info("Total number of training files: %s + %s + %s = %s.",
               str(len(files1)), str(len(files2)), str(len(files3)),
               str(len(files)))

  files4 = gfile.Glob(eval_data_pattern)
  logging.info("Total number of eval files: %s.", str(len(files4)))

  if FLAGS.fold == -1:
    validate_files = files4
    train_files = files
  else:
    validate_files = files[FLAGS.fold::5]
    train_files = [x for x in files if x not in validate_files]

  logging.info("train files: {}, first is: {}.".format(
      len(train_files), train_files[0].split('/')[-1]))
  logging.info("eval files: {}, first is: {}.".format(
      len(validate_files), validate_files[0].split('/')[-1]))

  # Label weights for the loss function. Ugly hard-coded for now.
  wgts_np = np.ones(FLAGS.truncated_num_classes)
  over_weight_labels = False
  if over_weight_labels:
    labels_to_overwgt = [
        38, 47, 49, 55, 72, 76, 86, 89, 93, 94, 95, 98, 99, 101, 102, 110,
        111, 113, 114, 115, 120, 121
    ]
    wgts_np[labels_to_overwgt] = 2.0
  wgts_4_lossfn = tf.constant(wgts_np, dtype=tf.float32)

  global_step = tf.Variable(0, trainable=False, name="global_step")
  restart_learning_rate = tf.Variable(base_learning_rate,
                                      trainable=False,
                                      name="restart_learning_rate")

  local_device_protos = device_lib.list_local_devices()
  gpus = [x.name for x in local_device_protos if x.device_type == 'GPU']
  num_gpus = len(gpus)
  if num_gpus > 0:
    logging.info("Using the following GPUs to train: " + str(gpus))
    num_towers = num_gpus
    device_string = '/gpu:%d'
  else:
    logging.info("No GPUs found. Training on CPU.")
    num_towers = 1
    device_string = '/cpu:%d'

  learning_rate = tf.train.exponential_decay(
      restart_learning_rate,
      global_step * batch_size * num_towers,
      learning_rate_decay_examples,
      learning_rate_decay,
      staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)

  optimizer = optimizer_class(learning_rate)
  unused_video_id, model_input_raw, labels_batch, num_frames = (
      get_input_data_tensors(reader, train_files,
                             batch_size=batch_size * num_towers,
                             num_readers=num_readers,
                             num_epochs=num_epochs))
  tf.summary.histogram("model/input_raw", model_input_raw)

  # Model params: probabilities for keeping a neuron in a layer, assuming a
  # maximum of 10 layers; defaults below keep everything.
  with tf.variable_scope("tower", reuse=True) as scope:
    layers_keep_probs = tf.Variable(
        [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        trainable=False,
        name="layers_keep_probs")

  model_input = model_input_raw
  if FLAGS.apply_global_normalization:
    g_mean, g_std = model_utils.load_global_moments()
    g_inv_std = 1.0 / g_std
    global_mean = tf.constant(g_mean, dtype=tf.float32)
    # Expand the global mean to match the new dimension; fill the rest with
    # zeros.
    new_dim = tf.cast(model_input_raw.shape[1], tf.int32)
    zero_padding = tf.zeros(new_dim - tf.shape(global_mean), tf.float32)
    global_mean_padded = tf.concat([global_mean, zero_padding], 0)
    # Expand the global inverse std to match the new dimension; fill the
    # rest with ones.
    global_inv_std = tf.constant(g_inv_std, dtype=tf.float32)
    one_padding = tf.ones(new_dim - tf.shape(global_inv_std), tf.float32)
    global_inv_std_padded = tf.concat([global_inv_std, one_padding], 0)
    # Apply normalizations (can do both) if requested.
    # Global normalization:
    model_input = tf.multiply(tf.subtract(model_input, global_mean_padded),
                              global_inv_std_padded)
  # Regular batch L2 normalization:
  if FLAGS.apply_batch_l2_normalization:
    feature_dim = len(model_input.get_shape()) - 1
    model_input = tf.nn.l2_normalize(model_input, feature_dim)

  tower_inputs = tf.split(model_input, num_towers)
  tower_labels = tf.split(labels_batch, num_towers)
  tower_num_frames = tf.split(num_frames, num_towers)
  tower_gradients = []
  tower_predictions = []
  tower_label_losses = []
  tower_reg_losses = []

  # Eval graph, to monitor out-of-sample performance during training.
  e_video_id, e_input_raw, e_labels_batch, e_num_frames = (
      get_input_data_tensors(reader, validate_files,
                             batch_size=batch_size * num_towers,
                             num_readers=num_readers,
                             num_epochs=2 * num_epochs))
  e_input = e_input_raw
  if FLAGS.apply_global_normalization:
    e_input = tf.multiply(tf.subtract(e_input, global_mean_padded),
                          global_inv_std_padded)
  if FLAGS.apply_batch_l2_normalization:
    feature_dim = len(model_input.get_shape()) - 1
    e_input = tf.nn.l2_normalize(e_input, feature_dim)
  e_tower_inputs = tf.split(e_input, num_towers)
  e_tower_labels = tf.split(e_labels_batch, num_towers)
  e_tower_num_frames = tf.split(e_num_frames, num_towers)
  e_tower_predictions = []
  e_tower_layers_keep_probs = tf.Variable(
      [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
      trainable=False,
      name="layers_keep_probs")
  logging.info(e_tower_inputs)
  # End eval inputs.

  for i in range(num_towers):
    # For some reason these 'with' statements can't be combined onto the same
    # line. They have to be nested.
    logging.info('For tower: ' + str(i))
    with tf.device(device_string % i):
      with tf.variable_scope("tower", reuse=True if i > 0 else None):
        with slim.arg_scope([slim.model_variable, slim.variable],
                            device="/cpu:0" if num_gpus != 1 else "/gpu:0"):
          logging.info(layers_keep_probs)
          result = model.create_model(tower_inputs[i],
                                      num_frames=tower_num_frames[i],
                                      vocab_size=reader.num_classes,
                                      labels=tower_labels[i],
                                      layers_keep_probs=layers_keep_probs,
                                      l2_penalty=l2_penalty,
                                      is_training=True)
          for variable in slim.get_model_variables():
            logging.info(variable)
            tf.summary.histogram(variable.op.name, variable)

          # Create shadow moving-average copies of the model variables.
          if FLAGS.use_ema == True:
            model_vars = [x for x in slim.get_model_variables()]
            ema = tf.train.ExponentialMovingAverage(
                decay=1.0 - 1.0 / FLAGS.ema_halflife)
            ema_op = ema.apply(model_vars)
            logging.info("model_vars:")
            logging.info(" || ".join([str(x) for x in model_vars]))
            ema_vars = [ema.average(x) for x in model_vars]
            ema_vars_pair_dict = {
                ema.average_name(x): x.op.name for x in model_vars
            }
            logging.info("ema_vars_pair_dict:")
            for x, y in ema_vars_pair_dict.items():
              logging.info(x + ': ' + y)
            for v in ema_vars:
              tf.summary.histogram(v.op.name, v)
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)
            tf.add_to_collection("ema_op", ema_op)

          predictions = result["predictions"]
          tower_predictions.append(predictions)

          if "loss" in result.keys():
            label_loss = result["loss"]
          else:
            label_loss = label_loss_fn.calculate_loss(
                predictions, tower_labels[i], FLAGS.loss_epsilon)

          if "regularization_loss" in result.keys():
            reg_loss = result["regularization_loss"]
          else:
            reg_loss = tf.constant(0.0)
          reg_losses = tf.losses.get_regularization_losses()
          if reg_losses:
            reg_loss += tf.add_n(reg_losses)
          tower_reg_losses.append(reg_loss)

          # Adds update_ops (e.g., moving average updates in batch
          # normalization) as a dependency to the train_op.
          update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
          if "update_ops" in result.keys():
            update_ops += result["update_ops"]
          if update_ops:
            with tf.control_dependencies(update_ops):
              barrier = tf.no_op(name="gradient_barrier")
              with tf.control_dependencies([barrier]):
                label_loss = tf.identity(label_loss)
          tower_label_losses.append(label_loss)

          # Incorporate the L2 weight penalties etc.
          final_loss = regularization_penalty * reg_loss + label_loss
          gradients = optimizer.compute_gradients(
              final_loss, colocate_gradients_with_ops=False)
          tower_gradients.append(gradients)

          # Eval ops for the same tower, with all keep probs fixed at 1.0.
          logging.info("eval ops")
          e_result = model.create_model(
              e_tower_inputs[i],
              num_frames=e_tower_num_frames[i],
              vocab_size=reader.num_classes,
              labels=e_tower_labels[i],
              layers_keep_probs=e_tower_layers_keep_probs,
              l2_penalty=l2_penalty,
              is_training=False)
          e_predictions = e_result["predictions"]
          e_tower_predictions.append(e_predictions)
          # End eval ops.

  label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
  tf.summary.scalar("label_loss", label_loss)
  if regularization_penalty != 0:
    reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
    tf.summary.scalar("reg_loss", reg_loss)
  merged_gradients = utils.combine_gradients(tower_gradients)
  if clip_gradient_norm > 0:
    with tf.name_scope('clip_grads'):
      merged_gradients = utils.clip_gradient_norms(merged_gradients,
                                                   clip_gradient_norm)
  train_op = optimizer.apply_gradients(merged_gradients,
                                       global_step=global_step)

  tf.add_to_collection("global_step", global_step)
  tf.add_to_collection("restart_learning_rate", restart_learning_rate)
  tf.add_to_collection("layers_keep_probs", layers_keep_probs)
  tf.add_to_collection("loss", label_loss)
  tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
  tf.add_to_collection("input_batch_raw", model_input_raw)
  tf.add_to_collection("input_batch", model_input)
  tf.add_to_collection("num_frames", num_frames)
  tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
  tf.add_to_collection("train_op", train_op)

  # Add the eval graph.
  e_label_loss = label_loss_fn.calculate_loss(
      tf.concat(e_tower_predictions, 0), e_labels_batch, FLAGS.loss_epsilon)
  tf.summary.scalar("e_label_loss", e_label_loss)

  tf.add_to_collection("e_predictions", tf.concat(e_tower_predictions, 0))
  tf.add_to_collection("e_labels", tf.cast(e_labels_batch, tf.float32))
  tf.add_to_collection("e_loss", e_label_loss)
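# Sketch of how the EMA shadow variables created above can be swapped in at
# inference time, assuming the same FLAGS.ema_halflife and variable names.
# tf.train.ExponentialMovingAverage.variables_to_restore() builds the
# {shadow_checkpoint_name: variable} map that tf.train.Saver accepts, so
# restoring through it loads the averaged weights into the live variables.
#
#   ema = tf.train.ExponentialMovingAverage(
#       decay=1.0 - 1.0 / FLAGS.ema_halflife)
#   ema_saver = tf.train.Saver(ema.variables_to_restore())
#   ema_saver.restore(sess, checkpoint_path)  # weights become EMA values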
def model_fn(features, labels, mode, params):
  is_training = mode == learn.ModeKeys.TRAIN
  optimizer_class = find_class_by_name(params.optimizer, [tf.train])
  label_loss_fn = find_class_by_name(params.label_loss, [losses])()
  model = find_class_by_name(params.model,
                             [frame_level_models, video_level_models])()

  global_step = tf.train.get_or_create_global_step()
  learning_rate = tf.train.exponential_decay(
      params.base_learning_rate,
      global_step * params.batch_size * params.num_towers,
      params.learning_rate_decay_examples,
      params.learning_rate_decay,
      staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)
  optimizer = optimizer_class(learning_rate)

  tf.summary.histogram("model/input_raw", features['model_input'])
  feature_dim = len(features['model_input'].get_shape()) - 1
  model_input = tf.nn.l2_normalize(features['model_input'], feature_dim)
  tower_inputs = tf.split(model_input, params.num_towers)

  if mode == learn.ModeKeys.INFER:
    # This is a quick hack so that the existing model_fn code, taken from
    # train.py, doesn't break in inference (or serving) mode. Normally we
    # would write model_fn such that the 'labels' input arg can be None in
    # inference mode, but this existing model code was not written that way.
    # See the serving_input_fn() defined below for where 'labels_batch' is
    # added to the features dict, just to make this code work properly.
    labels = features['labels_batch']

  tower_labels = tf.split(labels, params.num_towers)
  tower_num_frames = tf.split(features['num_frames'], params.num_towers)
  tower_gradients = []
  tower_predictions = []
  tower_label_losses = []
  tower_reg_losses = []
  for i in range(params.num_towers):
    # For some reason these 'with' statements can't be combined onto the same
    # line. They have to be nested.
    with tf.device(params.device_string % i):
      with tf.variable_scope("tower", reuse=True if i > 0 else None):
        with slim.arg_scope(
            [slim.model_variable, slim.variable],
            device="/cpu:0" if params.num_gpus != 1 else "/gpu:0"):
          result = model.create_model(tower_inputs[i],
                                      num_frames=tower_num_frames[i],
                                      vocab_size=params.reader.num_classes,
                                      labels=tower_labels[i],
                                      is_training=is_training)
          for variable in slim.get_model_variables():
            tf.summary.histogram(variable.op.name, variable)

          predictions = result["predictions"]
          tower_predictions.append(predictions)

          if "loss" in result.keys():
            label_loss = result["loss"]
          else:
            label_loss = label_loss_fn.calculate_loss(predictions,
                                                      tower_labels[i])

          if "regularization_loss" in result.keys():
            reg_loss = result["regularization_loss"]
          else:
            reg_loss = tf.constant(0.0)
          reg_losses = tf.losses.get_regularization_losses()
          if reg_losses:
            reg_loss += tf.add_n(reg_losses)
          tower_reg_losses.append(reg_loss)

          # Adds update_ops (e.g., moving average updates in batch
          # normalization) as a dependency to the train_op.
          update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
          if "update_ops" in result.keys():
            update_ops += result["update_ops"]
          if update_ops:
            with tf.control_dependencies(update_ops):
              barrier = tf.no_op(name="gradient_barrier")
              with tf.control_dependencies([barrier]):
                label_loss = tf.identity(label_loss)
          tower_label_losses.append(label_loss)

          final_loss = params.regularization_penalty * reg_loss + label_loss
          gradients = optimizer.compute_gradients(
              final_loss, colocate_gradients_with_ops=False)
          tower_gradients.append(gradients)

  pred_dict = {}
  label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
  predictions = tf.concat(tower_predictions, 0)
  pred_dict['predictions'] = predictions
  tf.summary.scalar("label_loss", label_loss)
  if params.regularization_penalty != 0:
    reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
    tf.summary.scalar("reg_loss", reg_loss)

  if is_training:
    # Incorporate the L2 weight penalties, etc.
    merged_gradients = utils.combine_gradients(tower_gradients)
    if params.clip_gradient_norm > 0:
      with tf.name_scope('clip_grads'):
        merged_gradients = utils.clip_gradient_norms(
            merged_gradients, params.clip_gradient_norm)
    train_op = optimizer.apply_gradients(merged_gradients,
                                         global_step=global_step)
  else:
    train_op = None

  eval_metric_ops = {}
  if mode == learn.ModeKeys.EVAL or is_training:
    eval_metric_ops['hit_at_one'] = metrics.streaming_mean(
        tf.py_func(
            lambda x, y: np.float32(eval_util.calculate_hit_at_one(x, y)),
            [predictions, labels], tf.float32, stateful=False))
    eval_metric_ops['perr'] = metrics.streaming_mean(
        tf.py_func(
            lambda x, y: np.float32(
                eval_util.calculate_precision_at_equal_recall_rate(x, y)),
            [predictions, labels], tf.float32, stateful=False))
    eval_metric_ops['gap'] = metrics.streaming_mean(
        tf.py_func(
            lambda x, y: np.float32(eval_util.calculate_gap(x, y)),
            [predictions, labels], tf.float32, stateful=False))

  top_predictions, top_indices = tf.nn.top_k(predictions,
                                             _TOP_PREDICTIONS_IN_OUTPUT)
  pred_dict['top_predictions'] = top_predictions
  pred_dict['top_indices'] = top_indices

  # Add eval summaries and update ops for training: one scalar summary per
  # eval metric, plus the metric's update op in the UPDATE_OPS collection so
  # it runs with every train_op call.
  for key, val in eval_metric_ops.items():
    tf.summary.scalar(key, val[0])
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, val[1])

  tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
  tf.add_to_collection("labels", tf.cast(labels, tf.float32))
  tf.summary.scalar("loss", label_loss)

  export_outputs = {
      tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
          tf.estimator.export.PredictOutput(pred_dict)
  }
  return tf.estimator.EstimatorSpec(mode=mode,
                                    predictions=pred_dict,
                                    loss=label_loss,
                                    train_op=train_op,
                                    export_outputs=export_outputs,
                                    eval_metric_ops=eval_metric_ops)
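# Usage sketch for the model_fn above. The params argument just needs the
# attributes model_fn reads; the field values and the argparse.Namespace
# carrier below are illustrative assumptions, not from the original file
# (tf.estimator passes params through to model_fn unmodified).
#
#   import argparse
#   params = argparse.Namespace(
#       model="LogisticModel", label_loss="CrossEntropyLoss",
#       optimizer="AdamOptimizer", base_learning_rate=0.01,
#       learning_rate_decay=0.95, learning_rate_decay_examples=1000000,
#       batch_size=1024, num_towers=1, num_gpus=0, device_string="/cpu:%d",
#       regularization_penalty=1.0, clip_gradient_norm=1.0, reader=reader)
#   estimator = tf.estimator.Estimator(model_fn=model_fn, params=params,
#                                      model_dir=FLAGS.train_dir)
#   estimator.train(input_fn=train_input_fn)  # input_fn must yield the
#                                             # features dict used above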
def build_graph(reader, generator_model, discriminator_model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(), batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0, regularization_penalty=1,
                num_readers=1, num_epochs=None):
  """Creates the Tensorflow graph.

  This will only be called once in the life of a training model, because
  after the graph is created the model will be restored from a meta graph
  file rather than being recreated.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    generator_model: The core model for generator. It should inherit from
      BaseModel.
    discriminator_model: The core model for discriminator. It should
      inherit from BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should
      inherit from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
      compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
      unlimited number of passes.
  """
  global_step = tf.Variable(0, trainable=False, name="global_step")

  gpus = get_gpus()
  num_gpus = len(gpus)
  if num_gpus > 0:
    logging.info("Using the following GPUs to train: " + str(gpus))
    num_towers = num_gpus
    device_string = '/gpu:%d'
  else:
    logging.info("No GPUs found. Training on CPU.")
    num_towers = 1
    device_string = '/cpu:%d'

  learning_rate = tf.train.exponential_decay(
      base_learning_rate,
      global_step * batch_size * num_towers,
      learning_rate_decay_examples,
      learning_rate_decay,
      staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)

  optimizer = optimizer_class(learning_rate)
  model_input_raw, _ = (get_input_data_tensors(reader, train_data_pattern,
                                               batch_size=batch_size * num_towers,
                                               num_readers=num_readers,
                                               num_epochs=num_epochs))
  tf.summary.histogram("model/input_raw", model_input_raw)

  model_input = model_input_raw
  noise_input = tf.placeholder(
      tf.float32, shape=[None, random_noise_generator.get_dim()])
  image_width, image_height = reader.get_image_size()

  tower_inputs = tf.split(model_input, num_towers)
  tower_noise_input = tf.split(noise_input, num_towers)
  tower_D_gradients = []
  tower_G_gradients = []
  tower_generated_images = []
  tower_predictions_for_fake = []
  tower_predictions_for_real = []
  tower_D_losses = []
  tower_G_losses = []
  for i in range(num_towers):
    # For some reason these 'with' statements can't be combined onto the same
    # line. They have to be nested.
    with tf.device(device_string % i):
      with tf.variable_scope("tower", reuse=True if i > 0 else None):
        with slim.arg_scope([slim.model_variable, slim.variable],
                            device="/cpu:0" if num_gpus != 1 else "/gpu:0"):
          generator_model.create_model(image_width * image_height)
          discriminator_model.create_model(image_width * image_height)

          generated_result = generator_model.run_model(tower_noise_input[i])
          generated_images = generated_result["output"]
          generated_images_shaped = tf.reshape(
              generated_images, [-1, image_height, image_width, 1])
          tf.summary.image('generated_images', generated_images_shaped, 10)
          tower_generated_images.append(generated_images)

          result_from_fake = discriminator_model.run_model(generated_images)
          result_from_real = discriminator_model.run_model(tower_inputs[i])

          for variable in slim.get_model_variables():
            tf.summary.histogram(variable.op.name, variable)

          predictions_for_fake = result_from_fake["predictions"]
          predictions_for_real = result_from_real["predictions"]
          tower_predictions_for_fake.append(predictions_for_fake)
          tower_predictions_for_real.append(predictions_for_real)

          logits_for_fake = result_from_fake["logits"]
          logits_for_real = result_from_real["logits"]
          D_loss_fake = label_loss_fn.calculate_loss(
              logits_for_fake, tf.zeros_like(logits_for_fake))
          D_loss_real = label_loss_fn.calculate_loss(
              logits_for_real, tf.ones_like(logits_for_real))
          D_loss = D_loss_fake + D_loss_real
          tower_D_losses.append(D_loss)

          G_loss = label_loss_fn.calculate_loss(
              logits_for_fake, tf.ones_like(logits_for_fake))
          tower_G_losses.append(G_loss)

          D_var = discriminator_model.get_variables()
          D_gradients = optimizer.compute_gradients(D_loss, var_list=D_var)
          tower_D_gradients.append(D_gradients)

          G_var = generator_model.get_variables()
          G_gradients = optimizer.compute_gradients(G_loss, var_list=G_var)
          tower_G_gradients.append(G_gradients)

  D_loss = tf.reduce_mean(tf.stack(tower_D_losses))
  G_loss = tf.reduce_mean(tf.stack(tower_G_losses))
  tf.summary.scalar("D_loss", D_loss)
  tf.summary.scalar("G_loss", G_loss)
  merged_D_gradients = utils.combine_gradients(tower_D_gradients)
  merged_G_gradients = utils.combine_gradients(tower_G_gradients)
  if clip_gradient_norm > 0:
    with tf.name_scope('clip_grads'):
      merged_D_gradients = utils.clip_gradient_norms(merged_D_gradients,
                                                     clip_gradient_norm)
      merged_G_gradients = utils.clip_gradient_norms(merged_G_gradients,
                                                     clip_gradient_norm)

  # Attach global_step only once so that it will be increased by 1.
  D_train_op = optimizer.apply_gradients(merged_D_gradients)
  G_train_op = optimizer.apply_gradients(merged_G_gradients,
                                         global_step=global_step)

  tf.add_to_collection("global_step", global_step)
  tf.add_to_collection("D_loss", D_loss)
  tf.add_to_collection("G_loss", G_loss)
  tf.add_to_collection("p_for_fake", tf.concat(tower_predictions_for_fake, 0))
  tf.add_to_collection("p_for_data", tf.concat(tower_predictions_for_real, 0))
  tf.add_to_collection("input_batch_raw", model_input_raw)
  tf.add_to_collection("input_batch", model_input)
  tf.add_to_collection("generated_images",
                       tf.concat(tower_generated_images, 0))
  tf.add_to_collection("D_train_op", D_train_op)
  tf.add_to_collection("G_train_op", G_train_op)
  tf.add_to_collection("noise_input_placeholder", noise_input)
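# Sketch of how the two train ops above might be driven, assuming the
# collection names from build_graph, an active tf.Session `sess`, and a
# hypothetical random_noise_generator.generate() sampler (the real noise
# sampling API in this repo may differ); discriminator and generator are
# updated alternately, and only G_train_op bumps global_step.
#
#   noise_ph = tf.get_collection("noise_input_placeholder")[0]
#   d_step = tf.get_collection("D_train_op")[0]
#   g_step = tf.get_collection("G_train_op")[0]
#   noise = random_noise_generator.generate(batch_size)  # hypothetical
#   sess.run(d_step, feed_dict={noise_ph: noise})  # discriminator update
#   sess.run(g_step, feed_dict={noise_ph: noise})  # generator update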
def build_graph(reader, model, train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(), batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0, regularization_penalty=1,
                num_readers=1, num_epochs=None):
  global_step = tf.Variable(0, trainable=False, name="global_step")

  local_device_protos = device_lib.list_local_devices()
  gpus = [x.name for x in local_device_protos if x.device_type == 'GPU']
  num_gpus = len(gpus)

  if num_gpus > 0:
    logging.info("Using the following GPUs to train: " + str(gpus))
    num_towers = num_gpus
    device_string = '/gpu:%d'
  else:
    logging.info("No GPUs found. Training on CPU.")
    num_towers = 1
    device_string = '/cpu:%d'

  learning_rate = tf.train.exponential_decay(
      base_learning_rate,
      global_step * batch_size * num_towers,
      learning_rate_decay_examples,
      learning_rate_decay,
      staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)

  optimizer = optimizer_class(learning_rate)
  unused_video_id, model_input_raw, labels_batch, num_frames = (
      get_input_data_tensors(reader, train_data_pattern,
                             batch_size=batch_size * num_towers,
                             num_readers=num_readers,
                             num_epochs=num_epochs))
  tf.summary.histogram("model/input_raw", model_input_raw)

  feature_dim = len(model_input_raw.get_shape()) - 1
  model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

  tower_inputs = tf.split(model_input, num_towers)
  tower_labels = tf.split(labels_batch, num_towers)
  tower_num_frames = tf.split(num_frames, num_towers)
  tower_gradients = []
  tower_predictions = []
  tower_label_losses = []
  tower_reg_losses = []
  for i in range(num_towers):
    # For some reason these 'with' statements can't be combined onto the same
    # line. They have to be nested.
    with tf.device(device_string % i):
      with tf.variable_scope("tower", reuse=True if i > 0 else None):
        with slim.arg_scope([slim.model_variable, slim.variable],
                            device="/cpu:0" if num_gpus != 1 else "/gpu:0"):
          result = model.create_model(tower_inputs[i],
                                      num_frames=tower_num_frames[i],
                                      vocab_size=reader.num_classes,
                                      labels=tower_labels[i])
          for variable in slim.get_model_variables():
            tf.summary.histogram(variable.op.name, variable)

          predictions = result["predictions"]
          tower_predictions.append(predictions)

          if "loss" in result.keys():
            label_loss = result["loss"]
          else:
            label_loss = label_loss_fn.calculate_loss(predictions,
                                                      tower_labels[i])

          if "regularization_loss" in result.keys():
            reg_loss = result["regularization_loss"]
          else:
            reg_loss = tf.constant(0.0)
          reg_losses = tf.losses.get_regularization_losses()
          if reg_losses:
            reg_loss += tf.add_n(reg_losses)
          tower_reg_losses.append(reg_loss)

          # Adds update_ops (e.g., moving average updates in batch
          # normalization) as a dependency to the train_op.
          update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
          if "update_ops" in result.keys():
            update_ops += result["update_ops"]
          if update_ops:
            with tf.control_dependencies(update_ops):
              barrier = tf.no_op(name="gradient_barrier")
              with tf.control_dependencies([barrier]):
                label_loss = tf.identity(label_loss)
          tower_label_losses.append(label_loss)

          # Incorporate the L2 weight penalties etc.
          final_loss = regularization_penalty * reg_loss + label_loss
          gradients = optimizer.compute_gradients(
              final_loss, colocate_gradients_with_ops=False)
          tower_gradients.append(gradients)

  label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
  tf.summary.scalar("label_loss", label_loss)
  if regularization_penalty != 0:
    reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
    tf.summary.scalar("reg_loss", reg_loss)
  merged_gradients = utils.combine_gradients(tower_gradients)
  if clip_gradient_norm > 0:
    with tf.name_scope('clip_grads'):
      merged_gradients = utils.clip_gradient_norms(merged_gradients,
                                                   clip_gradient_norm)
  train_op = optimizer.apply_gradients(merged_gradients,
                                       global_step=global_step)

  tf.add_to_collection("global_step", global_step)
  tf.add_to_collection("loss", label_loss)
  tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
  tf.add_to_collection("input_batch_raw", model_input_raw)
  tf.add_to_collection("input_batch", model_input)
  tf.add_to_collection("num_frames", num_frames)
  tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
  tf.add_to_collection("train_op", train_op)
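# For reference, a minimal sketch of what utils.combine_gradients is assumed
# to do: average each variable's gradient across the per-tower gradient
# lists. The real helper in utils.py may differ in edge-case handling
# (e.g. sparse gradients or towers that produced no gradient).
def combine_gradients_sketch(tower_grads):
  combined = []
  for grads_and_var in zip(*tower_grads):  # one tuple per variable
    grads = [g for g, _ in grads_and_var if g is not None]
    var = grads_and_var[0][1]
    combined.append((tf.reduce_mean(tf.stack(grads), axis=0), var))
  return combined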
def run(self, start_new_model=False):
  """Performs training on the currently defined Tensorflow graph.

  Returns:
    A tuple of the training Hit@1 and the training PERR.
  """
  if self.is_master and start_new_model:
    self.remove_training_directory(self.train_dir)

  target, device_fn = self.start_server_if_distributed()

  meta_filename = []
  for filename in self.train_dir.split(','):
    logging.info("filename: %s", str(filename))
    meta_filename.append(self.get_meta_filename(start_new_model, filename))

  label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])()
  optimizer_class = find_class_by_name(FLAGS.optimizer, [tf.train])

  local_device_protos = device_lib.list_local_devices()
  gpus = [x.name for x in local_device_protos if x.device_type == 'GPU']
  num_gpus = len(gpus)
  if num_gpus > 0:
    logging.info("Using the following GPUs to train: " + str(gpus))
    num_towers = num_gpus
    device_string = '/gpu:%d'
  else:
    logging.info("No GPUs found. Training on CPU.")
    num_towers = 1
    device_string = '/cpu:%d'

  global_step = tf.Variable(0, trainable=False, name="global_step")
  learning_rate = tf.train.exponential_decay(
      FLAGS.base_learning_rate,
      global_step * FLAGS.batch_size * num_towers,
      FLAGS.learning_rate_decay_examples,
      FLAGS.learning_rate_decay,
      staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)

  video_id_batch, model_input_raw, labels_batch, num_frames = get_input_data_tensors(
      self.reader,
      FLAGS.train_data_pattern,
      batch_size=FLAGS.batch_size,
      num_readers=FLAGS.num_readers)
  tf.summary.histogram("model_input_raw", model_input_raw)

  # Normalize input features.
  feature_dim = len(model_input_raw.get_shape()) - 1
  model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

  with tf.variable_scope("tower"):
    # NOTE: the two base models are assumed to return embedding tensors here,
    # since their outputs are l2-normalized and concatenated below. Both are
    # frozen with stop_gradient; only the ensemble head is trained.
    result1 = self.model[0].create_model(model_input,
                                         num_frames=num_frames,
                                         vocab_size=self.reader.num_classes,
                                         is_training=False)
    result1 = tf.stop_gradient(result1)
    result2 = self.model[1].create_model(model_input,
                                         num_frames=num_frames,
                                         vocab_size=self.reader.num_classes,
                                         labels=labels_batch,
                                         is_training=False)
    result2 = tf.stop_gradient(result2)

  # Split the global variable list into the two base models' variables so
  # that each checkpoint can be restored separately. The first variable whose
  # name contains 'rnn' marks the boundary between the two models.
  all_vars = tf.global_variables()
  for i, v in enumerate(all_vars):
    logging.info(str(v.name))
    if 'rnn' in v.name:
      vars1 = all_vars[:i]
      vars2 = all_vars[i:]
      break

  result1 = tf.nn.l2_normalize(result1, dim=1)
  result2 = tf.nn.l2_normalize(result2, dim=1)
  embeddings = tf.concat([result1, result2], axis=1)
  model_concat = find_class_by_name('MoeModel', [video_level_models])()
  result = model_concat.create_model(embeddings,
                                     vocab_size=self.reader.num_classes,
                                     num_mixtures=4)
  predictions = result["predictions"]
  tf.summary.histogram("model_activations", predictions)

  label_loss = label_loss_fn.calculate_loss(predictions, labels_batch)
  tf.summary.scalar("label_loss", label_loss)

  if "regularization_loss" in result.keys():
    reg_loss = result["regularization_loss"]
  else:
    reg_loss = tf.constant(0.0)
  reg_losses = tf.losses.get_regularization_losses()
  if reg_losses:
    reg_loss += tf.add_n(reg_losses)

  final_loss = FLAGS.regularization_penalty * reg_loss + label_loss

  optimizer = optimizer_class(learning_rate)
  gradients = optimizer.compute_gradients(final_loss,
                                          colocate_gradients_with_ops=False)
  with tf.name_scope('clip_grads'):
    merged_gradients = utils.clip_gradient_norms(gradients, 1.0)
  train_op = optimizer.apply_gradients(merged_gradients,
                                       global_step=global_step)

  tf.add_to_collection("global_step", global_step)
  tf.add_to_collection("loss", label_loss)
  tf.add_to_collection("predictions", predictions)
  tf.add_to_collection("input_batch", model_input)
  tf.add_to_collection("video_id_batch", video_id_batch)
  tf.add_to_collection("num_frames", num_frames)
  tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
  tf.add_to_collection("summary_op", tf.summary.merge_all())
  tf.add_to_collection("train_op", train_op)

  video_id_batch = tf.get_collection("video_id_batch")[0]
  prediction_batch = tf.get_collection("predictions")[0]
  label_batch = tf.get_collection("labels")[0]
  loss = tf.get_collection("loss")[0]
  summary_op = tf.get_collection("summary_op")[0]

  summary_writer = tf.summary.FileWriter(FLAGS.ensemble_dir,
                                         graph=tf.get_default_graph())

  config = tf.ConfigProto(allow_soft_placement=True,
                          log_device_placement=False)
  config.gpu_options.allow_growth = True

  with tf.Session(config=config) as sess:
    train_dirs = FLAGS.train_dir.split(',')
    latest_checkpoint0 = tf.train.latest_checkpoint(train_dirs[0])
    latest_checkpoint1 = tf.train.latest_checkpoint(train_dirs[1])
    sess.run(tf.global_variables_initializer())
    if latest_checkpoint0:
      logging.info("Loading checkpoint for eval: " + latest_checkpoint0)
      saver1 = tf.train.Saver(vars1)
      saver1.restore(sess, latest_checkpoint0)
    if latest_checkpoint1:
      logging.info("Loading checkpoint for eval: " + latest_checkpoint1)
      saver2 = tf.train.Saver(vars2)
      saver2.restore(sess, latest_checkpoint1)
    saver = tf.train.Saver()

    fetches = [
        learning_rate, global_step, train_op, video_id_batch,
        prediction_batch, label_batch, loss, summary_op
    ]
    coord = tf.train.Coordinator()
    threads = []
    for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
      threads.extend(
          qr.create_threads(sess, coord=coord, daemon=True, start=True))

    while not coord.should_stop():
      (learning_rate_val, global_step_val, _, vid_val, predictions_val,
       labels_val, loss_val, summary_val) = sess.run(fetches)

      if (self.is_master and global_step_val % self.disp_batches == 0
          and self.train_dir):
        eval_start_time = time.time()
        hit_at_one = eval_util.calculate_hit_at_one(predictions_val,
                                                    labels_val)
        perr = eval_util.calculate_precision_at_equal_recall_rate(
            predictions_val, labels_val)
        gap = eval_util.calculate_gap(predictions_val, labels_val)
        eval_end_time = time.time()
        eval_time = eval_end_time - eval_start_time

        logging.info("training step " + str(global_step_val) +
                     " | learning rate: " + ("%.4f" % learning_rate_val) +
                     " | Loss: " + ("%.2f" % loss_val) +
                     " | Hit@1: " + ("%.4f" % hit_at_one) +
                     " PERR: " + ("%.4f" % perr) +
                     " GAP: " + ("%.4f" % gap))

        summary_writer.add_summary(
            utils.MakeSummary("model/Training_Hit@1", hit_at_one),
            global_step_val)
        summary_writer.add_summary(
            utils.MakeSummary("model/Training_Perr", perr), global_step_val)
        summary_writer.add_summary(
            utils.MakeSummary("model/Training_GAP", gap), global_step_val)
        summary_writer.add_summary(
            utils.MakeSummary("model/loss", loss_val), global_step_val)
        summary_writer.add_summary(
            utils.MakeSummary("model/lr", learning_rate_val),
            global_step_val)
        summary_writer.flush()

      if global_step_val % FLAGS.export_model_steps == 0:
        saver.save(sess, FLAGS.ensemble_dir, global_step=global_step_val)

    coord.request_stop()
    coord.join(threads, stop_grace_period_secs=10)
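# `utils.MakeSummary` above wraps a plain Python float in a tf.Summary proto
# so it can be passed to summary_writer.add_summary() without running a
# summary op. A minimal sketch of such a helper (an assumption about the
# utils module, shown for clarity):
def make_summary_sketch(name, value):
  summary = tf.Summary()
  val = summary.value.add()
  val.tag = str(name)            # summary tag, e.g. "model/Training_GAP"
  val.simple_value = float(value)  # scalar value to log
  return summary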
def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
  """Creates the Tensorflow graph.

  This will only be called once in the life of a training model, because
  after the graph is created the model will be restored from a meta graph
  file rather than being recreated.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
      from BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should
      inherit from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
      compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
      unlimited number of passes.
  """
  global_step = tf.Variable(0, trainable=False, name="global_step")

  local_device_protos = device_lib.list_local_devices()
  gpus = [x.name for x in local_device_protos if x.device_type == "GPU"]
  gpus = gpus[:FLAGS.num_gpu]
  num_gpus = len(gpus)

  if num_gpus > 0:
    logging.info("Using the following GPUs to train: " + str(gpus))
    num_towers = num_gpus
    device_string = "/gpu:%d"
  else:
    logging.info("No GPUs found. Training on CPU.")
    num_towers = 1
    device_string = "/cpu:%d"

  learning_rate = tf.train.exponential_decay(base_learning_rate,
                                             global_step * batch_size,
                                             learning_rate_decay_examples,
                                             learning_rate_decay,
                                             staircase=True)
  tf.summary.scalar("learning_rate", learning_rate)

  optimizer = optimizer_class(learning_rate)

  input_data_dict = get_input_data_tensors(reader,
                                           train_data_pattern,
                                           batch_size=batch_size,
                                           num_readers=num_readers,
                                           num_epochs=num_epochs)
  model_input_raw = input_data_dict["video_matrix"]
  labels_batch = input_data_dict["labels"]
  num_frames = input_data_dict["num_frames"]
  logging.info("model_input shape: " + str(model_input_raw.shape))
  tf.summary.histogram("model/input_raw", model_input_raw)

  feature_dim = len(model_input_raw.get_shape()) - 1

  # Undo the PCA whitening of the first 1024 (video) feature dimensions; the
  # last 128 (audio) dimensions are passed through unchanged.
  offset = np.array([4. / 512] * 1024 + [0] * 128)
  offset = tf.constant(offset, dtype=tf.float32)
  eigen_val = tf.constant(np.sqrt(
      np.load("yt8m_pca/eigenvals.npy")[:1024, 0]), dtype=tf.float32)
  model_input = tf.multiply(
      model_input_raw - offset,
      tf.pad(eigen_val + 1e-4, [[0, 128]], constant_values=1.))
  # model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

  if FLAGS.segment_labels:
    label_weights = input_data_dict["label_weights"]
  else:
    label_weights = None

  tower_logits = []
  tower_predictions = []
  tower_label_losses = []
  tower_reg_losses = []
  for i in range(num_towers):
    # For some reason these 'with' statements can't be combined onto the same
    # line. They have to be nested.
    with tf.device(device_string % i):
      with tf.variable_scope("tower_%d" % i, reuse=False):
        result = model.create_model(model_input,
                                    num_frames=num_frames,
                                    vocab_size=reader.num_classes,
                                    labels=labels_batch,
                                    is_training=True)
        for variable in slim.get_model_variables():
          tf.summary.histogram(variable.op.name, variable)

        predictions = result["predictions"]
        tower_predictions.append(predictions)
        logits = result["logits"]
        tower_logits.append(logits)

        if "loss" in result.keys():
          label_loss = result["loss"]
        else:
          label_loss = label_loss_fn.calculate_loss(
              predictions, labels_batch, label_weights=label_weights)

        if "aux_predictions" in result.keys():
          for pred in result["aux_predictions"]:
            label_loss += label_loss_fn.calculate_loss(
                pred, labels_batch, label_weights=label_weights)

        if "regularization_loss" in result.keys():
          reg_loss = result["regularization_loss"]
        else:
          reg_loss = tf.constant(0.0)
        reg_losses = tf.losses.get_regularization_losses()
        if reg_losses:
          reg_loss += tf.add_n(reg_losses)
        tower_reg_losses.append(reg_loss)

        # Adds update_ops (e.g., moving average updates in batch
        # normalization) as a dependency to the train_op.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if "update_ops" in result.keys():
          update_ops += result["update_ops"]
        if update_ops:
          with tf.control_dependencies(update_ops):
            barrier = tf.no_op(name="gradient_barrier")
            with tf.control_dependencies([barrier]):
              label_loss = tf.identity(label_loss)
        tower_label_losses.append(label_loss)

  with tf.device("/gpu:%d" % 0):
    with tf.variable_scope("ensemble"):
      # Attention-style mixture: per-example softmax weights over the towers,
      # computed from the mean input feature.
      ftr_mean = tf.reduce_mean(model_input, axis=1)
      ftr_mean = slim.batch_norm(ftr_mean,
                                 center=True,
                                 scale=True,
                                 fused=False,
                                 is_training=True,
                                 scope="mix_weights_bn")
      mix_weights = slim.fully_connected(
          ftr_mean,
          num_towers,
          activation_fn=None,
          weights_initializer=slim.variance_scaling_initializer(),
          scope="mix_weights")
      mix_weights = tf.nn.softmax(mix_weights, axis=-1)
      tf.summary.histogram("mix_weights", mix_weights)

      logits = tf.stack(tower_logits, axis=1)
      final_logit = tf.reduce_sum(
          tf.multiply(logits, tf.expand_dims(mix_weights, axis=-1)),
          axis=1,
          keepdims=False)
      final_predictions = tf.nn.sigmoid(final_logit)

      # Distillation-style regularizer: KL divergence between the tempered
      # ensemble distribution and each tower's tempered distribution.
      rank_pred = tf.expand_dims(
          tf.nn.softmax(tf.div(final_logit, FLAGS.final_temperature),
                        axis=-1),
          axis=1)
      aux_rank_preds = tf.nn.softmax(
          tf.div(logits, FLAGS.final_temperature), axis=-1)
      epsilon = 1e-8
      kl_loss = tf.reduce_sum(
          rank_pred * (tf.log(rank_pred + epsilon) -
                       tf.log(aux_rank_preds + epsilon)),
          axis=-1)
      regularization_loss = FLAGS.final_lambda * tf.reduce_mean(
          tf.reduce_sum(kl_loss, axis=-1), axis=-1)

      final_label_loss = label_loss_fn.calculate_loss(
          final_predictions, labels_batch, label_weights=label_weights)

      label_loss = tf.reduce_sum(
          tf.stack(tower_label_losses)) + final_label_loss
      tf.summary.scalar("label_loss", label_loss)
      reg_loss = tf.reduce_sum(
          tf.stack(tower_reg_losses)) + regularization_loss
      tf.summary.scalar("reg_loss", reg_loss)

      final_loss = label_loss + regularization_penalty * reg_loss
      gradients = optimizer.compute_gradients(
          final_loss, colocate_gradients_with_ops=True)
      if clip_gradient_norm > 0:
        gradients = utils.clip_gradient_norms(gradients, clip_gradient_norm)
      final_train_op = optimizer.apply_gradients(gradients,
                                                 global_step=global_step)

  tf.add_to_collection("global_step", global_step)
  tf.add_to_collection("loss", label_loss)
  tf.add_to_collection("predictions", final_predictions)
  tf.add_to_collection("input_batch_raw", model_input_raw)
  tf.add_to_collection("input_batch", model_input)
  tf.add_to_collection("num_frames", num_frames)
  tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
  tf.add_to_collection("train_op", final_train_op)
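# A minimal numpy sketch (illustrative only, under the shapes used above) of
# the ensemble math: the final logit is a per-example convex combination of
# the tower logits, and the KL term compares the tempered ensemble
# distribution with each tower's tempered distribution.
import numpy as np  # already imported in this file


def _softmax(x, axis=-1):
  e = np.exp(x - x.max(axis=axis, keepdims=True))
  return e / e.sum(axis=axis, keepdims=True)


def ensemble_kl_sketch(tower_logits, mix_weights, temperature, epsilon=1e-8):
  """tower_logits: [batch, num_towers, num_classes];
  mix_weights: [batch, num_towers], rows summing to 1."""
  # Weighted sum of tower logits -> [batch, num_classes].
  final_logit = (tower_logits * mix_weights[:, :, None]).sum(axis=1)
  rank_pred = _softmax(final_logit / temperature)[:, None, :]  # [b, 1, c]
  aux_preds = _softmax(tower_logits / temperature)             # [b, t, c]
  # KL(ensemble || tower) per example and tower -> [b, t].
  kl = (rank_pred * (np.log(rank_pred + epsilon) -
                     np.log(aux_preds + epsilon))).sum(axis=-1)
  # Sum over towers, mean over the batch (FLAGS.final_lambda scales this).
  return final_logit, kl.sum(axis=-1).mean()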
def main():
  # data_pattern, reader_batch_size, num_classes, num_readers,
  # mini_batch_size, label_loss_fn, optimizer_class, num_towers and the
  # learning-rate/regularization settings are module-level configuration
  # defined elsewhere in this file.
  env = json.loads(os.environ.get("TF_CONFIG", "{}"))
  task_data = env.get("task", None) or {"type": "master", "index": 0}
  task = type("TaskSpec", (object,), task_data)

  logging.set_verbosity(tf.logging.INFO)
  logging.info("%s: Tensorflow version: %s.", task_as_string(task),
               tf.__version__)

  video_ids, video_features, video_labels, video_frames = gen_input(
      data_pattern,
      reader_batch_size=reader_batch_size,
      num_classes=num_classes,
      num_readers=num_readers,
      mini_batch_size=mini_batch_size)

  result = gen_model(model_input=video_features,
                     vocab_size=num_classes,
                     labels=video_labels,
                     num_frames=video_frames)
  predictions = result["predictions"]

  global_step = tf.Variable(0, trainable=False, name="global_step")

  label_loss = label_loss_fn.calculate_loss(predictions, video_labels)

  if "regularization_loss" in result.keys():
    reg_loss = result["regularization_loss"]
  else:
    reg_loss = tf.constant(0.0)
  reg_losses = tf.losses.get_regularization_losses()
  if reg_losses:
    reg_loss += tf.add_n(reg_losses)

  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  if "update_ops" in result.keys():
    update_ops += result["update_ops"]
  if update_ops:
    with tf.control_dependencies(update_ops):
      barrier = tf.no_op(name="gradient_barrier")
      with tf.control_dependencies([barrier]):
        label_loss = tf.identity(label_loss)

  final_loss = regularization_penalty * reg_loss + label_loss

  learning_rate = tf.train.exponential_decay(
      base_learning_rate,
      global_step * mini_batch_size * num_towers,
      learning_rate_decay_examples,
      learning_rate_decay,
      staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)

  optimizer = optimizer_class(learning_rate)
  gradients = optimizer.compute_gradients(final_loss,
                                          colocate_gradients_with_ops=False)
  tf.summary.scalar("label_loss", label_loss)
  tf.summary.scalar("reg_loss", reg_loss)
  if clip_gradient_norm > 0:
    with tf.name_scope('clip_grads'):
      gradients = utils.clip_gradient_norms(gradients, clip_gradient_norm)
  train_op = optimizer.apply_gradients(gradients, global_step=global_step)

  # Hoisted out of the training loop so that a new cast op is not added to
  # the graph on every iteration.
  float_labels = tf.cast(video_labels, tf.float32)

  with tf.Session() as sess:
    tf.global_variables_initializer().run()
    tf.local_variables_initializer().run()

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    total_step = 0
    try:
      while total_step < 100000:
        batch_start_time = time.time()
        _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
            [train_op, global_step, label_loss, predictions, float_labels])
        seconds_per_batch = time.time() - batch_start_time
        examples_per_second = labels_val.shape[0] / seconds_per_batch

        if total_step % 10 == 0:
          eval_start_time = time.time()
          hit_at_one = eval_util.calculate_hit_at_one(predictions_val,
                                                      labels_val)
          perr = eval_util.calculate_precision_at_equal_recall_rate(
              predictions_val, labels_val)
          gap = eval_util.calculate_gap(predictions_val, labels_val)
          eval_end_time = time.time()
          eval_time = eval_end_time - eval_start_time

          logging.info("training step " + str(global_step_val) +
                       " | Loss: " + ("%.2f" % loss_val) +
                       " Examples/sec: " + ("%.2f" % examples_per_second) +
                       " | Hit@1: " + ("%.2f" % hit_at_one) +
                       " PERR: " + ("%.2f" % perr) +
                       " GAP: " + ("%.2f" % gap))
        else:
          logging.info("training step " + str(global_step_val) +
                       " | Loss: " + ("%.2f" % loss_val) +
                       " Examples/sec: " + ("%.2f" % examples_per_second))
        total_step = total_step + 1
    except tf.errors.OutOfRangeError:
      logging.info("%s: Done training -- epoch limit reached.",
                   task_as_string(task))

    coord.request_stop()
    coord.join(threads)
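# For reference: Hit@1, as reported by eval_util.calculate_hit_at_one above,
# is the fraction of examples whose single highest-scoring class is one of
# the true labels. A minimal numpy sketch of that metric (an illustrative
# re-implementation, not the project's eval_util module):
def hit_at_one_sketch(predictions, actuals):
  """predictions, actuals: [batch, num_classes]; actuals is multi-hot."""
  top_prediction = np.argmax(predictions, axis=1)
  # Look up whether the top-scoring class is a true label for each example.
  hits = actuals[np.arange(actuals.shape[0]), top_prediction]
  return np.average(hits)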
def build_graph(reader,
                model,
                train_data_list,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=16,
                base_learning_rate=0.01,
                learning_rate_decay_examples=4000,
                learning_rate_decay=0.99,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                prediction_threshold=0.5,
                regularization_penalty=1,
                num_readers=2,
                num_epochs=None):
  """Creates the Tensorflow graph.

  This will only be called once in the life of a training model, because
  after the graph is created the model will be restored from a meta graph
  file rather than being recreated.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
      from BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should
      inherit from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
      compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
      unlimited number of passes.
  """
  global_step = tf.Variable(0, trainable=False, name="global_step")

  # When gradients are accumulated over several batches, the effective batch
  # size (and hence the learning-rate decay schedule) scales accordingly.
  if FLAGS.accumulate_gradients:
    actual_batch_size = batch_size * FLAGS.apply_every_n_batches
  else:
    actual_batch_size = batch_size

  learning_rate = tf.train.exponential_decay(base_learning_rate,
                                             global_step * actual_batch_size,
                                             learning_rate_decay_examples,
                                             learning_rate_decay,
                                             staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)

  optimizer = optimizer_class(learning_rate)

  image_id, image_data, image_mask = get_input_data_tensors(
      reader,
      train_data_list,
      train_data_pattern,
      batch_size=batch_size,
      num_readers=num_readers,
      num_epochs=num_epochs)
  model_input = image_data
  tf.summary.histogram("model/input", model_input)

  with tf.name_scope("model"):
    result = model.create_model(model_input, l2_penalty=FLAGS.l2_penalty)
    print("result", result)

    for variable in slim.get_model_variables():
      tf.summary.histogram(variable.op.name, variable)

    predictions = result["predictions"]

    if "loss" in result.keys():
      label_loss = result["loss"]
    else:
      if FLAGS.multitask:
        support_predictions = result["support_predictions"]
        tf.summary.histogram("model/support_predictions",
                             support_predictions)
        print("support_predictions", support_predictions)
        label_loss = label_loss_fn.calculate_loss(predictions,
                                                  support_predictions,
                                                  image_mask)
      else:
        label_loss = label_loss_fn.calculate_loss(predictions, image_mask)

    tf.summary.histogram("model/predictions", predictions)
    tf.summary.scalar("label_loss", label_loss)

    if "regularization_loss" in result.keys():
      reg_loss = result["regularization_loss"]
    else:
      reg_loss = tf.constant(0.0)
    reg_losses = tf.losses.get_regularization_losses()
    if reg_losses:
      reg_loss += tf.add_n(reg_losses)
    if regularization_penalty != 0:
      tf.summary.scalar("reg_loss", reg_loss)

    # Adds update_ops (e.g., moving average updates in batch normalization)
    # as a dependency to the train_op.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if "update_ops" in result.keys():
      update_ops += result["update_ops"]
    if update_ops:
      with tf.control_dependencies(update_ops):
        barrier = tf.no_op(name="gradient_barrier")
        with tf.control_dependencies([barrier]):
          label_loss = tf.identity(label_loss)

    # Incorporate the L2 weight penalties etc.
    final_loss = regularization_penalty * reg_loss + label_loss

    # Accumulate several batches before the gradient-descent step, to emulate
    # a larger batch than memory would be able to hold.
    if FLAGS.accumulate_gradients:
      assert FLAGS.apply_every_n_batches > 0, \
          "apply_every_n_batches should be > 0"
      scale = 1.0 / FLAGS.apply_every_n_batches
      tvs = tf.trainable_variables()
      accum_vars = [
          tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False)
          for tv in tvs
      ]
      init_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]
      gvs = optimizer.compute_gradients(final_loss, tvs)
      accum_ops = [
          accum_vars[i].assign_add(gv[0]) for i, gv in enumerate(gvs)
      ]
      if clip_gradient_norm > 0:
        with tf.name_scope('clip_grads'):
          clipped_accum_vars = utils.clip_variable_norms(
              accum_vars, max_norm=clip_gradient_norm, scale=scale)
        apply_op = optimizer.apply_gradients(
            [(clipped_accum_vars[i], gv[1]) for i, gv in enumerate(gvs)],
            global_step=global_step)
      else:
        apply_op = optimizer.apply_gradients(
            [(accum_vars[i] * scale, gv[1]) for i, gv in enumerate(gvs)],
            global_step=global_step)
      tf.get_collection_ref("train/init_ops").extend(init_ops)
      tf.get_collection_ref("train/accum_ops").extend(accum_ops)
      tf.add_to_collection("train/apply_op", apply_op)
    # The original way: apply the gradients every batch.
    else:
      gradients = optimizer.compute_gradients(
          final_loss, colocate_gradients_with_ops=False)
      if clip_gradient_norm > 0:
        with tf.name_scope('clip_grads'):
          gradients = utils.clip_gradient_norms(gradients,
                                                clip_gradient_norm)
      train_op = optimizer.apply_gradients(gradients,
                                           global_step=global_step)
      tf.add_to_collection("train/train_op", train_op)

    labels = tf.cast(image_mask, tf.int32)
    float_labels = tf.cast(image_mask, tf.float32)

    auc, _ = tf.metrics.auc(labels, predictions, num_thresholds=40)
    bool_predictions = tf.greater(predictions, prediction_threshold)
    true_pos = tf.cast(
        tf.reduce_sum(
            tf.cast(labels > 0, tf.int32) *
            tf.cast(predictions > prediction_threshold, tf.int32)),
        tf.float32)
    false_pos = tf.cast(
        tf.reduce_sum(
            tf.cast(labels <= 0, tf.int32) *
            tf.cast(predictions > prediction_threshold, tf.int32)),
        tf.float32)
    false_neg = tf.cast(
        tf.reduce_sum(
            tf.cast(labels > 0, tf.int32) *
            tf.cast(predictions <= prediction_threshold, tf.int32)),
        tf.float32)
    # Note: despite the name, 2*TP / (2*TP + FP + FN) is the Dice / F1
    # score, not the intersection-over-union.
    mean_iou = (2.0 * true_pos + 1e-7) / (2 * true_pos + false_pos +
                                          false_neg + 1e-7)
    print("mean_iou", mean_iou)

    num_examples = tf.shape(labels)[0]

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("id_batch", image_id)
    tf.add_to_collection("predictions", predictions)
    tf.add_to_collection("model_input", model_input)
    tf.add_to_collection("num_examples", num_examples)
    tf.add_to_collection("labels", labels)
    tf.add_to_collection("float_labels", float_labels)
    tf.add_to_collection("bool_predictions", bool_predictions)
    tf.add_to_collection("auc", auc)
    tf.add_to_collection("mean_iou", mean_iou)
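# How the accumulate-gradients collections above are meant to be consumed by
# the training loop (a hedged sketch; the actual loop lives in the trainer
# class): run the init ops once to zero the accumulators, run the accumulate
# ops on each of the apply_every_n_batches sub-batches, then run the apply op
# once to take the (scaled, optionally clipped) descent step.
def accumulate_and_apply_sketch(sess):
  init_ops = tf.get_collection("train/init_ops")
  accum_ops = tf.get_collection("train/accum_ops")
  apply_op = tf.get_collection("train/apply_op")[0]

  sess.run(init_ops)                            # zero the accumulators
  for _ in range(FLAGS.apply_every_n_batches):  # accumulate sub-batch grads
    sess.run(accum_ops)
  sess.run(apply_op)                            # one optimizer step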