def testFlopRegularizer(self):
  tf.reset_default_graph()
  tf.set_random_seed(7907)
  with slim.arg_scope(
      [slim.layers.conv2d, slim.layers.conv2d_transpose],
      weights_initializer=tf.random_normal_initializer):
    # Our test model is:
    #
    #         -> conv1 --+
    #        /           |--[concat]
    #  image --> conv2 --+
    #        \
    #         -> convt
    #
    # (the model has two "outputs", convt and concat).
    #
    image = tf.constant(0.0, shape=[1, 17, 19, NUM_CHANNELS])
    conv1 = slim.layers.conv2d(
        image, 13, [7, 5], padding='SAME', scope='conv1')
    conv2 = slim.layers.conv2d(
        image, 23, [1, 1], padding='SAME', scope='conv2')
    self.concat = tf.concat([conv1, conv2], 3)
    self.convt = slim.layers.conv2d_transpose(
        image, 29, [7, 5], stride=3, padding='SAME', scope='convt')
    self.name_to_var = {v.op.name: v for v in tf.global_variables()}
  with self.cached_session():
    tf.global_variables_initializer().run()

  threshold = 1.0
  flop_reg = flop_regularizer.GroupLassoFlopsRegularizer(
      [self.concat.op, self.convt.op], threshold=threshold, l1_fraction=0)

  with self.cached_session() as s:
    evaluated_vars = s.run(self.name_to_var)

  def group_norm(weights, axis=(0, 1, 2)):  # pylint: disable=invalid-name
    return np.sqrt(np.mean(weights**2, axis=axis))

  reg_vectors = {
      'conv1': group_norm(evaluated_vars['conv1/weights'], (0, 1, 2)),
      'conv2': group_norm(evaluated_vars['conv2/weights'], (0, 1, 2)),
      'convt': group_norm(evaluated_vars['convt/weights'], (0, 1, 3))
  }

  num_alive = {k: np.sum(r > threshold) for k, r in reg_vectors.items()}
  total_outputs = (
      reg_vectors['conv1'].shape[0] + reg_vectors['conv2'].shape[0])
  total_alive_outputs = sum(num_alive.values())
  assert total_alive_outputs > 0, (
      'All outputs are dead - test is trivial. Decrease the threshold.')
  assert total_alive_outputs < total_outputs, (
      'All outputs are alive - test is trivial. Increase the threshold.')

  coeff1 = _coeff(_get_op('conv1/Conv2D'))
  coeff2 = _coeff(_get_op('conv2/Conv2D'))
  coefft = _coeff(_get_op('convt/conv2d_transpose'))

  expected_flop_cost = NUM_CHANNELS * (
      coeff1 * num_alive['conv1'] + coeff2 * num_alive['conv2'] +
      coefft * num_alive['convt'])
  expected_reg_term = NUM_CHANNELS * (
      coeff1 * np.sum(reg_vectors['conv1']) +
      coeff2 * np.sum(reg_vectors['conv2']) +
      coefft * np.sum(reg_vectors['convt']))
  with self.cached_session():
    self.assertEqual(
        round(expected_flop_cost), round(flop_reg.get_cost().eval()))
    self.assertNearRelatively(
        expected_reg_term, flop_reg.get_regularization_term().eval())
data_stat += s
print(s)
train_y[train_y < rmin] = rmin
valid_y[valid_y < rmin] = rmin
for i in range(len(test_y)):
    test_y[i][test_y[i] < rmin] = rmin
data_stat += get_stat('train', train_x, train_y)
data_stat += get_stat('valid', valid_x, valid_y)
for i in range(len(test_x)):
    data_stat += get_stat('test', test_x[i], test_y[i])

model = construct_graph(args)
init = tf.global_variables_initializer()
best_saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

# set seeds
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.memory)
time_now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess, \
        open('log/' + str(args.save_name) + '_' + time_now + '_' + str(args.seed) + '.log', 'w') as fp:
    summary_writer = tf.summary.FileWriter(
        'summary/%s_%s' % (str(args.save_name), str(args.seed)),
        graph=tf.get_default_graph())
    fp.write(' '.join(sys.argv) + '\n')
    fp.write(git_log() + '\n')
    fp.write(data_stat + '\n')
    sess.run(init)
    if args.resume_training != "":
        best_saver.restore(sess, args.resume_training)
def build_example(label, param_dict_real, zip_path_label): """Build the model with parameter values set in param_dict_real. Args: label: Label of the model param_dict_real: Parameter dictionary (arguments to the factories make_graph and make_test_inputs) zip_path_label: Filename in the zip Returns: (tflite_model_binary, report) where tflite_model_binary is the serialized flatbuffer as a string and report is a dictionary with keys `toco_log` (log of toco conversion), `tf_log` (log of tf conversion), `toco` (a string of success status of the conversion), `tf` (a string success status of the conversion). """ np.random.seed(RANDOM_SEED) report = { "converter": report_lib.NOTRUN, "tf": report_lib.FAILED } # Build graph report["tf_log"] = "" report["converter_log"] = "" tf.reset_default_graph() with tf.Graph().as_default(): with tf.device("/cpu:0"): try: inputs, outputs = make_graph(param_dict_real) except (tf.errors.UnimplementedError, tf.errors.InvalidArgumentError, ValueError): report["tf_log"] += traceback.format_exc() return None, report sess = tf.Session() try: baseline_inputs, baseline_outputs = (make_test_inputs( param_dict_real, sess, inputs, outputs)) except (tf.errors.UnimplementedError, tf.errors.InvalidArgumentError, ValueError): report["tf_log"] += traceback.format_exc() return None, report report["converter"] = report_lib.FAILED report["tf"] = report_lib.SUCCESS # Convert graph to toco input_tensors = [(input_tensor.name.split(":")[0], input_tensor.shape, input_tensor.dtype) for input_tensor in inputs] output_tensors = [ _normalize_output_name(out.name) for out in outputs ] # pylint: disable=g-long-ternary graph_def = freeze_graph( sess, tf.global_variables() + inputs + outputs) if use_frozen_graph else sess.graph_def if "split_tflite_lstm_inputs" in param_dict_real: extra_toco_options.split_tflite_lstm_inputs = param_dict_real[ "split_tflite_lstm_inputs"] tflite_model_binary, toco_log = options.tflite_convert_function( options, graph_def, input_tensors, output_tensors, extra_toco_options=extra_toco_options, test_params=param_dict_real) report["converter"] = (report_lib.SUCCESS if tflite_model_binary is not None else report_lib.FAILED) report["converter_log"] = toco_log if options.save_graphdefs: zipinfo = zipfile.ZipInfo(zip_path_label + ".pbtxt") archive.writestr(zipinfo, text_format.MessageToString(graph_def), zipfile.ZIP_DEFLATED) if tflite_model_binary: if options.make_edgetpu_tests: # Set proper min max values according to input dtype. baseline_inputs, baseline_outputs = generate_inputs_outputs( tflite_model_binary, min_value=0, max_value=255) zipinfo = zipfile.ZipInfo(zip_path_label + ".bin") archive.writestr(zipinfo, tflite_model_binary, zipfile.ZIP_DEFLATED) example = { "inputs": baseline_inputs, "outputs": baseline_outputs } example_fp = StringIO() write_examples(example_fp, [example]) zipinfo = zipfile.ZipInfo(zip_path_label + ".inputs") archive.writestr(zipinfo, example_fp.getvalue(), zipfile.ZIP_DEFLATED) example_fp2 = StringIO() write_test_cases(example_fp2, zip_path_label + ".bin", [example]) zipinfo = zipfile.ZipInfo(zip_path_label + "_tests.txt") archive.writestr(zipinfo, example_fp2.getvalue(), zipfile.ZIP_DEFLATED) zip_manifest_label = zip_path_label + " " + label if zip_path_label == label: zip_manifest_label = zip_path_label zip_manifest.append(zip_manifest_label + "\n") return tflite_model_binary, report
def __init__(self, predict: Union[Callable, tf.keras.Model, 'keras.Model'], mode: str, shape: tuple, kappa: float = 0., beta: float = .1, feature_range: tuple = (-1e10, 1e10), gamma: float = 0., ae_model: Union[tf.keras.Model, 'keras.Model'] = None, learning_rate_init: float = 1e-2, max_iterations: int = 1000, c_init: float = 10., c_steps: int = 10, eps: tuple = (1e-3, 1e-3), clip: tuple = (-100., 100.), update_num_grad: int = 1, no_info_val: Union[float, np.ndarray] = None, write_dir: str = None, sess: tf.Session = None) -> None: """ Initialize contrastive explanation method. Paper: https://arxiv.org/abs/1802.07623 Parameters ---------- predict Keras or TensorFlow model or any other model's prediction function returning class probabilities mode Find pertinant negatives ('PN') or pertinant positives ('PP') shape Shape of input data starting with batch size kappa Confidence parameter for the attack loss term beta Regularization constant for L1 loss term feature_range Tuple with min and max ranges to allow for perturbed instances. Min and max ranges can be floats or numpy arrays with dimension (1x nb of features) for feature-wise ranges gamma Regularization constant for optional auto-encoder loss term ae_model Optional auto-encoder model used for loss regularization learning_rate_init Initial learning rate of optimizer max_iterations Maximum number of iterations for finding a PN or PP c_init Initial value to scale the attack loss term c_steps Number of iterations to adjust the constant scaling the attack loss term eps If numerical gradients are used to compute dL/dx = (dL/dp) * (dp/dx), then eps[0] is used to calculate dL/dp and eps[1] is used for dp/dx. eps[0] and eps[1] can be a combination of float values and numpy arrays. For eps[0], the array dimension should be (1x nb of prediction categories) and for eps[1] it should be (1x nb of features) clip Tuple with min and max clip ranges for both the numerical gradients and the gradients obtained from the TensorFlow graph update_num_grad If numerical gradients are used, they will be updated every update_num_grad iterations no_info_val Global or feature-wise value considered as containing no information write_dir Directory to write tensorboard files to sess Optional Tensorflow session that will be used if passed instead of creating or inferring one internally """ super().__init__(meta=copy.deepcopy(DEFAULT_META_CEM)) # get params for storage in meta params = locals() remove = ['self', 'predict', 'ae_model', 'sess', '__class__'] for key in remove: params.pop(key) self.meta['params'].update(params) self.predict = predict # check whether the model and the auto-encoder are Keras or TF models and get session is_model, is_model_keras, model_sess = _check_keras_or_tf(predict) is_ae, is_ae_keras, ae_sess = _check_keras_or_tf(ae_model) # TODO: check ae and model are compatible self.meta['params'].update(is_model=is_model, is_model_keras=is_model_keras, is_ae=is_ae, is_ae_keras=is_ae_keras) # if session provided, use it if isinstance(sess, tf.Session): self.sess = sess else: self.sess = model_sess if is_model: # Keras or TF model self.model = True classes = self.sess.run(self.predict(tf.convert_to_tensor(np.zeros(shape), dtype=tf.float32))).shape[1] else: self.model = False classes = self.predict(np.zeros(shape)).shape[1] self.mode = mode self.shape = shape self.kappa = kappa self.beta = beta self.gamma = gamma self.ae = ae_model self.batch_size = shape[0] self.max_iterations = max_iterations self.c_init = c_init self.c_steps = c_steps 
self.update_num_grad = update_num_grad self.eps = eps self.clip = clip self.write_dir = write_dir if type(no_info_val) == float: self.no_info_val = np.ones(shape) * no_info_val else: self.no_info_val = no_info_val # values regarded as containing no information # PNs will deviate away from these values while PPs will gravitate towards them self.no_info = tf.Variable(np.zeros(shape), dtype=tf.float32, name='no_info') # define tf variables for original and perturbed instances, and target labels self.orig = tf.Variable(np.zeros(shape), dtype=tf.float32, name='orig') self.adv = tf.Variable(np.zeros(shape), dtype=tf.float32, name='adv') # delta(k) self.adv_s = tf.Variable(np.zeros(shape), dtype=tf.float32, name='adv_s') # y(k) self.target = tf.Variable(np.zeros((self.batch_size, classes)), dtype=tf.float32, name='target') # define tf variable for constant used in FISTA optimization self.const = tf.Variable(np.zeros(self.batch_size), dtype=tf.float32, name='const') self.global_step = tf.Variable(0.0, trainable=False, name='global_step') # define placeholders that will be assigned to relevant variables self.assign_orig = tf.placeholder(tf.float32, shape, name='assign_orig') self.assign_adv = tf.placeholder(tf.float32, shape, name='assign_adv') self.assign_adv_s = tf.placeholder(tf.float32, shape, name='assign_adv_s') self.assign_target = tf.placeholder(tf.float32, (self.batch_size, classes), name='assign_target') self.assign_const = tf.placeholder(tf.float32, [self.batch_size], name='assign_const') self.assign_no_info = tf.placeholder(tf.float32, shape, name='assign_no_info') # define conditions and values for element-wise shrinkage thresholding (eq.7) with tf.name_scope('shrinkage_thresholding') as scope: cond = [tf.cast(tf.greater(tf.subtract(self.adv_s, self.orig), self.beta), tf.float32), tf.cast(tf.less_equal(tf.abs(tf.subtract(self.adv_s, self.orig)), self.beta), tf.float32), tf.cast(tf.less(tf.subtract(self.adv_s, self.orig), tf.negative(self.beta)), tf.float32)] upper = tf.minimum(tf.subtract(self.adv_s, self.beta), tf.cast(feature_range[1], tf.float32)) lower = tf.maximum(tf.add(self.adv_s, self.beta), tf.cast(feature_range[0], tf.float32)) self.assign_adv = tf.multiply(cond[0], upper) + tf.multiply(cond[1], self.orig) + tf.multiply(cond[2], lower) # perturbation update for delta and vector projection on correct set depending on PP or PN (eq.5) # delta(k) = adv; delta(k+1) = assign_adv with tf.name_scope('perturbation_delta') as scope: proj_d = [tf.cast(tf.greater(tf.abs(tf.subtract(self.assign_adv, self.no_info)), tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32), tf.cast(tf.less_equal(tf.abs(tf.subtract(self.assign_adv, self.no_info)), tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32)] if self.mode == "PP": self.assign_adv = tf.multiply(proj_d[1], self.assign_adv) + tf.multiply(proj_d[0], self.orig) elif self.mode == "PN": self.assign_adv = tf.multiply(proj_d[0], self.assign_adv) + tf.multiply(proj_d[1], self.orig) # perturbation update and vector projection on correct set for y: y(k+1) = assign_adv_s (eq.6) with tf.name_scope('perturbation_y') as scope: self.zt = tf.divide(self.global_step, self.global_step + tf.cast(3, tf.float32)) # k/(k+3) in (eq.6) self.assign_adv_s = self.assign_adv + tf.multiply(self.zt, self.assign_adv - self.adv) proj_d_s = [tf.cast(tf.greater(tf.abs(tf.subtract(self.assign_adv_s, self.no_info)), tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32), tf.cast(tf.less_equal(tf.abs(tf.subtract(self.assign_adv_s, self.no_info)), 
tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32)] if self.mode == "PP": self.assign_adv_s = tf.multiply(proj_d_s[1], self.assign_adv_s) + tf.multiply(proj_d_s[0], self.orig) elif self.mode == "PN": self.assign_adv_s = tf.multiply(proj_d_s[0], self.assign_adv_s) + tf.multiply(proj_d_s[1], self.orig) # delta(k) <- delta(k+1); y(k) <- y(k+1) with tf.name_scope('update_adv') as scope: self.adv_updater = tf.assign(self.adv, self.assign_adv) self.adv_updater_s = tf.assign(self.adv_s, self.assign_adv_s) # from perturbed instance, derive deviation delta with tf.name_scope('update_delta') as scope: self.delta = self.orig - self.adv self.delta_s = self.orig - self.adv_s # define L1 and L2 loss terms; L1+L2 is later used as an optimization constraint for FISTA ax_sum = list(np.arange(1, len(shape))) with tf.name_scope('loss_l1_l2') as scope: self.l2 = tf.reduce_sum(tf.square(self.delta), axis=ax_sum) self.l2_s = tf.reduce_sum(tf.square(self.delta_s), axis=ax_sum) self.l1 = tf.reduce_sum(tf.abs(self.delta), axis=ax_sum) self.l1_s = tf.reduce_sum(tf.abs(self.delta_s), axis=ax_sum) self.l1_l2 = self.l2 + tf.multiply(self.l1, self.beta) self.l1_l2_s = self.l2_s + tf.multiply(self.l1_s, self.beta) # sum losses self.loss_l1 = tf.reduce_sum(self.l1) self.loss_l1_s = tf.reduce_sum(self.l1_s) self.loss_l2 = tf.reduce_sum(self.l2) self.loss_l2_s = tf.reduce_sum(self.l2_s) with tf.name_scope('loss_ae') as scope: # gamma * AE loss if self.mode == "PP" and callable(self.ae): self.loss_ae = self.gamma * tf.square(tf.norm(self.ae(self.delta) - self.delta)) self.loss_ae_s = self.gamma * tf.square(tf.norm(self.ae(self.delta_s) - self.delta_s)) elif self.mode == "PN" and callable(self.ae): self.loss_ae = self.gamma * tf.square(tf.norm(self.ae(self.adv) - self.adv)) self.loss_ae_s = self.gamma * tf.square(tf.norm(self.ae(self.adv_s) - self.adv_s)) else: # no auto-encoder available self.loss_ae = tf.constant(0.) self.loss_ae_s = tf.constant(0.) 
with tf.name_scope('loss_attack') as scope: if not self.model: self.loss_attack = tf.placeholder(tf.float32) else: # make predictions on perturbed instance (PN) or delta (PP) if self.mode == "PP": self.pred_proba = self.predict(self.delta) self.pred_proba_s = self.predict(self.delta_s) elif self.mode == "PN": self.pred_proba = self.predict(self.adv) self.pred_proba_s = self.predict(self.adv_s) # probability of target label prediction self.target_proba = tf.reduce_sum(self.target * self.pred_proba, 1) target_proba_s = tf.reduce_sum(self.target * self.pred_proba_s, 1) # max probability of non target label prediction self.nontarget_proba_max = tf.reduce_max((1 - self.target) * self.pred_proba - (self.target * 10000), 1) nontarget_proba_max_s = tf.reduce_max((1 - self.target) * self.pred_proba_s - (self.target * 10000), 1) # loss term f(x,d) for PP (eq.4) and PN (eq.2) if self.mode == "PP": loss_attack = tf.maximum(0.0, self.nontarget_proba_max - self.target_proba + self.kappa) loss_attack_s = tf.maximum(0.0, nontarget_proba_max_s - target_proba_s + self.kappa) elif self.mode == "PN": loss_attack = tf.maximum(0.0, -self.nontarget_proba_max + self.target_proba + self.kappa) loss_attack_s = tf.maximum(0.0, -nontarget_proba_max_s + target_proba_s + self.kappa) # c * f(x,d) self.loss_attack = tf.reduce_sum(self.const * loss_attack) self.loss_attack_s = tf.reduce_sum(self.const * loss_attack_s) with tf.name_scope('loss_combined') as scope: # no need for L1 term in loss to optimize when using FISTA if self.model: self.loss_opt = self.loss_attack_s + self.loss_l2_s + self.loss_ae_s else: # separate numerical computation of loss attack gradient self.loss_opt = self.loss_l2_s + self.loss_ae_s # add L1 term to overall loss; this is not the loss that will be directly optimized self.loss_total = self.loss_attack + self.loss_l2 + self.loss_ae + tf.multiply(self.beta, self.loss_l1) with tf.name_scope('training') as scope: self.learning_rate = tf.train.polynomial_decay(learning_rate_init, self.global_step, self.max_iterations, 0, power=0.5) optimizer = tf.train.GradientDescentOptimizer(self.learning_rate) start_vars = set(x.name for x in tf.global_variables()) # first compute, then apply grads self.compute_grads = optimizer.compute_gradients(self.loss_opt, var_list=[self.adv_s]) self.grad_ph = tf.placeholder(tf.float32, name='grad_adv_s') var = [tvar for tvar in tf.trainable_variables() if tvar.name.startswith('adv_s')][-1] # get the last in # case explainer is re-initialized and a new graph is created grad_and_var = [(self.grad_ph, var)] self.apply_grads = optimizer.apply_gradients(grad_and_var, global_step=self.global_step) end_vars = tf.global_variables() new_vars = [x for x in end_vars if x.name not in start_vars] # variables to initialize self.setup = [] # type: list self.setup.append(self.orig.assign(self.assign_orig)) self.setup.append(self.target.assign(self.assign_target)) self.setup.append(self.const.assign(self.assign_const)) self.setup.append(self.adv.assign(self.assign_adv)) self.setup.append(self.adv_s.assign(self.assign_adv_s)) self.setup.append(self.no_info.assign(self.assign_no_info)) self.init = tf.variables_initializer(var_list=[self.global_step] + [self.adv_s] + [self.adv] + new_vars) if self.write_dir is not None: writer = tf.summary.FileWriter(write_dir, tf.get_default_graph()) writer.add_graph(tf.get_default_graph())
def collect_variables(self):
  """Collect model variables; this call needs to be run at least once."""
  self.var_list = [
      v for v in tf.global_variables() if "emission_network" in v.name
  ]
  self.init_op = tf.variables_initializer(var_list=self.var_list)
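# A minimal, self-contained sketch of the pattern above (TF1.x graph mode):
# collect a named subset of the global variables and build an initializer for
# just that subset. The scope name and shape here are illustrative only, not
# taken from the original model.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

with tf.variable_scope("emission_network"):
    w = tf.get_variable("w", shape=[3, 3])

subset = [v for v in tf.global_variables() if "emission_network" in v.name]
init_subset = tf.variables_initializer(var_list=subset)

with tf.Session() as sess:
    sess.run(init_subset)     # initializes only the collected variables
    print(sess.run(w).shape)  # (3, 3)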
def __init__(self, config, batch_size, checkpoint_dir_or_path=None, var_name_substitutions=None, session_target='', **sample_kwargs): if tf.gfile.IsDirectory(checkpoint_dir_or_path): checkpoint_path = tf.train.latest_checkpoint( checkpoint_dir_or_path) else: checkpoint_path = checkpoint_dir_or_path self._config = copy.deepcopy(config) self._config.data_converter.set_mode('infer') self._config.hparams.batch_size = batch_size with tf.Graph().as_default(): model = self._config.model model.build(self._config.hparams, self._config.data_converter.output_depth, is_training=False) # Input placeholders self._temperature = tf.placeholder(tf.float32, shape=()) if self._config.hparams.z_size: self._z_input = tf.placeholder( tf.float32, shape=[batch_size, self._config.hparams.z_size]) else: self._z_input = None if self._config.data_converter.control_depth > 0: self._c_input = tf.placeholder( tf.float32, shape=[None, self._config.data_converter.control_depth]) else: self._c_input = None self._inputs = tf.placeholder( tf.float32, shape=[ batch_size, None, self._config.data_converter.input_depth ]) self._controls = tf.placeholder( tf.float32, shape=[ batch_size, None, self._config.data_converter.control_depth ]) self._inputs_length = tf.placeholder( tf.int32, shape=[batch_size] + list(self._config.data_converter.length_shape)) self._max_length = tf.placeholder(tf.int32, shape=()) # Outputs self._outputs, self._decoder_results = model.sample( batch_size, max_length=self._max_length, z=self._z_input, c_input=self._c_input, temperature=self._temperature, **sample_kwargs) if self._config.hparams.z_size: q_z = model.encode(self._inputs, self._inputs_length, self._controls) self._mu = q_z.loc self._sigma = q_z.scale.diag self._z = q_z.sample() var_map = None if var_name_substitutions is not None: var_map = {} for v in tf.global_variables(): var_name = v.name[:-2] # Strip ':0' suffix. for pattern, substitution in var_name_substitutions: var_name = re.sub(pattern, substitution, var_name) if var_name != v.name[:-2]: tf.logging.info('Renaming `%s` to `%s`.', v.name[:-2], var_name) var_map[var_name] = v # Restore graph self._sess = tf.Session(target=session_target) saver = tf.train.Saver(var_map) if (os.path.exists(checkpoint_path) and tarfile.is_tarfile(checkpoint_path)): tf.logging.info('Unbundling checkpoint.') with tempfile.TemporaryDirectory() as temp_dir: tar = tarfile.open(checkpoint_path) tar.extractall(temp_dir) # Assume only a single checkpoint is in the directory. for name in tar.getnames(): if name.endswith('.index'): checkpoint_path = os.path.join( temp_dir, name[0:-6]) break saver.restore(self._sess, checkpoint_path) else: saver.restore(self._sess, checkpoint_path)
def __init__(self, source_vocab_size, target_vocab_size, buckets, size,
             num_layers, max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, use_lstm=False, num_samples=512,
             forward_only=False):
  """Create the model.

  Args:
    source_vocab_size: size of the source vocabulary.
    target_vocab_size: size of the target vocabulary.
    buckets: a list of pairs (I, O), where I specifies maximum input length
      that will be processed in that bucket, and O specifies maximum output
      length. Training instances that have inputs longer than I or outputs
      longer than O will be pushed to the next bucket and padded accordingly.
      We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
    size: number of units in each layer of the model.
    num_layers: number of layers in the model.
    max_gradient_norm: gradients will be clipped to maximally this norm.
    batch_size: the size of the batches used during training;
      the model construction is independent of batch_size, so it can be
      changed after initialization if this is convenient, e.g., for decoding.
    learning_rate: learning rate to start with.
    learning_rate_decay_factor: decay learning rate by this much when needed.
    use_lstm: if true, we use LSTM cells instead of GRU cells.
    num_samples: number of samples for sampled softmax.
    forward_only: if set, we do not construct the backward pass in the model.
  """
  self.source_vocab_size = source_vocab_size
  self.target_vocab_size = target_vocab_size
  self.buckets = buckets
  self.batch_size = batch_size

  # This model builds a static TF1 graph (placeholders, tf.contrib ops), so
  # v2 behavior must stay disabled.
  import tensorflow.compat.v1 as tf
  tf.disable_v2_behavior()

  self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
  self.learning_rate_decay_op = self.learning_rate.assign(
      self.learning_rate * learning_rate_decay_factor)
  self.global_step = tf.Variable(0, trainable=False)

  # If we use sampled softmax, we need an output projection.
  output_projection = None
  softmax_loss_function = None
  # Sampled softmax only makes sense if we sample less than vocabulary size.
  if num_samples > 0 and num_samples < self.target_vocab_size:
    w = tf.get_variable("proj_w", [size, self.target_vocab_size])
    w_t = tf.transpose(w)
    b = tf.get_variable("proj_b", [self.target_vocab_size])
    output_projection = (w, b)

    def sampled_loss(labels, logits):
      labels = tf.reshape(labels, [-1, 1])
      return tf.nn.sampled_softmax_loss(w_t, b, labels, logits, num_samples,
                                        self.target_vocab_size)

    softmax_loss_function = sampled_loss

  # Create the internal multi-layer cell for our RNN.
  single_cell = tf.nn.rnn_cell.GRUCell(size)
  if use_lstm:
    single_cell = tf.nn.rnn_cell.BasicLSTMCell(size)
  cell = single_cell
  # Note: the dropout wrapper only takes effect in the single-layer case; the
  # multi-layer cell below is built from the bare single_cell.
  cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.5)
  if num_layers > 1:
    cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)

  # The seq2seq function: we use embedding for the input and attention.
  def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
    return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
        encoder_inputs,
        decoder_inputs,
        cell,
        num_encoder_symbols=source_vocab_size,
        num_decoder_symbols=target_vocab_size,
        embedding_size=size,
        output_projection=output_projection,
        feed_previous=do_decode)

  # Feeds for inputs.
  self.encoder_inputs = []
  self.decoder_inputs = []
  self.target_weights = []
  for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
    self.encoder_inputs.append(
        tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
  for i in xrange(buckets[-1][1] + 1):
    self.decoder_inputs.append(
        tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
    self.target_weights.append(
        tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))

  # Our targets are decoder inputs shifted by one.
  targets = [
      self.decoder_inputs[i + 1]
      for i in xrange(len(self.decoder_inputs) - 1)
  ]

  # Training outputs and losses.
  if forward_only:
    self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
        self.encoder_inputs, self.decoder_inputs, targets,
        self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True),
        softmax_loss_function=softmax_loss_function)
    # If we use output projection, we need to project outputs for decoding.
    if output_projection is not None:
      for b in xrange(len(buckets)):
        self.outputs[b] = [
            tf.matmul(output, output_projection[0]) + output_projection[1]
            for output in self.outputs[b]
        ]
  else:
    self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
        self.encoder_inputs, self.decoder_inputs, targets,
        self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, False),
        softmax_loss_function=softmax_loss_function)

  # Gradients and SGD update operation for training the model.
  params = tf.trainable_variables()
  if not forward_only:
    self.gradient_norms = []
    self.updates = []
    opt = tf.train.AdamOptimizer()
    for b in xrange(len(buckets)):
      gradients = tf.gradients(self.losses[b], params)
      clipped_gradients, norm = tf.clip_by_global_norm(
          gradients, max_gradient_norm)
      self.gradient_norms.append(norm)
      self.updates.append(
          opt.apply_gradients(zip(clipped_gradients, params),
                              global_step=self.global_step))

  self.saver = tf.train.Saver(tf.global_variables())
        var_weights = np.transpose(var_weights, (2, 3, 1, 0))
        ptr += num_params
        assign_ops.append(
            tf.assign(var1, var_weights, validate_shape=True))
        i += 1
        # print('ptr:', ptr)
    return assign_ops


if __name__ == '__main__':
    args = parser.parse_args()
    img_size = 512

    tf.disable_eager_execution()  # for placeholders
    with tf.name_scope('input'):
        input_data = tf.placeholder(dtype=tf.float32,
                                    shape=(None, img_size, img_size, 3),
                                    name='input_data')
    model = YOLOv3(input_data, num_classes=args.num_classes)

    load_ops = load_weights(tf.global_variables(), args.weight_file)
    saver = tf.train.Saver(tf.global_variables())

    with tf.Session() as sess:
        sess.run(load_ops)
        ckpt_path = os.path.join(args.save_path, 'yolov3_pretrained.ckpt')
        save_path = saver.save(sess, save_path=ckpt_path)
        print('Model saved in path: {}'.format(save_path))
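# Hypothetical follow-up sketch (not part of the original script): reloading the
# converted checkpoint for inference. It assumes the same YOLOv3 graph builder
# is importable; the path and class count below are illustrative only.
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

with tf.name_scope('input'):
    input_data = tf.placeholder(tf.float32, (None, 512, 512, 3), name='input_data')
model = YOLOv3(input_data, num_classes=80)  # same builder as above

saver = tf.train.Saver(tf.global_variables())
with tf.Session() as sess:
    saver.restore(sess, 'checkpoints/yolov3_pretrained.ckpt')  # path from the save step
    # outputs = sess.run(..., feed_dict={input_data: batch})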
def model_fn(features, labels, mode, params): """The model_fn argument for creating an Estimator.""" tf.logging.info("features = %s labels = %s mode = %s params=%s" % (features, labels, mode, params)) global_step = tf.train.get_global_step() graph = mtf.Graph() mesh = mtf.Mesh(graph, "my_mesh") logits, loss = mnist_model(features, labels, mesh) mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape) layout_rules = mtf.convert_to_layout_rules(FLAGS.layout) mesh_size = mesh_shape.size mesh_devices = [""] * mesh_size mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl( mesh_shape, layout_rules, mesh_devices) if mode == tf.estimator.ModeKeys.TRAIN: var_grads = mtf.gradients( [loss], [v.outputs[0] for v in graph.trainable_variables]) optimizer = mtf.optimize.AdafactorOptimizer() update_ops = optimizer.apply_grads(var_grads, graph.trainable_variables) lowering = mtf.Lowering(graph, {mesh: mesh_impl}) restore_hook = mtf.MtfRestoreHook(lowering) tf_logits = lowering.export_to_tf_tensor(logits) if mode != tf.estimator.ModeKeys.PREDICT: tf_loss = lowering.export_to_tf_tensor(loss) tf.summary.scalar("loss", tf_loss) if mode == tf.estimator.ModeKeys.TRAIN: tf_update_ops = [lowering.lowered_operation(op) for op in update_ops] tf_update_ops.append(tf.assign_add(global_step, 1)) train_op = tf.group(tf_update_ops) saver = tf.train.Saver( tf.global_variables(), sharded=True, max_to_keep=10, keep_checkpoint_every_n_hours=2, defer_build=False, save_relative_paths=True) tf.add_to_collection(tf.GraphKeys.SAVERS, saver) saver_listener = mtf.MtfCheckpointSaverListener(lowering) saver_hook = tf.train.CheckpointSaverHook( FLAGS.model_dir, save_steps=1000, saver=saver, listeners=[saver_listener]) accuracy = tf.metrics.accuracy( labels=labels, predictions=tf.argmax(tf_logits, axis=1)) # Name tensors to be logged with LoggingTensorHook. tf.identity(tf_loss, "cross_entropy") tf.identity(accuracy[1], name="train_accuracy") # Save accuracy scalar to Tensorboard output. tf.summary.scalar("train_accuracy", accuracy[1]) # restore_hook must come before saver_hook return tf.estimator.EstimatorSpec( tf.estimator.ModeKeys.TRAIN, loss=tf_loss, train_op=train_op, training_chief_hooks=[restore_hook, saver_hook]) if mode == tf.estimator.ModeKeys.PREDICT: predictions = { "classes": tf.argmax(tf_logits, axis=1), "probabilities": tf.nn.softmax(tf_logits), } return tf.estimator.EstimatorSpec( mode=tf.estimator.ModeKeys.PREDICT, predictions=predictions, prediction_hooks=[restore_hook], export_outputs={ "classify": tf.estimator.export.PredictOutput(predictions) }) if mode == tf.estimator.ModeKeys.EVAL: return tf.estimator.EstimatorSpec( mode=tf.estimator.ModeKeys.EVAL, loss=tf_loss, evaluation_hooks=[restore_hook], eval_metric_ops={ "accuracy": tf.metrics.accuracy( labels=labels, predictions=tf.argmax(tf_logits, axis=1)), })
def _init_graph(self) -> None: # Collect inputs. self.input_names = [] for param in inspect.signature(self._build_func).parameters.values(): if param.kind == param.POSITIONAL_OR_KEYWORD and param.default is param.empty: self.input_names.append(param.name) self.num_inputs = len(self.input_names) assert self.num_inputs >= 1 # Choose name and scope. if self.name is None: self.name = self._build_func_name assert re.match("^[A-Za-z0-9_.\\-]*$", self.name) with tf.name_scope(None): self.scope = tf.get_default_graph().unique_name(self.name, mark_as_used=True) # Finalize build func kwargs. build_kwargs = dict(self.static_kwargs) build_kwargs["is_template_graph"] = True build_kwargs["components"] = self.components # Build template graph. with tfutil.absolute_variable_scope(self.scope, reuse=tf.AUTO_REUSE), tfutil.absolute_name_scope(self.scope): # ignore surrounding scopes assert tf.get_variable_scope().name == self.scope assert tf.get_default_graph().get_name_scope() == self.scope with tf.control_dependencies(None): # ignore surrounding control dependencies self.input_templates = [tf.placeholder(tf.float32, name=name) for name in self.input_names] out_expr = self._build_func(*self.input_templates, **build_kwargs) # Collect outputs. assert tfutil.is_tf_expression(out_expr) or isinstance(out_expr, tuple) self.output_templates = [out_expr] if tfutil.is_tf_expression(out_expr) else list(out_expr) self.num_outputs = len(self.output_templates) assert self.num_outputs >= 1 assert all(tfutil.is_tf_expression(t) for t in self.output_templates) # Perform sanity checks. if any(t.shape.ndims is None for t in self.input_templates): raise ValueError("Network input shapes not defined. Please call x.set_shape() for each input.") if any(t.shape.ndims is None for t in self.output_templates): raise ValueError("Network output shapes not defined. Please call x.set_shape() where applicable.") if any(not isinstance(comp, Network) for comp in self.components.values()): raise ValueError("Components of a Network must be Networks themselves.") if len(self.components) != len(set(comp.name for comp in self.components.values())): raise ValueError("Components of a Network must have unique names.") # List inputs and outputs. self.input_shapes = [tfutil.shape_to_list(t.shape) for t in self.input_templates] self.output_shapes = [tfutil.shape_to_list(t.shape) for t in self.output_templates] self.input_shape = self.input_shapes[0] self.output_shape = self.output_shapes[0] self.output_names = [t.name.split("/")[-1].split(":")[0] for t in self.output_templates] # List variables. self.own_vars = OrderedDict((var.name[len(self.scope) + 1:].split(":")[0], var) for var in tf.global_variables(self.scope + "/")) self.vars = OrderedDict(self.own_vars) self.vars.update((comp.name + "/" + name, var) for comp in self.components.values() for name, var in comp.vars.items()) self.trainables = OrderedDict((name, var) for name, var in self.vars.items() if var.trainable) self.var_global_to_local = OrderedDict((var.name.split(":")[0], name) for name, var in self.vars.items())
def infer(output_path): """Run inference.""" tf.logging.info("Run inference") checkpoint = FLAGS.checkpoint_dir with tf.gfile.GFile(os.path.join(checkpoint, "config.json")) as f: config = json.load(f) if FLAGS.num_gpus > 1: config["num_gpus"] = FLAGS.num_gpus tf.logging.info("# infer config %s", config) # Load the data tf.logging.info("Loading data") features = _get_data(split=FLAGS.split, batch_size=FLAGS.batch_size) tf.logging.info("Loading model") model_class = configurable.Configurable.load(config["model"]) model = model_class("eval", config=config["model"]) outputs = model(features) tf.logging.info(outputs) sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) tf.logging.info(config["model"]["optimizer"]) # Get trainable / frozen vars trainable_vars, frozen_vars, _ = misc_util.get_trainable_vars( all_vars=tf.global_variables(), exclude_pattern=config["model"]["optimizer"]["nograd_var"]) # Make sure to load in the exponential moving average vars ema = tf.train.ExponentialMovingAverage(decay=0.9999) ema_vars = {} for var in trainable_vars: ema_vars[ema.average_name(var)] = var # Restoring EMA trainable vars tf.logging.info("Restoring ema.") saver = tf.train.Saver(ema_vars) ckpt_path = tf.train.latest_checkpoint(checkpoint) saver.restore(sess, ckpt_path) # Restoring frozen vars tf.logging.info("Restoring frozen.") saver_frozen = tf.train.Saver(frozen_vars) saver_frozen.restore(sess, ckpt_path) # Setup scaffolding and load in the saved model coord = tf.train.Coordinator() _ = tf.train.start_queue_runners(coord=coord, sess=sess) is_tgq = FLAGS.data_format == "tgq" result = {} try: i = 0 while True: predictions = sess.run(outputs) for qid, answer, start, end in zip(predictions["id"], predictions["a"], predictions["p1"], predictions["p2"]): if FLAGS.include_probabilities or is_tgq: output = {"answer": answer} output["start_prob"] = list([float(x) for x in start]) output["end_prob"] = list([float(x) for x in end]) if is_tgq: start, end, _ = evaluator_util.get_start_end( output["start_prob"], output["end_prob"]) output["start"] = start output["end"] = end else: output = answer result[qid] = output if i % 100 == 0: tf.logging.info(i) i += 1 except errors.OutOfRangeError: pass # Dump results to a file with tf.gfile.GFile(output_path, "w") as f: if is_tgq: for qid in result: # NOTE(thangluong): from chrisalberti@'s observation, # we need to subtract 1 to get good results. # To investigate; we could have added bos (found no evidence yet). start = result[qid]["start"] - 1 end = result[qid]["end"] - 1 f.write("%s\t-1\t%d\t%d\n" % (qid, start, end)) else: json.dump(result, f)
def model_fn(features, labels, mode, params): """A model is called by TpuEstimator.""" del labels global_step = tf.train.get_global_step() graph = mtf.Graph() mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape) layout_rules = mtf.convert_to_layout_rules(FLAGS.layout) if FLAGS.use_tpu: ctx = params['context'] num_hosts = ctx.num_hosts host_placement_fn = ctx.tpu_host_placement_function device_list = [host_placement_fn(host_id=t) for t in range(num_hosts)] tf.logging.info('device_list = %s' % device_list, ) # TODO(ylc): Better estimation of replica cache size? replica_cache_size = 300 * 1000000 # 300M per replica # Worker 0 caches all the TPU binaries. worker0_mem = replica_cache_size * ctx.num_replicas devices_memeory_usage = [worker0_mem] + [0] * (num_hosts - 1) var_placer = mtf.utils.BalancedVariablePlacer(device_list, devices_memeory_usage) mesh_devices = [''] * mesh_shape.size mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(mesh_shape, layout_rules, mesh_devices, ctx.device_assignment) else: var_placer = None mesh_devices = [''] * mesh_shape.size mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl( mesh_shape, layout_rules, mesh_devices) mesh = mtf.Mesh(graph, 'my_mesh', var_placer) with mtf.utils.outside_all_rewrites(): logits, loss = toy_model(features, mesh) # TRAIN mode if mode == tf.estimator.ModeKeys.TRAIN: var_grads = mtf.gradients( [loss], [v.outputs[0] for v in graph.trainable_variables]) if FLAGS.optimizer == 'Adafactor': optimizer = mtf.optimize.AdafactorOptimizer() else: assert FLAGS.optimizer == 'SGD' optimizer = mtf.optimize.SgdOptimizer(learning_rate=FLAGS.lr) update_ops = optimizer.apply_grads(var_grads, graph.trainable_variables) else: # for now, we can only export fully-replicated tensors. fully_replicated_logits = mtf.anonymize(logits) lowering = mtf.Lowering(graph, {mesh: mesh_impl}) tf_loss = tf.to_float(lowering.export_to_tf_tensor(loss)) if mode == tf.estimator.ModeKeys.TRAIN: tf_update_ops = [lowering.lowered_operation(op) for op in update_ops] tf_update_ops.append(tf.assign_add(global_step, 1)) tf.logging.info('tf_update_ops: {}'.format(tf_update_ops)) train_op = tf.group(tf_update_ops) else: tf_logits = lowering.export_to_tf_tensor(fully_replicated_logits) with mtf.utils.outside_all_rewrites(): # Copy master variables to slices. Must be called first. restore_hook = mtf.MtfRestoreHook(lowering) if mode == tf.estimator.ModeKeys.TRAIN: saver = tf.train.Saver(tf.global_variables(), sharded=True, max_to_keep=10, keep_checkpoint_every_n_hours=2, defer_build=False, save_relative_paths=True) tf.add_to_collection(tf.GraphKeys.SAVERS, saver) saver_listener = mtf.MtfCheckpointSaverListener(lowering) saver_hook = tf.train.CheckpointSaverHook( FLAGS.model_dir, save_steps=1000, saver=saver, listeners=[saver_listener]) return tpu_estimator.TPUEstimatorSpec( tf.estimator.ModeKeys.TRAIN, loss=tf_loss, train_op=train_op, training_hooks=[restore_hook, saver_hook]) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(tf_logits): mean_logits = tf.metrics.mean(tf_logits) return {'mean_logits': mean_logits} eval_metrics = (metric_fn, [tf_logits]) return tpu_estimator.TPUEstimatorSpec( tf.estimator.ModeKeys.EVAL, evaluation_hooks=[restore_hook], loss=tf_loss, eval_metrics=eval_metrics)
def __init__( self, predict_fn: Union[Callable, tf.keras.Model, 'keras.Model'], shape: Tuple[int, ...], distance_fn: str = 'l1', target_proba: float = 1.0, target_class: Union[str, int] = 'other', max_iter: int = 1000, early_stop: int = 50, lam_init: float = 1e-1, max_lam_steps: int = 10, tol: float = 0.05, learning_rate_init=0.1, feature_range: Union[Tuple, str] = (-1e10, 1e10), eps: Union[float, np.ndarray] = 0.01, # feature-wise epsilons init: str = 'identity', decay: bool = True, write_dir: str = None, debug: bool = False, sess: tf.Session = None) -> None: """ Initialize counterfactual explanation method based on Wachter et al. (2017) Parameters ---------- predict_fn Keras or TensorFlow model or any other model's prediction function returning class probabilities shape Shape of input data starting with batch size distance_fn Distance function to use in the loss term target_proba Target probability for the counterfactual to reach target_class Target class for the counterfactual to reach, one of 'other', 'same' or an integer denoting desired class membership for the counterfactual instance max_iter Maximum number of interations to run the gradient descent for (inner loop) early_stop Number of steps after which to terminate gradient descent if all or none of found instances are solutions lam_init Initial regularization constant for the prediction part of the Wachter loss max_lam_steps Maximum number of times to adjust the regularization constant (outer loop) before terminating the search tol Tolerance for the counterfactual target probability learning_rate_init Initial learning rate for each outer loop of lambda feature_range Tuple with min and max ranges to allow for perturbed instances. Min and max ranges can be floats or numpy arrays with dimension (1 x nb of features) for feature-wise ranges eps Gradient step sizes used in calculating numerical gradients, defaults to a single value for all features, but can be passed an array for feature-wise step sizes init Initialization method for the search of counterfactuals, currently must be 'identity' decay Flag to decay learning rate to zero for each outer loop over lambda write_dir Directory to write Tensorboard files to debug Flag to write Tensorboard summaries for debugging sess Optional Tensorflow session that will be used if passed instead of creating or inferring one internally """ super().__init__(meta=copy.deepcopy(DEFAULT_META_CF)) # get params for storage in meta params = locals() remove = ['self', 'predict_fn', 'sess', '__class__'] for key in remove: params.pop(key) self.meta['params'].update(params) self.data_shape = shape self.batch_size = shape[0] self.target_class = target_class # options for the optimizer self.max_iter = max_iter self.lam_init = lam_init self.tol = tol self.max_lam_steps = max_lam_steps self.early_stop = early_stop self.eps = eps self.init = init self.feature_range = feature_range self.target_proba_arr = target_proba * np.ones(self.batch_size) self.debug = debug # check if the passed object is a model and get session is_model, is_keras, model_sess = _check_keras_or_tf(predict_fn) self.meta['params'].update(is_model=is_model, is_keras=is_keras) # if session provided, use it if isinstance(sess, tf.Session): self.sess = sess else: self.sess = model_sess if is_model: # Keras or TF model self.model = True self.predict_fn = predict_fn.predict # type: ignore # array function self.predict_tn = predict_fn # tensor function else: # black-box model self.predict_fn = predict_fn self.predict_tn = None self.model = False 
self.n_classes = self.predict_fn(np.zeros(shape)).shape[1] # flag to keep track if explainer is fit or not self.fitted = False # set up graph session for optimization (counterfactual search) with tf.variable_scope('cf_search', reuse=tf.AUTO_REUSE): # define variables for original and candidate counterfactual instances, target labels and lambda self.orig = tf.get_variable('original', shape=shape, dtype=tf.float32) self.cf = tf.get_variable( 'counterfactual', shape=shape, dtype=tf.float32, constraint=lambda x: tf.clip_by_value(x, feature_range[0], feature_range[1])) # the following will be a 1-hot encoding of the target class (as predicted by the model) self.target = tf.get_variable('target', shape=(self.batch_size, self.n_classes), dtype=tf.float32) # constant target probability and global step variable self.target_proba = tf.constant(target_proba * np.ones(self.batch_size), dtype=tf.float32, name='target_proba') self.global_step = tf.Variable(0.0, trainable=False, name='global_step') # lambda hyperparameter - placeholder instead of variable as annealed in first epoch self.lam = tf.placeholder(tf.float32, shape=(self.batch_size), name='lam') # define placeholders that will be assigned to relevant variables self.assign_orig = tf.placeholder(tf.float32, shape, name='assing_orig') self.assign_cf = tf.placeholder(tf.float32, shape, name='assign_cf') self.assign_target = tf.placeholder(tf.float32, shape=(self.batch_size, self.n_classes), name='assign_target') # L1 distance and MAD constants # TODO: MADs? ax_sum = list(np.arange(1, len(self.data_shape))) if distance_fn == 'l1': self.dist = tf.reduce_sum(tf.abs(self.cf - self.orig), axis=ax_sum, name='l1') else: logger.exception('Distance metric %s not supported', distance_fn) raise ValueError # distance loss self.loss_dist = self.lam * self.dist # prediction loss if not self.model: # will need to calculate gradients numerically self.loss_opt = self.loss_dist else: # autograd gradients throughout self.pred_proba = self.predict_tn(self.cf) # 3 cases for target_class if target_class == 'same': self.pred_proba_class = tf.reduce_max( self.target * self.pred_proba, 1) elif target_class == 'other': self.pred_proba_class = tf.reduce_max( (1 - self.target) * self.pred_proba, 1) elif target_class in range(self.n_classes): # if class is specified, this is known in advance self.pred_proba_class = tf.reduce_max( tf.one_hot( target_class, self.n_classes, dtype=tf.float32) * self.pred_proba, 1) else: logger.exception('Target class %s unknown', target_class) raise ValueError self.loss_pred = tf.square(self.pred_proba_class - self.target_proba) self.loss_opt = self.loss_pred + self.loss_dist # optimizer if decay: self.learning_rate = tf.train.polynomial_decay( learning_rate_init, self.global_step, self.max_iter, 0.0, power=1.0) else: self.learning_rate = tf.convert_to_tensor(learning_rate_init) # TODO optional argument to change type, learning rate scheduler opt = tf.train.AdamOptimizer(self.learning_rate) # first compute gradients, then apply them self.compute_grads = opt.compute_gradients(self.loss_opt, var_list=[self.cf]) self.grad_ph = tf.placeholder(shape=shape, dtype=tf.float32, name='grad_cf') grad_and_var = [(self.grad_ph, self.cf)] self.apply_grads = opt.apply_gradients( grad_and_var, global_step=self.global_step) # variables to initialize self.setup = [] # type: list self.setup.append(self.orig.assign(self.assign_orig)) self.setup.append(self.cf.assign(self.assign_cf)) self.setup.append(self.target.assign(self.assign_target)) self.tf_init = 
tf.variables_initializer(var_list=tf.global_variables( scope='cf_search')) # tensorboard if write_dir is not None: self.writer = tf.summary.FileWriter(write_dir, tf.get_default_graph()) self.writer.add_graph(tf.get_default_graph()) # return templates self.instance_dict = dict.fromkeys( ['X', 'distance', 'lambda', 'index', 'class', 'proba', 'loss']) self.return_dict = copy.deepcopy(DEFAULT_DATA_CF) self.return_dict['all'] = {i: [] for i in range(self.max_lam_steps)}
def testFlopRegularizerWithContribFC(self):
  """Test MatMul FLOPs regularizer with the tf.contrib fully_connected layer.

  The structure of the fully connected network used in this test is the same
  as that used in testFlopRegularizerWithMatMul.
  """
  tf.reset_default_graph()
  tf.set_random_seed(1234)
  # Create test networks with tf.contrib.layers.fully_connected and initialize
  # the variables.
  with slim.arg_scope(
      [contrib_layers.fully_connected],
      weights_initializer=tf.random_normal_initializer,
      biases_initializer=tf.random_normal_initializer):
    x = tf.constant(1.0, shape=[2, 6], name='x', dtype=tf.float32)
    net = contrib_layers.fully_connected(x, 4, scope='matmul1')
    net = contrib_layers.fully_connected(net, 1, scope='matmul2')
    name_to_variable = {v.op.name: v for v in tf.global_variables()}
  with self.cached_session():
    tf.global_variables_initializer().run()

  # Create FLOPs network regularizer.
  threshold = 0.9
  flop_reg = flop_regularizer.GroupLassoFlopsRegularizer([net.op], threshold,
                                                         0)
  with self.cached_session() as session:
    evaluated_vars = session.run(name_to_variable)

  # Compute the regularizer vector for each layer.
  def group_norm(weights, axis=(0, 1, 2)):  # pylint: disable=invalid-name
    return np.sqrt(np.mean(weights**2, axis=axis))

  regularizer_vec = {
      'matmul1': group_norm(evaluated_vars['matmul1/weights'], axis=(0,)),
      'matmul2': group_norm(evaluated_vars['matmul2/weights'], axis=(0,))
  }

  # Sanity check to make sure that not all outputs are alive or dead.
  total_outputs = (
      regularizer_vec['matmul1'].shape[0] +
      regularizer_vec['matmul2'].shape[0])
  total_alive = sum(
      [np.sum(val > threshold) for val in regularizer_vec.values()])
  assert total_alive > 0, ('All outputs are dead. Decrease the threshold.')
  assert total_alive < total_outputs, (
      'All outputs are alive. Increase the threshold.')

  # Compute the expected FLOP cost and regularization term. The L2 norm of
  # columns in the weight matrix of layer matmul1 is [2.15381098, 2.57671237,
  # 2.12560201, 2.2081387] and that of layer matmul2 is [1.72404861]. With a
  # threshold of 2.2 on these L2 norms (i.e., 0.9 on the RMS group norms used
  # above), two outputs of the matmul1 layer are alive.
  matmul1_live_input = 6
  matmul1_live_output = sum(regularizer_vec['matmul1'] > threshold)
  expected_flop_cost = (
      _coeff(_get_op('matmul1/MatMul')) * matmul1_live_input *
      matmul1_live_output)
  regularizer1 = np.sum(regularizer_vec['matmul1'])
  regularizer2 = np.sum(regularizer_vec['matmul2'])
  expected_reg_term = (
      _coeff(_get_op('matmul1/MatMul')) * matmul1_live_input * regularizer1 +
      _coeff(_get_op('matmul2/MatMul')) * matmul1_live_output * regularizer2)
  with self.cached_session() as session:
    self.assertEqual(
        round(flop_reg.get_cost().eval()), round(expected_flop_cost))
    self.assertNearRelatively(flop_reg.get_regularization_term().eval(),
                              expected_reg_term)
def forward(self, inputs, init_with_pretrain_vgg=False, pre_trained_model='./vgg16/vgg_16.ckpt'): # feature extraction part and also the share network reuse_fnet = len( [v for v in tf.global_variables() if v.name.startswith('FNet')]) > 0 with tf.variable_scope('FNet', reuse=reuse_fnet): # feature extraction self.conv1_1 = self._conv2d(inputs, dim=64, name='conv1_1') self.conv1_2 = self._conv2d(self.conv1_1, dim=64, name='conv1_2') self.pool1 = self._max_pool2d(self.conv1_2) # 256 => /2 self.conv2_1 = self._conv2d(self.pool1, dim=128, name='conv2_1') self.conv2_2 = self._conv2d(self.conv2_1, dim=128, name='conv2_2') self.pool2 = self._max_pool2d(self.conv2_2) # 128 => /4 self.conv3_1 = self._conv2d(self.pool2, dim=256, name='conv3_1') self.conv3_2 = self._conv2d(self.conv3_1, dim=256, name='conv3_2') self.conv3_3 = self._conv2d(self.conv3_2, dim=256, name='conv3_3') self.pool3 = self._max_pool2d(self.conv3_3) # 64 => /8 self.conv4_1 = self._conv2d(self.pool3, dim=512, name='conv4_1') self.conv4_2 = self._conv2d(self.conv4_1, dim=512, name='conv4_2') self.conv4_3 = self._conv2d(self.conv4_2, dim=512, name='conv4_3') self.pool4 = self._max_pool2d(self.conv4_3) # 32 => /16 self.conv5_1 = self._conv2d(self.pool4, dim=512, name='conv5_1') self.conv5_2 = self._conv2d(self.conv5_1, dim=512, name='conv5_2') self.conv5_3 = self._conv2d(self.conv5_2, dim=512, name='conv5_3') self.pool5 = self._max_pool2d(self.conv5_3) # 16 => /32 # init feature extraction part from pre-train vgg16 if init_with_pretrain_vgg: tf.train.init_from_checkpoint(pre_trained_model, self.pre_train_restore_map) # input size for logits predict [n, h, w, c] = inputs.shape.as_list() reuse_cw_net = len( [v for v in tf.global_variables() if v.name.startswith('CWNet')]) > 0 with tf.variable_scope('CWNet', reuse=reuse_cw_net): # upsample up2 = ( self._upconv2d(self.pool5, dim=256, act='linear', name='up2_1') # 32 => /16 + self._conv2d( self.pool4, dim=256, act='linear', name='pool4_s')) self.up2_cw = self._conv2d(up2, dim=256, name='up2_3') up4 = ( self._upconv2d( self.up2_cw, dim=128, act='linear', name='up4_1') # 64 => /8 + self._conv2d( self.pool3, dim=128, act='linear', name='pool3_s')) self.up4_cw = self._conv2d(up4, dim=128, name='up4_3') up8 = ( self._upconv2d(self.up4_cw, dim=64, act='linear', name='up8_1') # 128 => /4 + self._conv2d(self.pool2, dim=64, act='linear', name='pool2_s')) self.up8_cw = self._conv2d(up8, dim=64, name='up8_2') up16 = ( self._upconv2d( self.up8_cw, dim=32, act='linear', name='up16_1') # 256 => /2 + self._conv2d(self.pool1, dim=32, act='linear', name='pool1_s')) self.up16_cw = self._conv2d(up16, dim=32, name='up16_2') # predict logits logits_cw = self._up_bilinear(self.up16_cw, dim=3, shape=(h, w), name='logits') # decode network for room type detection reuse_rnet = len( [v for v in tf.global_variables() if v.name.startswith('RNet')]) > 0 with tf.variable_scope('RNet', reuse=reuse_rnet): # upsample up2 = ( self._upconv2d(self.pool5, dim=256, act='linear', name='up2_1') # 32 => /16 + self._conv2d( self.pool4, dim=256, act='linear', name='pool4_s')) up2 = self._conv2d(up2, dim=256, name='up2_2') up2, _ = self._non_local_context(self.up2_cw, up2, name='context_up2') up4 = ( self._upconv2d(up2, dim=128, act='linear', name='up4_1') # 64 => /8 + self._conv2d( self.pool3, dim=128, act='linear', name='pool3_s')) up4 = self._conv2d(up4, dim=128, name='up4_2') up4, _ = self._non_local_context(self.up4_cw, up4, name='context_up4') up8 = ( self._upconv2d(up4, dim=64, act='linear', name='up8_1') # 128 => /4 + 
self._conv2d(self.pool2, dim=64, act='linear', name='pool2_s')) up8 = self._conv2d(up8, dim=64, name='up8_2') up8, _ = self._non_local_context(self.up8_cw, up8, name='context_up8') up16 = ( self._upconv2d(up8, dim=32, act='linear', name='up16_1') # 256 => /2 + self._conv2d(self.pool1, dim=32, act='linear', name='pool1_s')) up16 = self._conv2d(up16, dim=32, name='up16_2') self.up16_r, self.a = self._non_local_context(self.up16_cw, up16, name='context_up16') # predict logits logits_r = self._up_bilinear(self.up16_r, dim=9, shape=(h, w), name='logits') return logits_r, logits_cw
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)
  model_dir = os.path.expanduser(FLAGS.model_dir)
  output_dir = os.path.expanduser(FLAGS.output_dir)
  out_base_file = os.path.join(output_dir, "model.ckpt")

  # Copy flags.txt with the original time, so t2t-bleu can report correct
  # relative time.
  tf.gfile.MakeDirs(FLAGS.output_dir)
  if (not os.path.exists(os.path.join(output_dir, "flags.txt")) and
      os.path.exists(os.path.join(model_dir, "flags.txt"))):
    shutil.copy2(os.path.join(model_dir, "flags.txt"),
                 os.path.join(output_dir, "flags.txt"))

  models_processed = 0
  queue = deque()
  for model in bleu_hook.stepfiles_iterator(model_dir, FLAGS.wait_minutes,
                                            FLAGS.min_steps):
    if models_processed == 0:
      var_list = tf.train.list_variables(model.filename)
      avg_values = {}
      for (name, shape) in var_list:
        if not (name.startswith("global_step") or
                name.startswith("train_stats/")):
          avg_values[name] = np.zeros(shape)
    models_processed += 1

    tf.logging.info("Loading [%d]: %s" % (models_processed, model.filename))
    reader = tf.train.load_checkpoint(model.filename)
    for name in avg_values:
      avg_values[name] += reader.get_tensor(name) / FLAGS.n
    queue.append(model)
    if len(queue) < FLAGS.n:
      continue

    out_file = "%s-%d" % (out_base_file, model.steps)
    tf_vars = []
    tf.logging.info("Averaging %s" % (out_file))
    for (name, value) in six.iteritems(avg_values):
      # TODO(martinpopel): dtype=var_dtypes[name]
      tf_vars.append(tf.get_variable(name, shape=value.shape))
    placeholders = [tf.placeholder(v.dtype, shape=v.shape) for v in tf_vars]
    assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)]

    global_step = tf.get_variable(
        "global_step",
        initializer=tf.constant(model.steps, dtype=tf.int64),
        trainable=False)
    with tf.variable_scope("train_stats"):
      tf.get_variable("problem_0_steps", initializer=0, trainable=False)
    saver = tf.train.Saver(tf.global_variables())

    tf.logging.info("Running session for %s" % (out_file))
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      for p, assign_op, (name, value) in zip(placeholders, assign_ops,
                                             six.iteritems(avg_values)):
        sess.run(assign_op, {p: value})
      tf.logging.info("Storing to %s" % out_file)
      saver.save(sess, out_base_file, global_step=global_step)
    os.utime(out_file + ".index", (model.mtime, model.mtime))

    tf.reset_default_graph()
    first_model = queue.popleft()

    reader = tf.train.load_checkpoint(first_model.filename)
    for name in avg_values:
      avg_values[name] -= reader.get_tensor(name) / FLAGS.n
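# Minimal numpy sketch (illustrative, not part of the script above) of the
# sliding-window average the loop maintains: each checkpoint contributes
# tensor / n, and the oldest contribution is subtracted when the window slides.
import numpy as np
from collections import deque

n = 3
window, running = deque(), None
for ckpt_value in [np.full(2, 1.0), np.full(2, 2.0), np.full(2, 3.0), np.full(2, 4.0)]:
    running = ckpt_value / n if running is None else running + ckpt_value / n
    window.append(ckpt_value)
    if len(window) < n:
        continue
    print(running)                    # average of the last n "checkpoints"
    running -= window.popleft() / n   # drop the oldest, as in the loop above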
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, v_mix_coef=0.5, max_grad_norm=0.5, lr_alpha=7e-4, lr_beta=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', r_ex_coef=1.0, r_in_coef=0.0, v_ex_coef=1.0): sess = tf_util.make_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch], 'A') R_EX = tf.placeholder(tf.float32, [nbatch], 'R_EX') ADV_EX = tf.placeholder(tf.float32, [nbatch], 'ADV_EX') RET_EX = tf.placeholder(tf.float32, [nbatch], 'RET_EX') V_MIX = tf.placeholder(tf.float32, [nbatch], 'V_MIX') DIS_V_MIX_LAST = tf.placeholder(tf.float32, [nbatch], 'DIS_V_MIX_LAST') COEF_MAT = tf.placeholder(tf.float32, [nbatch, nbatch], 'COEF_MAT') LR_ALPHA = tf.placeholder(tf.float32, [], 'LR_ALPHA') LR_BETA = tf.placeholder(tf.float32, [], 'LR_BETA') step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True) r_mix = r_ex_coef * R_EX + r_in_coef * tf.reduce_sum( train_model.r_in * tf.one_hot(A, nact), axis=1) ret_mix = tf.squeeze( tf.matmul(COEF_MAT, tf.reshape(r_mix, [nbatch, 1])), [1]) + DIS_V_MIX_LAST adv_mix = ret_mix - V_MIX neglogpac = train_model.pd.neglogp(A) pg_mix_loss = tf.reduce_mean(adv_mix * neglogpac) v_mix_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_mix), ret_mix)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) policy_loss = pg_mix_loss - ent_coef * entropy + v_mix_coef * v_mix_loss policy_params = train_model.policy_params policy_grads = tf.gradients(policy_loss, policy_params) if max_grad_norm is not None: policy_grads, policy_grad_norm = tf.clip_by_global_norm( policy_grads, max_grad_norm) policy_grads_and_vars = list(zip(policy_grads, policy_params)) policy_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_ALPHA, decay=alpha, epsilon=epsilon) policy_train = policy_trainer.apply_gradients(policy_grads_and_vars) rmss = [policy_trainer.get_slot(var, 'rms') for var in policy_params] policy_params_new = {} for grad, rms, var in zip(policy_grads, rmss, policy_params): ms = rms + (tf.square(grad) - rms) * (1 - alpha) policy_params_new[ var.name] = var - LR_ALPHA * grad / tf.sqrt(ms + epsilon) policy_new = train_model.policy_new_fn(policy_params_new, ob_space, ac_space, nbatch, nsteps) neglogpac_new = policy_new.pd.neglogp(A) ratio_new = tf.exp(tf.stop_gradient(neglogpac) - neglogpac_new) pg_ex_loss = tf.reduce_mean(-ADV_EX * ratio_new) v_ex_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_ex), RET_EX)) intrinsic_loss = pg_ex_loss + v_ex_coef * v_ex_loss intrinsic_params = train_model.intrinsic_params intrinsic_grads = tf.gradients(intrinsic_loss, intrinsic_params) if max_grad_norm is not None: intrinsic_grads, intrinsic_grad_norm = tf.clip_by_global_norm( intrinsic_grads, max_grad_norm) intrinsic_grads_and_vars = list(zip(intrinsic_grads, intrinsic_params)) intrinsic_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_BETA, decay=alpha, epsilon=epsilon) intrinsic_train = intrinsic_trainer.apply_gradients( intrinsic_grads_and_vars) lr_alpha = Scheduler(v=lr_alpha, nvalues=total_timesteps, schedule=lrschedule) lr_beta = Scheduler(v=lr_beta, nvalues=total_timesteps, schedule=lrschedule) all_params = tf.global_variables() def train(obs, policy_states, masks, actions, r_ex, ret_ex, v_ex, v_mix, dis_v_mix_last, coef_mat): advs_ex = ret_ex - v_ex for step in range(len(obs)): cur_lr_alpha = lr_alpha.value() cur_lr_beta = lr_beta.value() td_map = { train_model.X: obs, policy_new.X: obs, A: 
actions, R_EX: r_ex, ADV_EX: advs_ex, RET_EX: ret_ex, V_MIX: v_mix, DIS_V_MIX_LAST: dis_v_mix_last, COEF_MAT: coef_mat, LR_ALPHA: cur_lr_alpha, LR_BETA: cur_lr_beta } if policy_states is not None: td_map[train_model.PS] = policy_states td_map[train_model.M] = masks return sess.run([entropy, policy_train, intrinsic_train], td_map)[0] def save(save_path): ps = sess.run(all_params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(all_params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.intrinsic_reward = step_model.intrinsic_reward self.init_policy_state = step_model.init_policy_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
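The ret_mix computation above relies on COEF_MAT: multiplying the reward vector by a matrix of discount powers and adding DIS_V_MIX_LAST reproduces the bootstrapped discounted return. The matrix itself is built by the runner (not shown); the numpy sketch below only checks that algebra for a single rollout with no episode boundaries, under the assumption coef_mat[i, j] = gamma**(j - i) for j >= i.

import numpy as np

T, gamma = 5, 0.99
r = np.random.randn(T)
v_last = 0.7                                   # value estimate after the rollout

coef_mat = np.array([[gamma**(j - i) if j >= i else 0.0
                      for j in range(T)] for i in range(T)])
dis_v_last = gamma**(T - np.arange(T)) * v_last
ret = coef_mat @ r + dis_v_last

# Reference: backward recursion R_t = r_t + gamma * R_{t+1}, seeded with v_last.
ref, acc = np.zeros(T), v_last
for t in reversed(range(T)):
    acc = r[t] + gamma * acc
    ref[t] = acc
assert np.allclose(ret, ref)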
def _init_graph(self) -> None: assert self._var_inits is not None assert self._input_templates is None assert self._output_templates is None assert self._own_vars is None # Initialize components. if self._components is None: self._components = util.EasyDict() # Choose build func kwargs. build_kwargs = dict(self.static_kwargs) build_kwargs["is_template_graph"] = True build_kwargs["components"] = self._components # Override scope and device, and ignore surrounding control dependencies. with tfutil.absolute_variable_scope(self.scope, reuse=False), tfutil.absolute_name_scope(self.scope), tf.device(self.device), tf.control_dependencies(None): assert tf.get_variable_scope().name == self.scope assert tf.get_default_graph().get_name_scope() == self.scope # Create input templates. self._input_templates = [] for param in inspect.signature(self._build_func).parameters.values(): if param.kind == param.POSITIONAL_OR_KEYWORD and param.default is param.empty: self._input_templates.append(tf.placeholder(tf.float32, name=param.name)) # Call build func. out_expr = self._build_func(*self._input_templates, **build_kwargs) # Collect output templates and variables. assert tfutil.is_tf_expression(out_expr) or isinstance(out_expr, tuple) self._output_templates = [out_expr] if tfutil.is_tf_expression(out_expr) else list(out_expr) self._own_vars = OrderedDict((var.name[len(self.scope) + 1:].split(":")[0], var) for var in tf.global_variables(self.scope + "/")) # Check for errors. if len(self._input_templates) == 0: raise ValueError("Network build func did not list any inputs.") if len(self._output_templates) == 0: raise ValueError("Network build func did not return any outputs.") if any(not tfutil.is_tf_expression(t) for t in self._output_templates): raise ValueError("Network outputs must be TensorFlow expressions.") if any(t.shape.ndims is None for t in self._input_templates): raise ValueError("Network input shapes not defined. Please call x.set_shape() for each input.") if any(t.shape.ndims is None for t in self._output_templates): raise ValueError("Network output shapes not defined. Please call x.set_shape() where applicable.") if any(not isinstance(comp, Network) for comp in self._components.values()): raise ValueError("Components of a Network must be Networks themselves.") if len(self._components) != len(set(comp.name for comp in self._components.values())): raise ValueError("Components of a Network must have unique names.") # Initialize variables. if len(self._var_inits): tfutil.set_vars({self._get_vars()[name]: value for name, value in self._var_inits.items() if name in self._get_vars()}) remaining_inits = [var.initializer for name, var in self._own_vars.items() if name not in self._var_inits] if self._all_inits_known: assert len(remaining_inits) == 0 else: tfutil.run(remaining_inits) self._var_inits = None
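The keys of _own_vars above are the variable names made relative to the network scope, with the ":0" output suffix stripped. A pure-string restatement (the variable names are invented for illustration):

def relative_var_name(full_name, scope):
    return full_name[len(scope) + 1:].split(":")[0]

assert relative_var_name("G/mapping/Dense0/weight:0", "G") == "mapping/Dense0/weight"
assert relative_var_name("G/synthesis/noise0:0", "G") == "synthesis/noise0"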
def rebuild_graph(self, path, model_name, full_assign=False, train_data=None): if train_data is None: raise ValueError("SVDpp model must provide train_data " "when rebuilding graph") sparse_implicit_interaction = sparse_tensor_interaction( train_data, recent_num=10) self._build_model(sparse_implicit_interaction) self._build_train_ops() variable_path = os.path.join(path, f"{model_name}_variables.npz") variables = np.load(variable_path) variables = dict(variables.items()) ( user_variables, item_variables, sparse_variables, dense_variables, manual_variables ) = modify_variable_names(self, trainable=True) update_ops = [] for v in tf.trainable_variables(): if user_variables is not None and v.name in user_variables: # no need to remove oov values old_var = variables[v.name] user_op = tf.IndexedSlices(old_var, tf.range(len(old_var))) update_ops.append(v.scatter_update(user_op)) if item_variables is not None and v.name in item_variables: old_var = variables[v.name] item_op = tf.IndexedSlices(old_var, tf.range(len(old_var))) update_ops.append(v.scatter_update(item_op)) if full_assign: ( optimizer_user_variables, optimizer_item_variables, optimizer_sparse_variables, optimizer_dense_variables, _ ) = modify_variable_names(self, trainable=False) other_variables = [v for v in tf.global_variables() if v.name not in manual_variables] for v in other_variables: if (optimizer_user_variables is not None and v.name in optimizer_user_variables): old_var = variables[v.name] user_op = tf.IndexedSlices(old_var, tf.range(len(old_var))) update_ops.append(v.scatter_update(user_op)) elif (optimizer_item_variables is not None and v.name in optimizer_item_variables): old_var = variables[v.name] item_op = tf.IndexedSlices(old_var, tf.range(len(old_var))) update_ops.append(v.scatter_update(item_op)) else: old_var = variables[v.name] update_ops.append(v.assign(old_var)) self.sess.run(update_ops)
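The scatter_update calls above write the saved rows of a user or item table back by index, which matters when the rebuilt table is allocated with extra (e.g. out-of-vocabulary) rows that should keep their fresh initialization. A numpy shape sketch of that row-wise restore, with made-up sizes:

import numpy as np

saved = np.arange(12, dtype=np.float32).reshape(4, 3)   # 4 rows saved earlier
rebuilt = np.zeros((5, 3), dtype=np.float32)            # 4 rows + 1 extra row
rebuilt[np.arange(len(saved))] = saved                   # indexed row update
assert np.allclose(rebuilt[:4], saved) and np.allclose(rebuilt[4], 0.0)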
def train(): # Create training and validation datasets train_set = create_dataset(FLAGS.train_files.split(','), batch_size=FLAGS.train_batch_size, cache_path=FLAGS.feature_cache, train_phase=True) iterator = tfv1.data.Iterator.from_structure( tfv1.data.get_output_types(train_set), tfv1.data.get_output_shapes(train_set), output_classes=tfv1.data.get_output_classes(train_set)) # Make initialization ops for switching between the two sets train_init_op = iterator.make_initializer(train_set) if FLAGS.dev_files: dev_csvs = FLAGS.dev_files.split(',') dev_sets = [ create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False) for csv in dev_csvs ] dev_init_ops = [ iterator.make_initializer(dev_set) for dev_set in dev_sets ] # Dropout dropout_rates = [ tfv1.placeholder(tf.float32, name='dropout_{}'.format(i)) for i in range(6) ] dropout_feed_dict = { dropout_rates[0]: FLAGS.dropout_rate, dropout_rates[1]: FLAGS.dropout_rate2, dropout_rates[2]: FLAGS.dropout_rate3, dropout_rates[3]: FLAGS.dropout_rate4, dropout_rates[4]: FLAGS.dropout_rate5, dropout_rates[5]: FLAGS.dropout_rate6, } no_dropout_feed_dict = {rate: 0. for rate in dropout_rates} # Building the graph optimizer = create_optimizer() gradients, loss, non_finite_files = get_tower_results( iterator, optimizer, dropout_rates) # Average tower gradients across GPUs avg_tower_gradients = average_gradients(gradients) log_grads_and_vars(avg_tower_gradients) # global_step is automagically incremented by the optimizer global_step = tfv1.train.get_or_create_global_step() apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step) # Summaries step_summaries_op = tfv1.summary.merge_all('step_summaries') step_summary_writers = { 'train': tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120), 'dev': tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'dev'), max_queue=120) } # Checkpointing checkpoint_saver = tfv1.train.Saver(max_to_keep=FLAGS.max_to_keep) checkpoint_path = os.path.join(FLAGS.checkpoint_dir, 'train') checkpoint_filename = 'checkpoint' best_dev_saver = tfv1.train.Saver(max_to_keep=1) best_dev_path = os.path.join(FLAGS.checkpoint_dir, 'best_dev') best_dev_filename = 'best_dev_checkpoint' # Save flags next to checkpoints os.makedirs(FLAGS.checkpoint_dir, exist_ok=True) flags_file = os.path.join(FLAGS.checkpoint_dir, 'flags.txt') with open(flags_file, 'w') as fout: fout.write(FLAGS.flags_into_string()) initializer = tfv1.global_variables_initializer() with tfv1.Session(config=Config.session_config) as session: log_debug('Session opened.') # Loading or initializing loaded = False # Initialize training from a CuDNN RNN checkpoint if FLAGS.cudnn_checkpoint: if FLAGS.use_cudnn_rnn: log_error( 'Trying to use --cudnn_checkpoint but --use_cudnn_rnn ' 'was specified. The --cudnn_checkpoint flag is only ' 'needed when converting a CuDNN RNN checkpoint to ' 'a CPU-capable graph. 
If your system is capable of ' 'using CuDNN RNN, you can just specify the CuDNN RNN ' 'checkpoint normally with --checkpoint_dir.') exit(1) log_info('Converting CuDNN RNN checkpoint from {}'.format( FLAGS.cudnn_checkpoint)) ckpt = tfv1.train.load_checkpoint(FLAGS.cudnn_checkpoint) missing_variables = [] # Load compatible variables from checkpoint for v in tfv1.global_variables(): try: v.load(ckpt.get_tensor(v.op.name), session=session) except tf.errors.NotFoundError: missing_variables.append(v) # Check that the only missing variables are the Adam moment tensors if any('Adam' not in v.op.name for v in missing_variables): log_error( 'Tried to load a CuDNN RNN checkpoint but there were ' 'more missing variables than just the Adam moment ' 'tensors.') exit(1) # Initialize Adam moment tensors from scratch to allow use of CuDNN # RNN checkpoints. log_info('Initializing missing Adam moment tensors.') init_op = tfv1.variables_initializer(missing_variables) session.run(init_op) loaded = True tfv1.get_default_graph().finalize() if not loaded and FLAGS.load in ['auto', 'last']: loaded = try_loading(session, checkpoint_saver, checkpoint_filename, 'most recent') if not loaded and FLAGS.load in ['auto', 'best']: loaded = try_loading(session, best_dev_saver, best_dev_filename, 'best validation') if not loaded: if FLAGS.load in ['auto', 'init']: log_info('Initializing variables...') session.run(initializer) else: log_error( 'Unable to load %s model from specified checkpoint dir' ' - consider using load option "auto" or "init".' % FLAGS.load) sys.exit(1) def run_set(set_name, epoch, init_op, dataset=None): is_train = set_name == 'train' train_op = apply_gradient_op if is_train else [] feed_dict = dropout_feed_dict if is_train else no_dropout_feed_dict total_loss = 0.0 step_count = 0 step_summary_writer = step_summary_writers.get(set_name) checkpoint_time = time.time() # Setup progress bar class LossWidget(progressbar.widgets.FormatLabel): def __init__(self): progressbar.widgets.FormatLabel.__init__( self, format='Loss: %(mean_loss)f') def __call__(self, progress, data, **kwargs): data[ 'mean_loss'] = total_loss / step_count if step_count else 0.0 return progressbar.widgets.FormatLabel.__call__( self, progress, data, **kwargs) prefix = 'Epoch {} | {:>10}'.format( epoch, 'Training' if is_train else 'Validation') widgets = [ ' | ', progressbar.widgets.Timer(), ' | Steps: ', progressbar.widgets.Counter(), ' | ', LossWidget() ] suffix = ' | Dataset: {}'.format(dataset) if dataset else None pbar = create_progressbar(prefix=prefix, widgets=widgets, suffix=suffix).start() # Initialize iterator to the appropriate dataset session.run(init_op) # Batch loop while True: try: _, current_step, batch_loss, problem_files, step_summary = \ session.run([train_op, global_step, loss, non_finite_files, step_summaries_op], feed_dict=feed_dict) except tf.errors.OutOfRangeError: break if problem_files.size > 0: problem_files = [ f.decode('utf8') for f in problem_files[..., 0] ] log_error( 'The following files caused an infinite (or NaN) ' 'loss: {}'.format(','.join(problem_files))) total_loss += batch_loss step_count += 1 pbar.update(step_count) step_summary_writer.add_summary(step_summary, current_step) if is_train and FLAGS.checkpoint_secs > 0 and time.time( ) - checkpoint_time > FLAGS.checkpoint_secs: checkpoint_saver.save(session, checkpoint_path, global_step=current_step) checkpoint_time = time.time() pbar.finish() mean_loss = total_loss / step_count if step_count > 0 else 0.0 return mean_loss, step_count log_info('STARTING 
Optimization') train_start_time = datetime.utcnow() best_dev_loss = float('inf') dev_losses = [] try: for epoch in range(FLAGS.epochs): # Training log_progress('Training epoch %d...' % epoch) train_loss, _ = run_set('train', epoch, train_init_op) log_progress('Finished training epoch %d - loss: %f' % (epoch, train_loss)) checkpoint_saver.save(session, checkpoint_path, global_step=global_step) if FLAGS.dev_files: # Validation dev_loss = 0.0 total_steps = 0 for csv, init_op in zip(dev_csvs, dev_init_ops): log_progress('Validating epoch %d on %s...' % (epoch, csv)) set_loss, steps = run_set('dev', epoch, init_op, dataset=csv) dev_loss += set_loss * steps total_steps += steps log_progress( 'Finished validating epoch %d on %s - loss: %f' % (epoch, csv, set_loss)) dev_loss = dev_loss / total_steps dev_losses.append(dev_loss) if dev_loss < best_dev_loss: best_dev_loss = dev_loss save_path = best_dev_saver.save( session, best_dev_path, global_step=global_step, latest_filename=best_dev_filename) log_info( "Saved new best validating model with loss %f to: %s" % (best_dev_loss, save_path)) # Early stopping if FLAGS.early_stop and len(dev_losses) >= FLAGS.es_steps: mean_loss = np.mean(dev_losses[-FLAGS.es_steps:-1]) std_loss = np.std(dev_losses[-FLAGS.es_steps:-1]) dev_losses = dev_losses[-FLAGS.es_steps:] log_debug( 'Checking for early stopping (last %d steps) validation loss: ' '%f, with standard deviation: %f and mean: %f' % (FLAGS.es_steps, dev_losses[-1], std_loss, mean_loss)) if dev_losses[-1] > np.max(dev_losses[:-1]) or \ (abs(dev_losses[-1] - mean_loss) < FLAGS.es_mean_th and std_loss < FLAGS.es_std_th): log_info( 'Early stop triggered as (for last %d steps) validation loss:' ' %f with standard deviation: %f and mean: %f' % (FLAGS.es_steps, dev_losses[-1], std_loss, mean_loss)) break except KeyboardInterrupt: pass log_info('FINISHED optimization in {}'.format(datetime.utcnow() - train_start_time)) log_debug('Session closed.')
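Condensed restatement of the early-stopping test in the training loop above: stop when the latest validation loss is worse than every earlier loss in the window, or when it sits within es_mean_th of the window mean while the spread is below es_std_th. The threshold values below are placeholders, not the project defaults.

import numpy as np

def should_early_stop(dev_losses, es_steps, es_mean_th=0.5, es_std_th=0.5):
    if len(dev_losses) < es_steps:
        return False
    window = dev_losses[-es_steps:]
    mean_loss, std_loss = np.mean(window[:-1]), np.std(window[:-1])
    return (window[-1] > np.max(window[:-1]) or
            (abs(window[-1] - mean_loss) < es_mean_th and std_loss < es_std_th))

print(should_early_stop([10.0, 9.0, 8.5, 11.0], es_steps=4))  # True: latest loss exceeds the window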
def my_model_fn(features, labels, mode, params=None, config=None): """Estimator model function. Args: features: dictionary where keys are strings like "inputs" and "targets" and the values are the actual values of "inputs". See TPUEstimator's docs for more information labels: ignored argument mode: a tf.estimator.ModeKeys params: dictionary containing the key "context" config: ignored argument Returns: a TPUEstimatorSpec """ del labels, config global_step = tf.train.get_global_step() if use_tpu and "context" in params: ctx = params["context"] num_hosts = ctx.num_hosts host_placement_fn = ctx.tpu_host_placement_function device_list = [ host_placement_fn(host_id=t) for t in range(num_hosts) ] # TODO(ylc): Better estimation of replica cache size? replica_cache_size = 300 * 1000000 # 300M per replica # Worker 0 caches all the TPU binaries. worker0_mem = replica_cache_size * ctx.num_replicas devices_memeory_usage = [worker0_mem] + [0] * (num_hosts - 1) var_placer = mtf.utils.BalancedVariablePlacer( device_list, devices_memeory_usage) # deprecated mesh_devices = [""] * mesh_shape.size physical_shape = list( params["context"].device_assignment.topology.mesh_shape) logical_to_physical = mtf.simd_mesh_impl.auto_logical_to_physical_tpu( mesh_shape.to_integer_list, physical_shape) mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl( mesh_shape, layout_rules, mesh_devices, ctx.device_assignment, logical_to_physical=logical_to_physical) else: var_placer = None # deprecated mesh_devices = [""] * mesh_shape.size mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl( mesh_shape, layout_rules, mesh_devices) graph = mtf.Graph() mesh = mtf.Mesh(graph, "my_mesh", var_placer) mtf_features = {} for key, x in features.items(): outer_batch_dim = mtf.Dimension("outer_batch", outer_batch_size) batch_dim = mtf.Dimension("batch", batch_size // outer_batch_size) # Some auxiliary features may have been generated in packing. # The names of these new features are of the form # "<original_feature_name>_<suffix>", e.g. "inputs_segmentation". # We look up the lengths based on the original feature name, without # the "_<suffix>". 
feature_length = sequence_length[key.split("_")[0]] length_dim = mtf.Dimension("length", feature_length) ensemble_dims = ([mtf.Dimension("ensemble", ensemble_inputs)] if ensemble_inputs else []) feature_shape = mtf.Shape(ensemble_dims + [outer_batch_dim, batch_dim, length_dim]) x = tf.cast(features[key], tf.int32) x = tf.reshape(x, feature_shape.to_integer_list) if not use_tpu: tf.logging.info("feature %s : %s" % (key, x)) x = tf.Print(x, [x], "import feature %s" % key, summarize=1000, first_n=10) mtf_features[key] = mtf.import_fully_replicated(mesh, x, feature_shape, name=key) if key == "targets" or key == "codeprefixedtargets" or key == "controlcode": anon_targets = mtf.anonymize(mtf_features[key]) if mode == tf.estimator.ModeKeys.PREDICT: def _feature_shape(key): feature_length = sequence_length[key.split("_")[0]] return mtf.Shape([ mtf.Dimension("batch", batch_size), mtf.Dimension("length", feature_length) ]) mtf_features = { k: mtf.reshape(v, _feature_shape(k)) for k, v in six.iteritems(mtf_features) } inputs = mtf_features["inputs"] if attribute_embedding: attributes = mtf_features["attribute"] else: attributes = None if has_partial_sequences: controlcodes = mtf_features["controlcode"] else: controlcodes = None if predict_fn: mtf_samples = predict_fn(model=transformer_model, features=mtf_features, variable_dtype=get_variable_dtype()) elif isinstance(transformer_model, transformer.Unitransformer): # pad so that there is enough room for the targets inputs = mtf.pad(inputs, [0, sequence_length["targets"]], length_dim.name) mtf_samples = transformer_model.sample_autoregressive( inputs, variable_dtype=get_variable_dtype(), remove_partial_sequences=True) elif isinstance(transformer_model, Bitransformer_ll): mtf_samples = transformer_model.decode( inputs, attributes=attributes, controlcodes=controlcodes, has_partial_sequences=has_partial_sequences, remove_partial_sequences=remove_partial_sequences, variable_dtype=get_variable_dtype()) # elif isinstance( transformer_model, (transformer.Bitransformer, transformer.StudentTeacher)): mtf_samples = transformer_model.decode( inputs, variable_dtype=get_variable_dtype()) else: raise ValueError("unrecognized class") mtf_samples = mtf.anonymize(mtf_samples) inputs = mtf.anonymize(inputs) lowering = mtf.Lowering(graph, {mesh: mesh_impl}, autostack=autostack) inputs = lowering.export_to_tf_tensor(inputs) outputs = lowering.export_to_tf_tensor(mtf_samples) predictions = {"inputs": inputs, "outputs": outputs} # When exporting a model, we need to communicate to TF-Serving that # master variables need to be copied to their slave slice variables. # Estimator uses a Scaffold's "local_init_op" for this purpose, so we # augment the default "local_init_op" here. # # The "ready_op" is also constructed here to ensure the variables # initialized by "local_init_op" are the same ones checked by "ready_op". # # WARNING: Any variables created outside of this model_fn() # (e.g. tpu_estimator/iterations_per_loop) will NOT be initialized nor # checked by these ops. 
def scaffold_fn(): return tf.train.Scaffold( local_init_op=tf.group( tf.train.Scaffold.default_local_init_op(), lowering.copy_masters_to_slices(), name="mtf_local_init_op"), ready_op=tf.concat([ tf.report_uninitialized_variables(), resources.report_uninitialized_resources() ], axis=0, name="mtf_ready_op")) return tpu_estimator.TPUEstimatorSpec( mode=tf.estimator.ModeKeys.PREDICT, predictions=predictions, scaffold_fn=scaffold_fn, prediction_hooks=[mtf.MtfRestoreHook(lowering)]) assert (mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL) def logits_and_loss(mtf_features): """Compute logits and loss. Args: mtf_features: a dictionary Returns: logits: a mtf.Tensor loss: a mtf.Tensor """ if model_type == "lm": # TOTRY Adapt that to our case if "inputs" in mtf_features: mtf_features = _dynamic_text2self(mtf_features) _, _, length_dim = mtf_features["targets"].shape inputs = mtf.shift(mtf_features["targets"], offset=1, dim=length_dim, wrap=False) else: inputs = mtf_features["inputs"] if attribute_embedding: attributes = mtf_features["attribute"] else: attributes = None if control_codes: codeprefixedtargets = mtf_features["codeprefixedtargets"] else: codeprefixedtargets = None if isinstance(transformer_model, transformer.Unitransformer): position_kwargs = dict( sequence_id=mtf_features.get("targets_segmentation", None), position=mtf_features.get("targets_position", None), ) elif isinstance(transformer_model, transformer.Bitransformer ) or model_type == "bi_student_teacher": if control_codes: position_kwargs = dict( encoder_sequence_id=mtf_features.get( "inputs_segmentation", None), decoder_sequence_id=mtf_features.get( "codeprefixedtargets_segmentation", None), decoder_subsequence_id=mtf_features.get( "codeprefixedtargets_subsegmentation", None), encoder_position=mtf_features.get( "inputs_position", None), decoder_position=mtf_features.get( "codeprefixedtargets_position", None), ) else: position_kwargs = dict( encoder_sequence_id=mtf_features.get( "inputs_segmentation", None), decoder_sequence_id=mtf_features.get( "targets_segmentation", None), decoder_subsequence_id=mtf_features.get( "targets_subsegmentation", None), encoder_position=mtf_features.get( "inputs_position", None), decoder_position=mtf_features.get( "targets_position", None), ) else: raise ValueError("unrecognized class") if isinstance(transformer_model, Bitransformer_ll): if cycle_consistency_loss: logits_ae, l_ae = transformer_model.call_simple( inputs=inputs, targets=mtf_features["targets"], compute_loss=True, attributes=attributes, codeprefixedtargets=codeprefixedtargets, mode=mode, variable_dtype=get_variable_dtype(), **position_kwargs) if has_partial_sequences: controlcodes = mtf_features["controlcode"] else: controlcodes = None with gin.config_scope('training'): mtf_samples = transformer_model.decode( inputs, attributes=attributes, controlcodes=controlcodes, has_partial_sequences=has_partial_sequences, remove_partial_sequences=remove_partial_sequences, variable_dtype=get_variable_dtype()) # mtf_samples = mtf.anonymize(mtf_samples) outputs = mtf_samples logits_cycle, l_cycle = transformer_model.call_simple( inputs=outputs, targets=mtf_features["targets"], compute_loss=True, attributes=attributes, codeprefixedtargets=codeprefixedtargets, mode=mode, variable_dtype=get_variable_dtype(), **position_kwargs) loss_ae_cycle = lambda_ae * l_ae + lambda_cycle * l_cycle return logits_cycle, loss_ae_cycle else: return transformer_model.call_simple( inputs=inputs, targets=mtf_features["targets"], compute_loss=True, 
attributes=attributes, codeprefixedtargets=codeprefixedtargets, mode=mode, variable_dtype=get_variable_dtype(), **position_kwargs) else: return transformer_model.call_simple( inputs=inputs, targets=mtf_features["targets"], compute_loss=True, mode=mode, variable_dtype=get_variable_dtype(), num_microbatches=num_microbatches, **position_kwargs) if mode == tf.estimator.ModeKeys.TRAIN: num_microbatches = serialize_num_microbatches( batch_dim, sequence_length, mesh_shape, layout_rules) if num_microbatches > 1: def serialized_fn(mtf_features): return { "loss": (logits_and_loss(mtf_features)[1] / num_microbatches) } var_grads, loss_dict = mtf.serialize_training_step( mtf_features, serialized_fn, batch_dim, num_microbatches) loss = loss_dict["loss"] else: loss = logits_and_loss(mtf_features)[1] var_grads = mtf.gradients( [loss], [v.outputs[0] for v in graph.trainable_variables]) if tpu_summaries: mtf.scalar_summary("loss", loss) if callable(learning_rate_schedule): # the following happens on CPU since TPU can't handle summaries. with mtf.utils.outside_all_rewrites(): learning_rate = learning_rate_schedule( step=tf.train.get_global_step()) tf.summary.scalar("learning_rate", learning_rate) else: learning_rate = learning_rate_schedule if isinstance(variable_filter, str): pattern = re.compile(variable_filter) variable_filter_fn = lambda v: pattern.search(v.name) elif variable_filter is None: variable_filter_fn = lambda v: True elif callable(variable_filter): variable_filter_fn = variable_filter else: raise ValueError( "variable_filter must be None, a string, or a callable function" ) trainable_vars = [ v for v in graph.trainable_variables if variable_filter_fn(v) ] trainable_var_grads = [ g for g, v in zip(var_grads, graph.trainable_variables) if variable_filter_fn(v) ] if len(trainable_vars) != len(graph.trainable_variables): tf.logging.info("Variables being trained:") tf.logging.info([v.name for v in trainable_vars]) tf.logging.info("Variables not being trained:") tf.logging.info([ v.name for v in graph.trainable_variables if not variable_filter_fn(v) ]) update_ops = optimizer(learning_rate=learning_rate).apply_grads( trainable_var_grads, trainable_vars) lowering = mtf.Lowering(graph, {mesh: mesh_impl}, autostack=autostack) tf_loss = lowering.export_to_tf_tensor(loss) tf_loss = tf.cast(tf_loss, tf.float32) if not use_tpu: tf_loss = tf.Print( tf_loss, [tf_loss, tf.train.get_global_step()], "step, tf_loss") tf_update_ops = [ lowering.lowered_operation(op) for op in update_ops ] tf_update_ops.append(tf.assign_add(global_step, 1)) train_op = tf.group(tf_update_ops) if hasattr(transformer_model, "initialize"): with mtf.utils.outside_all_rewrites(): transformer_model.initialize() if tpu_summaries: # has to be outside of # with mtf.utils.outside_all_rewrites() host_call = mtf.utils.create_host_call(model_dir) mtf.utils.remove_summaries() else: host_call = None with mtf.utils.outside_all_rewrites(): if init_checkpoint: ckpt_vars = { v for v, _ in tf.train.list_variables(init_checkpoint) } global_vars = {v.op.name for v in tf.global_variables()} restore_vars = ckpt_vars.intersection(global_vars) tf.logging.info("Initializing variables from %s:", init_checkpoint) tf.logging.debug("\n".join(sorted(restore_vars))) tf.logging.info("Variables in %s but not in graph:", init_checkpoint) tf.logging.info("\n".join(sorted(ckpt_vars - global_vars))) tf.logging.info("Variables in graph but not in %s:", init_checkpoint) tf.logging.info("\n".join(sorted(global_vars - ckpt_vars))) 
tf.train.init_from_checkpoint(init_checkpoint, {v: v for v in restore_vars}) # Copy master variables to slices. Must be called first. restore_hook = mtf.MtfRestoreHook(lowering) saver = tf.train.Saver(tf.global_variables(), sharded=True, max_to_keep=keep_checkpoint_max, keep_checkpoint_every_n_hours=2, defer_build=False, save_relative_paths=True) tf.add_to_collection(tf.GraphKeys.SAVERS, saver) saver_listener = mtf.MtfCheckpointSaverListener(lowering) saver_hook = tf.train.CheckpointSaverHook( model_dir, save_steps=save_checkpoints_steps, saver=saver, listeners=[saver_listener]) gin_config_saver_hook = gin.tf.GinConfigSaverHook( model_dir, summarize_config=True, include_step_in_filename=False) if use_tpu: return tpu_estimator.TPUEstimatorSpec( mode=tf.estimator.ModeKeys.TRAIN, loss=tf_loss, train_op=train_op, host_call=host_call, training_hooks=[ restore_hook, saver_hook, gin_config_saver_hook, ]) else: return tf.estimator.EstimatorSpec( tf.estimator.ModeKeys.TRAIN, loss=tf_loss, train_op=train_op, training_chief_hooks=[ restore_hook, saver_hook, gin_config_saver_hook, ]) elif mode == tf.estimator.ModeKeys.EVAL: logits, loss = logits_and_loss(mtf_features) anon_logits = mtf.anonymize(logits) lowering = mtf.Lowering(graph, {mesh: mesh_impl}, autostack=autostack) tf_loss = tf.cast(lowering.export_to_tf_tensor(loss), tf.float32) tf_loss = tf.cast(tf_loss, tf.float32) tf_logits = tf.cast(lowering.export_to_tf_tensor(anon_logits), tf.float32) def simple_metrics(logits, labels): """Simple metrics for teacher-forced eval.""" weights = tf.cast(tf.not_equal(labels, 0), tf.float32) xent = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=labels, logits=logits) predictions = tf.cast(tf.argmax(logits, axis=-1), labels.dtype) token_correct = tf.cast(tf.equal(predictions, labels), tf.float32) * weights sequence_correct = tf.to_float( tf.equal(tf.reduce_sum(token_correct, -1), tf.reduce_sum(weights, -1))) sequence_weights = tf.to_float( tf.not_equal(tf.reduce_sum(weights, -1), 0)) return { "neg_log_perplexity": tf.metrics.mean(-xent, weights), "token_accuracy": tf.metrics.mean(token_correct, weights), "sequence_accuracy": tf.metrics.mean(sequence_correct, sequence_weights) } labels = lowering.export_to_tf_tensor(anon_targets) eval_metrics = (simple_metrics, [tf_logits, labels]) with mtf.utils.outside_all_rewrites(): restore_hook = mtf.MtfRestoreHook(lowering) return tpu_estimator.TPUEstimatorSpec( tf.estimator.ModeKeys.EVAL, evaluation_hooks=[restore_hook], loss=tf_loss, eval_metrics=eval_metrics)
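The init_checkpoint warm start above only maps variables that exist both in the graph and in the checkpoint, and maps each one to its own name. A tiny set-based restatement with invented variable names:

ckpt_vars = {"encoder/kernel", "encoder/bias", "decoder/kernel"}
graph_vars = {"encoder/kernel", "encoder/bias", "new_head/kernel"}
restore_vars = ckpt_vars & graph_vars
assert restore_vars == {"encoder/kernel", "encoder/bias"}
# {v: v for v in restore_vars} is then passed to tf.train.init_from_checkpoint,
# i.e. every matching variable keeps its own name as the checkpoint key.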
def __init__(self, sess, model, batch_size=1, confidence=CONFIDENCE, targeted=TARGETED, learning_rate=LEARNING_RATE, binary_search_steps=BINARY_SEARCH_STEPS, max_iterations=MAX_ITERATIONS, abort_early=ABORT_EARLY, initial_const=INITIAL_CONST, boxmin=-0.5, boxmax=0.5, x_window=0, y_window=0, window_size=-1): """ The L_2 optimized attack. This attack is the most efficient and should be used as the primary attack to evaluate potential defenses. Returns adversarial examples for the supplied model. confidence: Confidence of adversarial examples: higher produces examples that are farther away, but more strongly classified as adversarial. batch_size: Number of attacks to run simultaneously. targeted: True if we should perform a targeted attack, False otherwise. learning_rate: The learning rate for the attack algorithm. Smaller values produce better results but are slower to converge. binary_search_steps: The number of times we perform binary search to find the optimal tradeoff-constant between distance and confidence. max_iterations: The maximum number of iterations. Larger values are more accurate; setting too small will require a large learning rate and will produce poor results. abort_early: If true, allows early aborts if gradient descent gets stuck. initial_const: The initial tradeoff-constant to use to tune the relative importance of distance and confidence. If binary_search_steps is large, the initial constant is not important. boxmin: Minimum pixel value (default -0.5). boxmax: Maximum pixel value (default 0.5). """ if window_size == -1: window_size = model.image_size image_size, num_channels, num_labels = model.image_size, model.num_channels, model.num_labels self.sess = sess self.TARGETED = targeted self.LEARNING_RATE = learning_rate self.MAX_ITERATIONS = max_iterations self.BINARY_SEARCH_STEPS = binary_search_steps self.ABORT_EARLY = abort_early self.CONFIDENCE = confidence self.initial_const = initial_const self.batch_size = batch_size self.repeat = binary_search_steps >= 10 self.I_KNOW_WHAT_I_AM_DOING_AND_WANT_TO_OVERRIDE_THE_PRESOFTMAX_CHECK = False shape = (batch_size, window_size, window_size, num_channels) # the variable we're going to optimize over modifier = tf.Variable(np.zeros( shape, dtype=np.float32)) # resize here so the perturbation covers only a window of the image # these are variables to be more efficient in sending data to tf self.timg = tf.Variable(np.zeros(shape), dtype=tf.float32) self.tlab = tf.Variable(np.zeros((batch_size, num_labels)), dtype=tf.float32) self.const = tf.Variable(np.zeros(batch_size), dtype=tf.float32) # and here's what we use to assign them self.assign_timg = tf.placeholder(tf.float32, shape) self.assign_tlab = tf.placeholder(tf.float32, (batch_size, num_labels)) self.assign_const = tf.placeholder(tf.float32, [batch_size]) # the resulting image, tanh'd to keep bounded from boxmin to boxmax self.boxmul = (boxmax - boxmin) / 2. self.boxplus = (boxmin + boxmax) / 2.
###################################################################### editing mask = tf.zeros((batch_size, image_size, image_size, num_channels), tf.float32) # Get input shapes modifier_shape = tf.shape(modifier) mask_shape = tf.shape(mask) # Make indices grid oo, ii, jj, kk = tf.meshgrid(tf.range(modifier_shape[0]), tf.range(modifier_shape[1]), tf.range(modifier_shape[2]), tf.range(modifier_shape[3]), indexing='ij') # Shift indices ii += y_window jj += x_window # Scatter update mask_to_apply = tf.tensor_scatter_nd_update( mask, tf.stack([oo, ii, jj, kk], axis=-1), modifier) self.newimg = tf.tanh(mask_to_apply + self.timg) * self.boxmul + self.boxplus ###################################################################### editing # prediction BEFORE-SOFTMAX of the model self.output = model.predict(self.newimg) # distance to the input data self.l2dist = tf.reduce_sum( tf.square(self.newimg - (tf.tanh(self.timg) * self.boxmul + self.boxplus)), [1, 2, 3]) # compute the probability of the label class versus the maximum other real = tf.reduce_sum((self.tlab) * self.output, 1) other = tf.reduce_max( (1 - self.tlab) * self.output - (self.tlab * 10000), 1) if self.TARGETED: # if targeted, optimize for making the other class most likely loss1 = tf.maximum(0.0, other - real + self.CONFIDENCE) else: # if untargeted, optimize for making this class least likely. loss1 = tf.maximum(0.0, real - other + self.CONFIDENCE) # sum up the losses self.loss2 = tf.reduce_sum(self.l2dist) self.loss1 = tf.reduce_sum(self.const * loss1) self.loss = self.loss1 + self.loss2 # Setup the adam optimizer and keep track of variables we're creating start_vars = set(x.name for x in tf.global_variables()) optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE) self.train = optimizer.minimize(self.loss, var_list=[modifier]) end_vars = tf.global_variables() new_vars = [x for x in end_vars if x.name not in start_vars] # these are the variables to initialize when we run self.setup = [] self.setup.append(self.timg.assign(self.assign_timg)) self.setup.append(self.tlab.assign(self.assign_tlab)) self.setup.append(self.const.assign(self.assign_const)) self.init = tf.variables_initializer(var_list=[modifier] + new_vars) # modifier (not the constant `mask` tensor) is the variable that needs initializing
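A numpy sketch of two pieces of the attack above: the tanh change of variables that keeps adversarial pixels inside [boxmin, boxmax], and the hinge-style margin loss max(0, other - real + confidence) used in the targeted case. The logits and target below are made up for illustration.

import numpy as np

boxmin, boxmax = -0.5, 0.5
boxmul, boxplus = (boxmax - boxmin) / 2.0, (boxmin + boxmax) / 2.0

w = np.random.randn(8, 8, 3) * 3.0              # unconstrained optimization variable
adv = np.tanh(w) * boxmul + boxplus             # always within the pixel box
assert adv.min() >= boxmin and adv.max() <= boxmax

logits = np.array([2.0, 5.0, 1.0])              # pre-softmax model output
target = np.array([1.0, 0.0, 0.0])              # one-hot target class
confidence = 0.0
real = np.sum(target * logits)                  # logit of the target class
other = np.max((1 - target) * logits - target * 1e4)
loss1 = max(0.0, other - real + confidence)     # 5 - 2 = 3: not yet adversarial
print(loss1)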
def build_example(label, param_dict_real, zip_path_label): """Build the model with parameter values set in param_dict_real. Args: label: Label of the model param_dict_real: Parameter dictionary (arguments to the factories make_graph and make_test_inputs) zip_path_label: Filename in the zip Returns: (tflite_model_binary, report) where tflite_model_binary is the serialized flatbuffer as a string and report is a dictionary with keys `toco_log` (log of toco conversion), `tf_log` (log of tf conversion), `toco` (a string of success status of the conversion), `tf` (a string success status of the conversion). """ np.random.seed(RANDOM_SEED) report = { "converter": report_lib.NOTRUN, "tf": report_lib.FAILED } # Build graph report["tf_log"] = "" report["converter_log"] = "" tf.reset_default_graph() with tf.Graph().as_default(): with tf.device("/cpu:0"): try: inputs, outputs = make_graph(param_dict_real) inputs = [x for x in inputs if x is not None] except (tf.errors.UnimplementedError, tf.errors.InvalidArgumentError, ValueError): report["tf_log"] += traceback.format_exc() return None, report sess = tf.Session() try: baseline_inputs, baseline_outputs = (make_test_inputs( param_dict_real, sess, inputs, outputs)) baseline_inputs = [ x for x in baseline_inputs if x is not None ] # Converts baseline inputs/outputs to maps. The signature input and # output names are set to be the same as the tensor names. input_names = [ _normalize_input_name(x.name) for x in inputs ] output_names = [ _normalize_output_name(x.name) for x in outputs ] baseline_input_map = dict( zip(input_names, baseline_inputs)) baseline_output_map = dict( zip(output_names, baseline_outputs)) except (tf.errors.UnimplementedError, tf.errors.InvalidArgumentError, ValueError): report["tf_log"] += traceback.format_exc() return None, report report["converter"] = report_lib.FAILED report["tf"] = report_lib.SUCCESS # Sorts the lists to make the order of input/output the same as order # of the signature names. # TODO(b/192473002): Remove sorting after TFLiteDriver can run with # signatures. inputs = sorted( inputs, key=lambda x: _normalize_input_name(x.name)) outputs = sorted( outputs, key=lambda x: _normalize_output_name(x.name)) # Builds a saved model with the default signature key. 
input_names, tensor_info_inputs = _get_tensor_info( inputs, "input_", _normalize_input_name) output_tensors, tensor_info_outputs = _get_tensor_info( outputs, "output_", _normalize_output_name) input_tensors = [(name, t.shape, t.dtype) for name, t in zip(input_names, inputs)] inference_signature = ( tf.saved_model.signature_def_utils.build_signature_def( inputs=tensor_info_inputs, outputs=tensor_info_outputs, method_name="op_test")) saved_model_dir = tempfile.mkdtemp("op_test") saved_model_tags = [tf.saved_model.tag_constants.SERVING] signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY builder = tf.saved_model.builder.SavedModelBuilder( saved_model_dir) builder.add_meta_graph_and_variables( sess, saved_model_tags, signature_def_map={ signature_key: inference_signature, }, strip_default_attrs=True) builder.save(as_text=False) # pylint: disable=g-long-ternary graph_def = freeze_graph( sess, tf.global_variables() + inputs + outputs) if use_frozen_graph else sess.graph_def if "split_tflite_lstm_inputs" in param_dict_real: extra_toco_options.split_tflite_lstm_inputs = param_dict_real[ "split_tflite_lstm_inputs"] tflite_model_binary, toco_log = options.tflite_convert_function( options, saved_model_dir, input_tensors, output_tensors, extra_toco_options=extra_toco_options, test_params=param_dict_real) report["converter"] = (report_lib.SUCCESS if tflite_model_binary is not None else report_lib.FAILED) report["converter_log"] = toco_log if options.save_graphdefs: zipinfo = zipfile.ZipInfo(zip_path_label + ".pbtxt") archive.writestr(zipinfo, text_format.MessageToString(graph_def), zipfile.ZIP_DEFLATED) if tflite_model_binary: if options.make_edgetpu_tests: # Set proper min max values according to input dtype. baseline_input_map, baseline_output_map = generate_inputs_outputs( tflite_model_binary, min_value=0, max_value=255) zipinfo = zipfile.ZipInfo(zip_path_label + ".bin") archive.writestr(zipinfo, tflite_model_binary, zipfile.ZIP_DEFLATED) # TODO(b/192473002): Remove sorting after TFLiteDriver can run with # signatures. baseline_input_map = collections.OrderedDict( sorted(baseline_input_map.items())) baseline_output_map = collections.OrderedDict( sorted(baseline_output_map.items())) example = { "inputs": baseline_input_map, "outputs": baseline_output_map } example_fp = StringIO() write_examples(example_fp, [example]) zipinfo = zipfile.ZipInfo(zip_path_label + ".inputs") archive.writestr(zipinfo, example_fp.getvalue(), zipfile.ZIP_DEFLATED) example_fp2 = StringIO() write_test_cases(example_fp2, zip_path_label + ".bin", [example]) zipinfo = zipfile.ZipInfo(zip_path_label + "_tests.txt") archive.writestr(zipinfo, example_fp2.getvalue(), zipfile.ZIP_DEFLATED) zip_manifest_label = zip_path_label + " " + label if zip_path_label == label: zip_manifest_label = zip_path_label zip_manifest.append(zip_manifest_label + "\n") return tflite_model_binary, report
def _load_checkpoint(session, checkpoint_path): # Load the checkpoint and put all variables into loading list # we will exclude variables we do not wish to load and then # we will initialize them instead ckpt = tfv1.train.load_checkpoint(checkpoint_path) vars_in_ckpt = frozenset(ckpt.get_variable_to_shape_map().keys()) load_vars = set(tfv1.global_variables()) init_vars = set() # We explicitly allow the learning rate variable to be missing for backwards # compatibility with older checkpoints. lr_var = set(v for v in load_vars if v.op.name == 'learning_rate') if lr_var and ('learning_rate' not in vars_in_ckpt or FLAGS.force_initialize_learning_rate): assert len(lr_var) <= 1 load_vars -= lr_var init_vars |= lr_var if FLAGS.load_cudnn: # Initialize training from a CuDNN RNN checkpoint # Identify the variables which we cannot load, and set them # for initialization missing_vars = set() for v in load_vars: if v.op.name not in vars_in_ckpt: log_warn('CUDNN variable not found: %s' % (v.op.name)) missing_vars.add(v) init_vars.add(v) load_vars -= init_vars # Check that the only missing variables (i.e. those to be initialised) # are the Adam moment tensors, if they aren't then we have an issue missing_var_names = [v.op.name for v in missing_vars] if any('Adam' not in v for v in missing_var_names): log_error('Tried to load a CuDNN RNN checkpoint but there were ' 'more missing variables than just the Adam moment ' 'tensors. Missing variables: {}'.format(missing_var_names)) sys.exit(1) if FLAGS.drop_source_layers > 0: # This transfer learning approach requires supplying # the layers which we exclude from the source model. # Say we want to exclude all layers except for the first one, # then we are dropping five layers total, so: drop_source_layers=5 # If we want to use all layers from the source model except # the last one, we use this: drop_source_layers=1 if FLAGS.drop_source_layers >= 6: log_warn('The checkpoint only has 6 layers, but you are trying to drop ' 'all of them or more than all of them. Continuing and ' 'dropping only 5 layers.') FLAGS.drop_source_layers = 5 dropped_layers = ['2', '3', 'lstm', '5', '6'][-1 * int(FLAGS.drop_source_layers):] # Initialize all variables needed for DS, but not loaded from ckpt for v in load_vars: if any(layer in v.op.name for layer in dropped_layers): init_vars.add(v) load_vars -= init_vars for v in sorted(load_vars, key=lambda v: v.op.name): log_info('Loading variable from checkpoint: %s' % (v.op.name)) v.load(ckpt.get_tensor(v.op.name), session=session) for v in sorted(init_vars, key=lambda v: v.op.name): log_info('Initializing variable: %s' % (v.op.name)) session.run(v.initializer)
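Simplified set arithmetic behind _load_checkpoint above: variables present in the checkpoint are loaded, everything flagged as missing (or deliberately dropped) is initialized instead. Names below are invented for illustration.

vars_in_ckpt = {"layer_1/weights", "layer_1/bias", "lstm/kernel"}
graph_vars = {"layer_1/weights", "layer_1/bias", "lstm/kernel",
              "layer_1/weights/Adam", "learning_rate"}
init_vars = {v for v in graph_vars if v not in vars_in_ckpt}
load_vars = graph_vars - init_vars
assert init_vars == {"layer_1/weights/Adam", "learning_rate"}
assert load_vars == vars_in_ckpt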
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) # MTF setup. graph = mtf.Graph() mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape) layout_rules = mtf.convert_to_layout_rules(FLAGS.layout) ctx = params["context"] num_hosts = ctx.num_hosts host_placement_fn = ctx.tpu_host_placement_function device_list = [host_placement_fn(host_id=t) for t in range(num_hosts)] tf.logging.info("device_list = %s" % device_list, ) replica_cache_size = 300 * 1000000 # 300M per replica # Worker 0 caches all the TPU binaries. worker0_mem = replica_cache_size * ctx.num_replicas devices_memeory_usage = [worker0_mem] + [0] * (num_hosts - 1) var_placer = mtf.utils.BalancedVariablePlacer(device_list, devices_memeory_usage) mesh_devices = [""] * mesh_shape.size physical_shape = list(ctx.device_assignment.topology.mesh_shape) logical_to_physical = mtf.simd_mesh_impl.auto_logical_to_physical_tpu( mesh_shape.to_integer_list, physical_shape) mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl( mesh_shape, layout_rules, mesh_devices, ctx.device_assignment, logical_to_physical=logical_to_physical) mesh = mtf.Mesh(graph, "bert_mesh", var_placer) input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] masked_lm_positions = features["masked_lm_positions"] masked_lm_ids = features["masked_lm_ids"] masked_lm_weights = features["masked_lm_weights"] next_sentence_labels = tf.squeeze(features["next_sentence_labels"], 1) batch_size = input_ids.get_shape()[0].value batch_dim = mtf.Dimension("batch", batch_size) seq_length = input_ids.get_shape()[1].value seq_dim = mtf.Dimension("seq", seq_length) max_predictions_per_seq = masked_lm_positions.get_shape()[1].value max_predictions_per_seq_dim = mtf.Dimension("max_pred_seq", max_predictions_per_seq) mtf_input_ids = mtf.import_tf_tensor(mesh, input_ids, [batch_dim, seq_dim]) mtf_input_mask = mtf.import_tf_tensor(mesh, input_mask, [batch_dim, seq_dim]) mtf_segment_ids = mtf.import_tf_tensor(mesh, segment_ids, [batch_dim, seq_dim]) mtf_masked_lm_positions = mtf.import_tf_tensor( mesh, masked_lm_positions, [batch_dim, max_predictions_per_seq_dim]) mtf_masked_lm_ids = mtf.import_tf_tensor( mesh, masked_lm_ids, [batch_dim, max_predictions_per_seq_dim]) mtf_masked_lm_weights = mtf.import_tf_tensor( mesh, masked_lm_weights, [batch_dim, max_predictions_per_seq_dim]) mtf_next_sentence_labels = mtf.import_tf_tensor( mesh, next_sentence_labels, [batch_dim]) is_training = (mode == tf.estimator.ModeKeys.TRAIN) model = bert_lib.BertModel(config=bert_config, is_training=is_training, input_ids=mtf_input_ids, input_mask=mtf_input_mask, token_type_ids=mtf_segment_ids, layout=layout_rules, mesh_shape=mesh_shape) (masked_lm_loss, masked_lm_example_loss, masked_lm_logits) = model.get_masked_lm_output( mtf_masked_lm_positions, mtf_masked_lm_ids, mtf_masked_lm_weights) (next_sentence_loss, next_sentence_example_loss, next_sentence_logits ) = model.get_next_sentence_output(mtf_next_sentence_labels) extra_loss = model.get_extra_loss() total_loss = masked_lm_loss + next_sentence_loss total_loss = mtf.anonymize(total_loss) masked_lm_example_loss = mtf.anonymize(masked_lm_example_loss) masked_lm_logits = mtf.anonymize(masked_lm_logits) next_sentence_example_loss = mtf.anonymize(next_sentence_example_loss) next_sentence_logits = 
mtf.anonymize(next_sentence_logits) # TRAIN mode if mode == tf.estimator.ModeKeys.TRAIN: _, update_ops = optimization_lib.create_optimizer( total_loss + extra_loss, learning_rate, num_train_steps, num_warmup_steps, optimizer=FLAGS.optimizer, clip_gradients=FLAGS.clip_gradients) lowering = mtf.Lowering(graph, {mesh: mesh_impl}) tf_loss = tf.to_float(lowering.export_to_tf_tensor(total_loss)) if mode == tf.estimator.ModeKeys.TRAIN: global_step = tf.train.get_global_step() tf_update_ops = [ lowering.lowered_operation(op) for op in update_ops ] tf_update_ops.append(tf.assign_add(global_step, 1)) tf.logging.info("tf_update_ops: {}".format(tf_update_ops)) train_op = tf.group(tf_update_ops) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(masked_lm_example_loss, masked_lm_logits, masked_lm_ids, masked_lm_weights, next_sentence_example_loss, next_sentence_logits, next_sentence_labels): """Computes the loss and accuracy of the model.""" masked_lm_logits = tf.reshape(masked_lm_logits, [-1, masked_lm_logits.shape[-1]]) masked_lm_predictions = tf.argmax(masked_lm_logits, axis=-1, output_type=tf.int32) masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) masked_lm_accuracy = tf.metrics.accuracy( labels=masked_lm_ids, predictions=masked_lm_predictions, weights=masked_lm_weights) masked_lm_mean_loss = tf.metrics.mean( values=masked_lm_example_loss, weights=masked_lm_weights) next_sentence_logits = tf.reshape( next_sentence_logits, [-1, next_sentence_logits.shape[-1]]) next_sentence_predictions = tf.argmax(next_sentence_logits, axis=-1, output_type=tf.int32) next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) next_sentence_accuracy = tf.metrics.accuracy( labels=next_sentence_labels, predictions=next_sentence_predictions) next_sentence_mean_loss = tf.metrics.mean( values=next_sentence_example_loss) return { "masked_lm_accuracy": masked_lm_accuracy, "masked_lm_loss": masked_lm_mean_loss, "next_sentence_accuracy": next_sentence_accuracy, "next_sentence_loss": next_sentence_mean_loss, } eval_metrics = (metric_fn, [ lowering.export_to_tf_tensor(masked_lm_example_loss), lowering.export_to_tf_tensor(masked_lm_logits), masked_lm_ids, masked_lm_weights, lowering.export_to_tf_tensor(next_sentence_example_loss), lowering.export_to_tf_tensor(next_sentence_logits), next_sentence_labels ]) with mtf.utils.outside_all_rewrites(): # Copy master variables to slices. Must be called first. restore_hook = mtf.MtfRestoreHook(lowering) if mode == tf.estimator.ModeKeys.TRAIN: saver = tf.train.Saver(tf.global_variables(), sharded=True, max_to_keep=10, keep_checkpoint_every_n_hours=2, defer_build=False, save_relative_paths=True) tf.add_to_collection(tf.GraphKeys.SAVERS, saver) saver_listener = mtf.MtfCheckpointSaverListener(lowering) saver_hook = tf.train.CheckpointSaverHook( FLAGS.output_dir, save_steps=1000, saver=saver, listeners=[saver_listener]) return tf.estimator.tpu.TPUEstimatorSpec( tf.estimator.ModeKeys.TRAIN, loss=tf_loss, train_op=train_op, training_hooks=[restore_hook, saver_hook]) elif mode == tf.estimator.ModeKeys.EVAL: return tf.estimator.tpu.TPUEstimatorSpec( tf.estimator.ModeKeys.EVAL, evaluation_hooks=[restore_hook], loss=tf_loss, eval_metrics=eval_metrics)
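A numpy restatement of the masked-LM accuracy computed in metric_fn above: an ordinary accuracy, weighted so that padded prediction slots (masked_lm_weights == 0) do not count. Values are made up for illustration.

import numpy as np

masked_lm_ids = np.array([7, 3, 9, 0])
masked_lm_predictions = np.array([7, 5, 9, 2])
masked_lm_weights = np.array([1.0, 1.0, 1.0, 0.0])   # last slot is padding

correct = (masked_lm_predictions == masked_lm_ids).astype(np.float32)
accuracy = np.sum(correct * masked_lm_weights) / np.sum(masked_lm_weights)
print(accuracy)   # 2 correct out of 3 real positions -> 0.666...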
def _initialize_all_variables(session): init_vars = tfv1.global_variables() for v in init_vars: session.run(v.initializer)
def run_training(): if not os.path.exists(FLAGS.model_dir): os.makedirs(FLAGS.model_dir) poems_vector, word_to_int, vocabularies = process_poems(FLAGS.file_path) batches_inputs, batches_outputs = generate_batch(FLAGS.batch_size, poems_vector, word_to_int) input_data = tf.placeholder(tf.int32, [FLAGS.batch_size, None]) output_targets = tf.placeholder(tf.int32, [FLAGS.batch_size, None]) end_points = rnn_model(model='lstm', input_data=input_data, output_data=output_targets, vocab_size=len(vocabularies), rnn_size=128, num_layers=2, batch_size=64, learning_rate=FLAGS.learning_rate) saver = tf.train.Saver(tf.global_variables()) init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) with tf.Session() as sess: # sess = tf_debug.LocalCLIDebugWrapperSession(sess=sess) # sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) sess.run(init_op) start_epoch = 0 checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir) if checkpoint: saver.restore(sess, checkpoint) print("## restored from checkpoint {0}".format(checkpoint)) start_epoch += int(checkpoint.split('-')[-1]) print('## start training...') try: n_chunk = len(poems_vector) // FLAGS.batch_size for epoch in range(start_epoch, FLAGS.epochs): n = 0 for batch in range(n_chunk): loss, _, _ = sess.run( [ end_points['total_loss'], end_points['last_state'], end_points['train_op'] ], feed_dict={ input_data: batches_inputs[n], output_targets: batches_outputs[n] }) n += 1 print('Epoch: %d, batch: %d, training loss: %.6f' % (epoch, batch, loss)) if epoch % 6 == 0: saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_prefix), global_step=epoch) except KeyboardInterrupt: print('## Interrupted manually, saving checkpoint for now...') saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_prefix), global_step=epoch) print( '## Last epoch was saved; next run will start from epoch {}.' .format(epoch))
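How start_epoch is recovered above: the saver appends the global step (here, the epoch number) to the checkpoint name, so it can be parsed back out of the path. The path below is an illustrative latest_checkpoint() result, not a real one.

checkpoint = "model/poems-12"
start_epoch = int(checkpoint.split('-')[-1])
assert start_epoch == 12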
def build_graph(bert_config, opts, iterations_per_step=1, is_training=True, feed_name=None): """Build the graph for training. Args: bert_config: configuration for the BERT model. opts: a dictionary containing all global options. iterations_per_step: number of iterations per step is_training (bool): if true return a graph with trainable variables. feed_name: name of the IPU infeed. Returns: a GraphOps containing a BERT graph and session prepared for inference or training. """ train_graph = tf.Graph() with train_graph.as_default(): placeholders = dict() placeholders['learning_rate'] = tf.placeholder(bert_config.dtype, shape=[]) learning_rate = placeholders['learning_rate'] train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue( dataset.data(opts, is_training=is_training), feed_name=feed_name + "_in", replication_factor=opts['replicas']) outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue( feed_name=feed_name + "_out", replication_factor=opts['replicas']) with ipu.scopes.ipu_scope('/device:IPU:0'): train = training_step_with_infeeds_and_outfeeds( bert_config, train_iterator, outfeed_queue, opts, learning_rate, iterations_per_step, is_training=is_training) outfeed = outfeed_queue.dequeue() bert_logging.print_trainable_variables(opts['logs_path']) model_variables = tf.trainable_variables() + tf.get_collection( tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES) model_and_optimiser_variables = tf.global_variables() restore = tf.train.Saver( var_list=model_and_optimiser_variables if opts['restore_optimiser_from_ckpt'] else model_variables) # We store two savers: one for the standard training and another one for the best checkpoint savers = { "train_saver": tf.train.Saver(var_list=model_variables if opts['ckpt_model_only'] else model_and_optimiser_variables, name='latest', max_to_keep=5), "best_saver": tf.train.Saver(var_list=model_variables if opts['ckpt_model_only'] else model_and_optimiser_variables, name='best', max_to_keep=1) } ipu.utils.move_variable_initialization_to_cpu() train_init = tf.global_variables_initializer() tvars = tf.trainable_variables() # Calculate number of IPUs required for pretraining pipeline. num_embedding_ipu = { 'two_ipus': 2, 'same_ipu': 1, 'same_as_hidden_layers': 0 }[opts['embeddings_placement']] num_hidden_layer_stages = len(bert_config.hidden_layers_per_stage) num_ipus_required = opts['replicas'] * next_power_of_two( num_hidden_layer_stages + num_embedding_ipu) # Configure the IPU options. ipu_options = get_ipu_config( fp_exceptions=opts["fp_exceptions"], stochastic_rounding=opts['stochastic_rounding'], xla_recompute=opts["xla_recompute"], available_memory_proportion=opts['available_memory_proportion'], disable_graph_outlining=opts["disable_graph_outlining"], num_ipus_required=num_ipus_required, max_cross_replica_sum_buffer_size=opts[ 'max_cross_replica_sum_buffer_size'], scheduler_selection=opts['scheduler'], compile_only=opts['compile_only'], partials_type=opts['partials_type']) ipu.utils.configure_ipu_system(ipu_options) train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto()) return GraphOps(train_graph, train_sess, train_init, [train], placeholders, train_iterator, outfeed, savers, restore, tvars)
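The IPU count above is opts['replicas'] * next_power_of_two(num_hidden_layer_stages + num_embedding_ipu). The helper itself is defined elsewhere in the project; a plausible implementation consistent with that use (smallest power of two >= n) is sketched here purely as an assumption.

def next_power_of_two(n):
    power = 1
    while power < n:
        power *= 2
    return power

assert [next_power_of_two(n) for n in (1, 2, 3, 5, 8)] == [1, 2, 4, 8, 8]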
def test_simple_model_shapes(self):
    # Shape = [4, 2, 3].
    input_features = tf.constant([[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
                                  [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]],
                                  [[13.0, 14.0, 15.0], [16.0, 17.0, 18.0]],
                                  [[19.0, 20.0, 21.0], [22.0, 23.0, 24.0]]])
    output_sizes = {'a': 8, 'b': [4, 3]}
    outputs, activations = models.simple_model(
        input_features,
        output_sizes,
        sequential_inputs=False,
        is_training=True,
        num_bottleneck_nodes=16)

    expected_global_variable_shapes = {
        'SimpleModel/InputFC/Linear/weight:0': [3, 1024],
        'SimpleModel/InputFC/Linear/bias:0': [1024],
        'SimpleModel/InputFC/BatchNorm/gamma:0': [1024],
        'SimpleModel/InputFC/BatchNorm/beta:0': [1024],
        'SimpleModel/InputFC/BatchNorm/moving_mean:0': [1024],
        'SimpleModel/InputFC/BatchNorm/moving_variance:0': [1024],
        'SimpleModel/FullyConnectedBlock_0/FC_0/Linear/weight:0': [1024, 1024],
        'SimpleModel/FullyConnectedBlock_0/FC_0/Linear/bias:0': [1024],
        'SimpleModel/FullyConnectedBlock_0/FC_0/BatchNorm/gamma:0': [1024],
        'SimpleModel/FullyConnectedBlock_0/FC_0/BatchNorm/beta:0': [1024],
        'SimpleModel/FullyConnectedBlock_0/FC_0/BatchNorm/moving_mean:0': [1024],
        'SimpleModel/FullyConnectedBlock_0/FC_0/BatchNorm/moving_variance:0': [1024],
        'SimpleModel/FullyConnectedBlock_0/FC_1/Linear/weight:0': [1024, 1024],
        'SimpleModel/FullyConnectedBlock_0/FC_1/Linear/bias:0': [1024],
        'SimpleModel/FullyConnectedBlock_0/FC_1/BatchNorm/gamma:0': [1024],
        'SimpleModel/FullyConnectedBlock_0/FC_1/BatchNorm/beta:0': [1024],
        'SimpleModel/FullyConnectedBlock_0/FC_1/BatchNorm/moving_mean:0': [1024],
        'SimpleModel/FullyConnectedBlock_0/FC_1/BatchNorm/moving_variance:0': [1024],
        'SimpleModel/FullyConnectedBlock_1/FC_0/Linear/weight:0': [1024, 1024],
        'SimpleModel/FullyConnectedBlock_1/FC_0/Linear/bias:0': [1024],
        'SimpleModel/FullyConnectedBlock_1/FC_0/BatchNorm/gamma:0': [1024],
        'SimpleModel/FullyConnectedBlock_1/FC_0/BatchNorm/beta:0': [1024],
        'SimpleModel/FullyConnectedBlock_1/FC_0/BatchNorm/moving_mean:0': [1024],
        'SimpleModel/FullyConnectedBlock_1/FC_0/BatchNorm/moving_variance:0': [1024],
        'SimpleModel/FullyConnectedBlock_1/FC_1/Linear/weight:0': [1024, 1024],
        'SimpleModel/FullyConnectedBlock_1/FC_1/Linear/bias:0': [1024],
        'SimpleModel/FullyConnectedBlock_1/FC_1/BatchNorm/gamma:0': [1024],
        'SimpleModel/FullyConnectedBlock_1/FC_1/BatchNorm/beta:0': [1024],
        'SimpleModel/FullyConnectedBlock_1/FC_1/BatchNorm/moving_mean:0': [1024],
        'SimpleModel/FullyConnectedBlock_1/FC_1/BatchNorm/moving_variance:0': [1024],
        'SimpleModel/BottleneckLogits/weight:0': [1024, 16],
        'SimpleModel/BottleneckLogits/bias:0': [16],
        'SimpleModel/OutputLogits/a/weight:0': [16, 8],
        'SimpleModel/OutputLogits/a/bias:0': [8],
        'SimpleModel/OutputLogits/b/weight:0': [16, 12],
        'SimpleModel/OutputLogits/b/bias:0': [12],
    }
    self.assertDictEqual(
        {var.name: var.shape.as_list() for var in tf.global_variables()},
        expected_global_variable_shapes)

    self.assertCountEqual(outputs.keys(), ['a', 'b'])
    self.assertAllEqual(outputs['a'].shape.as_list(), [4, 2, 8])
    self.assertAllEqual(outputs['b'].shape.as_list(), [4, 2, 4, 3])
    self.assertCountEqual(activations.keys(),
                          ['base_activations', 'bottleneck_activations'])
    self.assertAllEqual(activations['base_activations'].shape.as_list(),
                        [4, 2, 1024])
    self.assertAllEqual(activations['bottleneck_activations'].shape.as_list(),
                        [4, 2, 16])
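# Shape-arithmetic sketch (illustrative, not taken from the test above): a
# list-valued entry such as 'b': [4, 3] appears to produce 4 * 3 = 12 logits
# from the 16-d bottleneck, which are then reshaped so the trailing dimensions
# match the requested [4, 3]; this is consistent with OutputLogits/b/weight:0
# having shape [16, 12] and outputs['b'] having shape [4, 2, 4, 3].
import numpy as np

bottleneck = np.zeros([4, 2, 16])       # [batch, sequence, bottleneck_nodes]
weight = np.zeros([16, 12])             # same shape as OutputLogits/b/weight:0
logits = bottleneck @ weight            # -> [4, 2, 12]
logits = logits.reshape([4, 2, 4, 3])   # -> matches outputs['b'].shape.as_list()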
def test_simple_conv3d(self, threshold, expected_alive):
    # TODO(e1): remove when gamma is supported.
    # This test works if reshape is not set to be handled by
    # leaf_op_handler.LeafOpHandler() in op_handlers.py. However, changing that
    # breaks other tests, for reasons yet to be investigated.
    if SKIP_GAMMA_CONV3D:
        return

    def fused_batch_norm3d(*args, **kwargs):
        if args:
            inputs = args[0]
            args = args[1:]
        else:
            inputs = kwargs.pop('inputs')
        shape = inputs.shape
        # `inputs` is assumed to be NHWTC (T is for time).
        batch_size = shape[0]
        # The space x time cube is reshaped to 2D with dims:
        # H, W, T --> H, W * T
        # The idea is that batch norm only needs this to collect spatial stats.
        target_shape = [batch_size, shape[1], shape[2] * shape[3], shape[4]]
        inputs = tf.reshape(inputs, target_shape, name='Reshape/to2d')
        normalized = slim.batch_norm(inputs, *args, **kwargs)
        return tf.reshape(normalized, shape, name='Reshape/to3d')

    gamma_val = [0.5, 0.3, 0.2]
    num_inputs = 4
    batch_size = 2
    video = tf.zeros([batch_size, 8, 8, 8, num_inputs])
    kernel = [5, 5, 5]
    num_outputs = 3
    net = slim.conv3d(
        video,
        num_outputs,
        kernel,
        padding='SAME',
        normalizer_fn=fused_batch_norm3d,
        normalizer_params={'scale': True, 'fused': True},
        scope='vconv1')
    self.assertLen(net.shape.as_list(), 5)
    shape = net.shape.as_list()
    # The number of applications is the number of elements in the [HWT] tensor.
    num_applications = shape[1] * shape[2] * shape[3]
    application_cost = num_inputs * kernel[0] * kernel[1] * kernel[2]
    name_to_var = {v.op.name: v for v in tf.global_variables()}
    flop_reg = flop_regularizer.GammaFlopsRegularizer(
        [net.op, tf.get_default_graph().get_operation_by_name('vconv1/Conv3D')],
        threshold,
        force_group=['vconv1/Reshape/to3d|vconv1/Reshape/to2d|vconv1/Conv3D'])
    gamma = name_to_var['vconv1/BatchNorm/gamma']

    with self.session():
        tf.global_variables_initializer().run()
        gamma.assign(gamma_val).eval()
        self.assertAllClose(
            flop_reg.get_cost(),
            2 * expected_alive * num_applications * application_cost)
        raw_cost = 2 * num_outputs * num_applications * application_cost
        self.assertAllClose(flop_reg.get_regularization_term(),
                            raw_cost * np.mean(gamma_val))
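# Worked numbers for the cost checks above (derived from the test constants; the
# factor of 2 per multiply-add follows the factors already used in the assertions):
kernel = [5, 5, 5]
num_inputs, num_outputs = 4, 3
num_applications = 8 * 8 * 8                                        # SAME padding, stride 1 -> 512
application_cost = num_inputs * kernel[0] * kernel[1] * kernel[2]   # 4 * 125 = 500
raw_cost = 2 * num_outputs * num_applications * application_cost    # 1,536,000
gamma_val = [0.5, 0.3, 0.2]
# get_cost() counts only the `expected_alive` output channels whose gamma exceeds
# the threshold, while the regularization term scales raw_cost by
# mean(gamma_val) = 1/3, matching the final assertAllClose.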