def insert_quant_op_for_weights(self, w_bit_dict):
    """Insert quantization operations for weights.

    Args:
    * w_bit_dict: A dict with (key: matmul_op_name, value: quant_bits)
    """
    for op in self.matmul_ops:
        w = op.inputs[1]
        prefix = prefix_filter(op.name)
        qw = self.__uniform_quantize(w, w_bit_dict[op.name], 'weight', prefix)
        weight_fn = {'MatMul': tf.matmul,
                     'Conv2D': tf.nn.conv2d,
                     'DepthwiseConv2dNative': tf.nn.depthwise_conv2d}
        is_conv_fn = lambda x: 'Conv' in x.type
        try:
            if is_conv_fn(op):
                strides = op.get_attr('strides')
                padding = op.get_attr('padding')
                qw_op = weight_fn[op.type](op.inputs[0], qw, strides, padding).op
            else:  # fc layers
                qw_op = weight_fn[op.type](op.inputs[0], qw).op
            self.quantized_matmul_ops.append(qw_op)
        except KeyError:
            raise NotImplementedError("Unrecognized MatMul op; "
                                      "add it to the supported matmul types for quantization")
    # replace input
    for wop, qwop in zip(self.matmul_ops, self.quantized_matmul_ops):
        old_sgv = ge.sgv(wop)
        new_sgv = ge.sgv(qwop)
        ge.reroute_inputs(new_sgv, old_sgv)
def insert_quant_op_for_activations(self, act_bit_dict):
    """Insert quantization operations for activations.

    Args:
    * act_bit_dict: A dict with (key: act_op_name, value: act_bits)
    """
    activation_fn = {'Relu': tf.nn.relu,
                     'Tanh': tf.nn.tanh,
                     'Softplus': tf.nn.softplus,
                     'Sigmoid': tf.nn.sigmoid,
                     'Relu6': tf.nn.relu6}
    for op in self.activation_ops:
        old_sgv = ge.sgv(op)
        input_ = old_sgv.inputs[0]
        if op.type in self.support_act_types:
            try:
                tmp_input_ = activation_fn[op.type](input_)
            except KeyError:
                raise NotImplementedError(
                    "activation_fn does not cover op type %s; add it manually" % op.type)
            prefix = prefix_filter(op.name)
            qa = self.__uniform_quantize(tmp_input_, act_bit_dict[op.name], 'activation', prefix)
            new_sgv = ge.sgv(qa.op)
            ge.reroute_outputs(new_sgv, old_sgv)
            self.quantized_activation_ops.append(qa.op)
        else:
            raise ValueError("Unsupported activation type %s; add it manually" % op.type)
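# Minimal usage sketch of the activation-rerouting pattern above (not from the original code).
# Assumes TF 1.x with tf.contrib.graph_editor available; fake_quant stands in for the
# snippet's private quantizer, and the tensor names are illustrative. It mirrors the same
# call pattern (quantized op first, original activation second) so that downstream consumers
# of the original activation read from the quantized copy instead.
import tensorflow as tf
from tensorflow.contrib import graph_editor as ge

g = tf.Graph()
with g.as_default():
    x = tf.placeholder(tf.float32, [None, 4], name="x")
    relu = tf.nn.relu(x, name="relu")
    y = tf.identity(relu, name="y")  # downstream consumer of the activation
    # build a quantized copy of the activation from the same input
    qa = tf.fake_quant_with_min_max_args(tf.nn.relu(x), min=-6.0, max=6.0, num_bits=8)
    # hand the consumers of `relu` over to `qa`
    ge.reroute_outputs(ge.sgv(qa.op), ge.sgv(relu.op))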
def convert_consts_to_var(graph_def): graph = get_graph_from(graph_def) all_const_ops = set(i.name for i in graph.get_operations() if i.type == "Const") const_names_list = list(all_const_ops - set(get_white_list(graph))) const_var_names_pairs = [] ops_to_delete = [] with graph.as_default(): preexisting_vars = [ tf.get_variable(i.name, i.outputs[0].shape) for i in graph.get_operations() if i.type == "VariableV2" or i.type == "Variable" ] var_list = [] for name in const_names_list: tensor = graph.get_operation_by_name(name).outputs[0] with tf.compat.v1.Session() as sess: t_value = sess.run(tensor) t_name = "{}_mpc_const_var".format(name) var = tf.compat.v1.Variable(t_value, name=t_name) var_read_op_name = var.to_proto().snapshot_name[:-2] const_var_names_pairs.append((name, var_read_op_name)) var_list.append(var) for const_name, var_read_name in const_var_names_pairs: const_op = graph.get_operation_by_name(const_name) var_op = graph.get_operation_by_name(var_read_name) ge.swap_outputs(ge.sgv(const_op), ge.sgv(var_op)) ops_to_delete.append(const_op) tf.compat.v1.variables_initializer( var_list + preexisting_vars, "init_constvars" ) return delete_nodes(graph.as_graph_def(), ops_to_delete)
def replace_read_ops(loss_or_losses, var_list):
    """
    Replaces the read ops of each variable in `var_list` with new read ops obtained
    from `read_value()`, thus forcing the most up-to-date values of the variables to be
    read (which might incur copies across devices).
    The graph is seeded from the tensor(s) `loss_or_losses`.
    """
    # ops between var ops and the loss
    ops = set(ge.get_walks_intersection_ops([var.op for var in var_list], loss_or_losses))
    if not ops:
        # loss_or_losses doesn't depend on any var in var_list, so there is nothing to replace
        return

    # filter out variables that are not involved in computing the loss
    var_list = [var for var in var_list if var.op in ops]

    # assume that for each variable, the only op required to compute the loss
    # is a read op, and there is exactly one per variable
    read_ops = []
    for var in var_list:
        output, = var.op.outputs
        read_op, = set(output.consumers()) & ops
        read_ops.append(read_op)

    for var, read_op in zip(var_list, read_ops):
        with tf.name_scope('/'.join(read_op.name.split('/')[:-1])):
            with tf.device(read_op.device):
                read_t, = read_op.outputs
                consumer_ops = set(read_t.consumers()) & ops
                # consumer_sgv might have multiple inputs, but we only care
                # about replacing the input that is read_t
                consumer_sgv = ge.sgv(consumer_ops)
                consumer_sgv = consumer_sgv.remap_inputs([list(consumer_sgv.inputs).index(read_t)])
                ge.connect(ge.sgv(var.read_value().op), consumer_sgv)
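# Minimal sketch (assumption: TF 1.x, tf.contrib.graph_editor) of the core rewiring above:
# point a consumer of a variable's cached `/read` tensor at a fresh `read_value()` op, using
# ge.connect on a subgraph view remapped to the matching input index. Variable and op names
# are illustrative, not from the original code.
import tensorflow as tf
from tensorflow.contrib import graph_editor as ge

g = tf.Graph()
with g.as_default():
    v = tf.Variable(tf.ones([2]), name="v")
    loss = tf.reduce_sum(v * 2.0, name="loss")   # the Mul consumes the cached v/read tensor
    read_t = v.value()                           # the cached snapshot tensor (v/read:0)
    consumer_op = read_t.consumers()[0]          # the Mul op feeding the loss
    idx = list(consumer_op.inputs).index(read_t)
    fresh_read = v.read_value()                  # a new read op for the latest value
    ge.connect(ge.sgv(fresh_read.op), ge.sgv(consumer_op).remap_inputs([idx]))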
def convert_consts_to_var(graph, const_names_list): const_var_names_pairs = [] ops_to_delete = [] with graph.as_default(): preexisting_vars = [ tf.get_variable(i.name, i.outputs[0].shape) for i in graph.get_operations() if i.type == "VariableV2" or i.type == "Variable" ] var_list = [] for name in const_names_list: tensor = graph.get_operation_by_name(name).outputs[0] with tf.Session() as sess: t_value = sess.run(tensor) t_name = '{}_mpc_const_var'.format(name) var = tf.Variable(t_value, name=t_name) const_var_names_pairs.append((name, t_name)) var_list.append(var) for const_name, var_name in const_var_names_pairs: const_op = graph.get_operation_by_name(const_name) var_op = graph.get_operation_by_name('{}/read'.format(var_name)) ge.swap_outputs(ge.sgv(const_op), ge.sgv(var_op)) ops_to_delete.append(const_op) tf.compat.v1.variables_initializer(var_list + preexisting_vars, 'init_constvars') return delete_nodes(graph, ops_to_delete)
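# Minimal sketch (assumption: TF 1.x) of the const-to-variable swap used in
# convert_consts_to_var above: evaluate a Const, build a Variable holding the same value, and
# hand the Const's consumers over to the variable's /read op with ge.swap_outputs. The tensor
# names are illustrative.
import tensorflow as tf
from tensorflow.contrib import graph_editor as ge

graph = tf.Graph()
with graph.as_default():
    c = tf.constant([1.0, 2.0], name="c")
    out = tf.square(c, name="out")             # consumer of the constant
    with tf.Session() as sess:
        value = sess.run(c)
    var = tf.Variable(value, name="c_var")     # adds c_var and c_var/read to the graph
    read_op = graph.get_operation_by_name("c_var/read")
    ge.swap_outputs(ge.sgv(c.op), ge.sgv(read_op))
    # `out` now reads from c_var/read; the dangling Const op can then be deleted.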
def test_multiswap(self): with self.graph.as_default(): a3 = constant_op.constant(3.0, shape=[2], name="a3") ge.swap_ios(ge.sgv(a3.op).remap_outputs([0, 0]), ge.sgv(self.a0.op, self.a1.op)) self.assertTrue(match.OpMatcher("c0").input_ops("a3", "b0")(self.c0.op)) self.assertTrue(match.OpMatcher("c1").input_ops("a3", "b1")(self.c1.op))
def replace_nodes_with_identity(graph, nop_splits): with graph.as_default(): for split in nop_splits: inp_var = split.inputs[1] identity = tf.identity(inp_var).op ge.swap_outputs(ge.sgv(split), ge.sgv(identity)) return graph
def tensor_swapin_and_out(g, origin_op, swapin_op):
    global added_control
    all_ops = g.get_operations()
    # find the origin_op's output tensor name
    origin_op_name = origin_op.values()[0].name
    added_control = False
    # search for the gradient ops that consume origin_op's output; they should read from swapin_op instead
    for op in all_ops:
        for i in range(len(op.inputs)):
            if ((op.inputs[i].name == origin_op_name) and ("_grad" in op.name)):
                print("gradient op.name:", op.name)
                """
                ('op.name:', u'layer1/L1_SwapOut')
                ('op.name:', u'layer2/MatMul')
                ('op.name:', u'optimizer/gradients/layer1/Sigmoid_grad/SigmoidGrad')
                """
                # use connect and remap to reconnect the graph
                ge.connect(ge.sgv(swapin_op), ge.sgv(op).remap_inputs([i]))
                # FIXME: we cannot add more than one control dependency to the swap-in op
                if added_control is False:
                    print("Control Dependency==> swapin_op:", swapin_op, "op:", op)
                    add_control_dependency(all_ops, swapin_op, op)
def _connect_ops(self, src_op, dest_op, remap_inputs=False, remap_outputs=False, idx=None, disconnect_first=False): """A wrapper of `tensorflow.contrib.graph_editor.connect`. This method does an in-place modification to the graph. Args: src_op: a `tf.Operation`. dest_op: a `tf.Operation`. remap_inputs: remap the input of `dest_op` or not. remap_outputs: remap the output of `src_op` or not. idx: index of input or output tensor. disconnect_first: True means the current outputs of sgv0 are disconnected. """ src_sgv = ge.sgv(src_op, graph=self._graph) dest_sgv = ge.sgv(dest_op, graph=self._graph) if remap_outputs: src_sgv = src_sgv.remap_outputs([idx]) if remap_inputs: dest_sgv = dest_sgv.remap_inputs([idx]) ge.connect(src_sgv, dest_sgv, disconnect_first)
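# Minimal sketch (assumption: TF 1.x) of what the _connect_ops wrapper above does: connect one
# specific output of a source op to one specific input of a destination op by remapping both
# subgraph views to a single index before calling ge.connect. The constants are illustrative.
import tensorflow as tf
from tensorflow.contrib import graph_editor as ge

g = tf.Graph()
with g.as_default():
    a = tf.constant(1.0, name="a")
    b = tf.constant(2.0, name="b")
    c = tf.add(a, b, name="c")       # c has inputs [a, b]
    d = tf.constant(5.0, name="d")
    # splice d into c's second input slot (index 1), replacing b
    src_sgv = ge.sgv(d.op, graph=g).remap_outputs([0])
    dest_sgv = ge.sgv(c.op, graph=g).remap_inputs([1])
    ge.connect(src_sgv, dest_sgv)
    # c now computes a + d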
def test_multiswap(self): with self.graph.as_default(): a3 = tf.constant(3.0, shape=[2], name="a3") ge.reroute.swap( ge.sgv(a3.op).remap_outputs([0, 0]), ge.sgv(self.a0.op, self.a1.op)) self.assertTrue(ge.matcher("c0").input_ops("a3", "b0")(self.c0.op)) self.assertTrue(ge.matcher("c1").input_ops("a3", "b1")(self.c1.op))
def build_pb_fact(pb_location, main_location, breath, quant, lvl):
    graph = load_graph(pb_location)
    W1 = graph.get_tensor_by_name('prefix/w_in:0')
    matmul = graph.get_tensor_by_name('prefix/MatMul:0')
    bias = graph.get_tensor_by_name('prefix/b_in:0')
    add = graph.get_tensor_by_name('prefix/add:0')
    reshape = graph.get_tensor_by_name('prefix/Reshape:0')

    # remove all connections from matmul
    ge.detach(ge.sgv(matmul.op))

    with tf.Session(graph=graph) as sess:
        # os.system("mkdir " + main_location + breath + "/" + quant + "/fact_" + str(lvl))
        # for op in sess.graph.get_operations():
        #     print(op.name)
        W = W1.eval()
        u, s, v, ss = svd_compress_gs(W, lvl)
        logEntry("structural_similarity == > " + str(ss))
        u1 = tf.matmul(reshape, u, name="prefix/u1")
        s1 = tf.matmul(u1, s, name="prefix/s1")
        v1 = tf.matmul(s1, v, name="prefix/v1")
        ge.connect(ge.sgv(v1.op), ge.sgv(add.op).remap_inputs([0]))
        sess.run(tf.variables_initializer([tf.Variable(5, name="dummy" + str(lvl))]))
        saver = tf.train.Saver()

        # save log for TensorBoard
        LOGDIR = main_location + '/LOG'
        train_writer = tf.summary.FileWriter(LOGDIR)
        train_writer.add_graph(sess.graph)
        train_writer.close()

        # save the frozen model
        os.system("mkdir " + main_location + "pb")
        tf.train.write_graph(sess.graph_def, main_location + 'pb/',
                             "RNN_" + breath + "_" + quant + "_fact_" + str(lvl) + ".pbtxt")
        saver.save(sess, save_path=main_location + "model.ckpt")

        input_graph_path = main_location + '/pb/' + "RNN_" + breath + "_" + quant + "_fact_" + str(lvl) + ".pbtxt"
        checkpoint_path = main_location + "model.ckpt"
        restore_op_name = "save/restore_all"
        filename_tensor_name = "save/Const:0"
        output_frozen_graph_name = main_location + 'pb/' + "RNN_" + breath + "_" + quant + "_fact_" + str(lvl) + ".pb"

        logEntry("Start Freezing the graph")
        freeze_graph.freeze_graph(input_graph_path, input_saver="", input_binary=False,
                                  input_checkpoint=checkpoint_path, output_node_names="prefix/y_",
                                  restore_op_name="save/restore_all", filename_tensor_name="save/Const:0",
                                  output_graph=output_frozen_graph_name, clear_devices=True,
                                  initializer_nodes="")
        logEntry("End Freezing the graph")
        sess.close()
def _connect_ops(src_op, dest_op, remap_inputs=False, remap_outputs=True): src_sgv = ge.sgv(src_op, graph=tf.get_default_graph()) dest_sgv = ge.sgv(dest_op, graph=tf.get_default_graph()) if remap_outputs: src_sgv = src_sgv.remap_outputs([0]) if remap_inputs: dest_sgv = dest_sgv.remap_inputs([0]) ge.connect(src_sgv, dest_sgv)
def replace_node_with_const(node): print("Trying to execute node {}".format(node.name)) graph = node.graph with graph.as_default(): const_lists = [] with tf.Session() as sess: for out_t in node.outputs: const_val = sess.run(out_t) const_op = tf.constant(const_val).op const_lists.append(const_op) ge.swap_outputs(ge.sgv(node), ge.sgv(const_lists))
def test_connect(self): """Test for ge.connect.""" with self.graph.as_default(): x = tf.constant([1., 1.], shape=[2], name="x") y = tf.constant([2., 2.], shape=[2], name="y") z = tf.add(x, y, name="z") sgv = ge.sgv(x.op, y.op, z.op) ge.connect(sgv, ge.sgv(self.e.op).remap_inputs([0])) self.assertTrue(ge.matcher("^foo/bar/e$").input_ops("^z$", "foo/d$") (self.e.op))
def test_subgraph(self): sgv = ge.sgv(self.graph) self.assertEqual(list(sgv.outputs), [self.e, self.h]) self.assertEqual(list(sgv.inputs), []) self.assertEqual(len(sgv.ops), 8) sgv = ge.sgv(self.f.op, self.g.op) self.assertEqual(list(sgv.outputs), [self.f, self.g]) self.assertEqual(list(sgv.inputs), [self.c, self.d, self.a]) sgv = ge.sgv_scope("foo/bar", graph=self.graph) self.assertEqual(list(sgv.ops), [self.e.op, self.f.op, self.g.op, self.h.op])
def test_detach(self): """Test for ge.detach.""" sgv = ge.sgv(self.c.op, self.a.op) control_outputs = ge.util.ControlOutputs(self.graph) ge.detach(sgv, control_inputs=control_outputs) # make sure the detached graph is as expected. self.assertTrue(ge.matcher("^foo/c$") .input_ops("geph__a_0", "geph__b_0")(self.c.op))
def test_remove_unused_ops(self): sgv = ge.sgv(self.graph) self.assertEqual(list(sgv.outputs), [self.e, self.h]) self.assertEqual(len(sgv.ops), 8) sgv = sgv.remap_outputs(new_output_indices=[1]).remove_unused_ops() self.assertEqual(list(sgv.outputs), [self.h]) self.assertEqual(len(sgv.ops), 7)
def create_op_pruning_no_update( op: tf_compat.Operation, op_input: tf_compat.Tensor, ks_group: str, leave_enabled: bool = True, is_after_end_step: tf_compat.Tensor = None, ) -> PruningOpVars: """ Creates the necessary variables and operators to gradually apply sparsity to an operators variable without returning a PruningOpVars.update value. :param op: the operation to prune to the given sparsity :param op_input: the parameter within the op to create a mask for :param ks_group: the group identifier the scope should be created under mask_creator :param leave_enabled: True to continue masking the weights after end_epoch, False to stop masking :param is_after_end_step: only should be provided if leave_enabled is False; tensor that is true if the current global step is after end_epoch :return: a named tuple containing the assignment op, mask variable, threshold tensor, and masked tensor """ if tf_contrib_err: raise tf_contrib_err op_sgv = graph_editor.sgv(op) # create the necessary variables first with tf_compat.variable_scope(PruningScope.model(op, ks_group), reuse=tf_compat.AUTO_REUSE): mask = tf_compat.get_variable( PruningScope.VAR_MASK, op_input.get_shape(), initializer=tf_compat.ones_initializer(), trainable=False, dtype=op_input.dtype, ) tf_compat.add_to_collection( PruningScope.collection_name(ks_group, PruningScope.VAR_MASK), mask) # create the masked operation and assign as the new input to the op with tf_compat.name_scope( PruningScope.model(op, ks_group, trailing_slash=True)): masked = tf_compat.multiply(mask, op_input, PruningScope.OP_MASKED_VAR) op_inp_tens = (masked if leave_enabled else tf_compat.cond( is_after_end_step, lambda: op_input, lambda: masked)) op_swapped_inputs = [ inp if inp != op_input else op_inp_tens for inp in op_sgv.inputs ] graph_editor.swap_inputs(op, op_swapped_inputs) tf_compat.add_to_collection( PruningScope.collection_name(ks_group, PruningScope.OP_MASKED_VAR), masked) return PruningOpVars(op, op_input, None, mask, masked)
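# Minimal sketch (assumption: TF 1.x) of the input-swap step in create_op_pruning_no_update
# above: multiply an op's weight input by a mask and hand the op a full replacement input list
# via graph_editor.swap_inputs, mirroring the call pattern used in that snippet. The variable
# and op names ("mask", "masked_w") are illustrative, not from the original code.
import tensorflow as tf
from tensorflow.contrib import graph_editor as ge

g = tf.Graph()
with g.as_default():
    x = tf.placeholder(tf.float32, [None, 3], name="x")
    w = tf.Variable(tf.ones([3, 2]), name="w")
    matmul = tf.matmul(x, w, name="matmul")

    op_sgv = ge.sgv(matmul.op)
    weight_in = op_sgv.inputs[1]                               # the w/read tensor
    mask = tf.Variable(tf.ones([3, 2]), trainable=False, name="mask")
    masked = tf.multiply(mask, weight_in, name="masked_w")
    new_inputs = [inp if inp is not weight_in else masked for inp in op_sgv.inputs]
    ge.swap_inputs(matmul.op, new_inputs)                      # matmul now reads masked_w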
def test_subgraph_remap(self): sgv = ge.sgv(self.c.op) self.assertEqual(list(sgv.outputs), [self.c]) self.assertEqual(list(sgv.inputs), [self.a, self.b]) sgv = ge.sgv(self.c.op).remap([self.a], [0, self.c]) self.assertEqual(list(sgv.outputs), [self.c, self.c]) self.assertEqual(list(sgv.inputs), [self.a]) sgv = sgv.remap_outputs_to_consumers() self.assertEqual(list(sgv.outputs), [self.c, self.c, self.c]) sgv = sgv.remap_outputs_make_unique() self.assertEqual(list(sgv.outputs), [self.c]) sgv = sgv.remap(new_input_indices=[], new_output_indices=[]) self.assertEqual(len(sgv.inputs), 0) self.assertEqual(len(sgv.outputs), 0) sgv = sgv.remap_default() self.assertEqual(list(sgv.outputs), [self.c]) self.assertEqual(list(sgv.inputs), [self.a, self.b])
def test_reroute_can_modify(self): graph = tf.Graph() # create a special graph where "a" is an ambiguous tensor. That is # it is both an input and an output of the ops in sgv0. with graph.as_default(): a = tf.constant(1.0, shape=[2], name="a") b = tf.constant(2.0, shape=[2], name="b") c = tf.add(a, b, name="c") d = tf.add(a, c, name="d") e = tf.constant(1.0, shape=[2], name="e") f = tf.constant(2.0, shape=[2], name="f") g = tf.add(e, f, name="g") sgv0 = ge.sgv(a.op, b.op, c.op) sgv1 = ge.sgv(e.op, f.op) ge.reroute.swap_outputs(sgv0, sgv1) self.assertTrue(ge.matcher("g").input_ops("a", ge.matcher("c") .input_ops("a", "b"))(g.op)) self.assertTrue(ge.matcher("d").input_ops("e", "f")(d.op))
def _fuse_swapin_ops(self, src_op, swapout_op, bw_frontier_ops, ts0): """Fuse all swapin ops that swaps in the same tensor. This method does an in-place modification to the graph. Args: src_op: a `tf.Operation`. swapout_op: a `tf.Operation`. bw_frontier_ops: a set of `tf.Operation`. ts0: a `tf.Tensor`. Return: A set of `tf.Operation` that cannot be fused. """ fuse_bw_frontier_ops = { op for op in bw_frontier_ops if self._topo_sort.get_order(op) > 0} if len(fuse_bw_frontier_ops) >= 2: with tf.device(self._cpu_device): swap_in = tf.identity(ts0, name="lms/swapin") # Connect: swap_out -> swap_in self._connect_ops(swapout_op, swap_in.op) self._excl_ops.add(swap_in.op) # reuse swap_in tensors for op in fuse_bw_frontier_ops: # Connect: swap_in -> dest input_idx = ge.sgv( op, graph=self._graph).input_index(ts0) self._connect_ops(swap_in.op, op, remap_inputs=True, idx=input_idx) self._log_info( "{} (order {}) reuses tensor {}".format( op.name, self._topo_sort.get_order(op), ts0.name), 1) # control dependency -> swap_in min_order = self._topo_sort.size + 1 earliest_op = None for op in fuse_bw_frontier_ops: order = self._topo_sort.get_order(op) if order < min_order: min_order = order earliest_op = op if earliest_op: self._add_control_dependency(src_op, earliest_op, swap_in.op) bw_frontier_ops -= fuse_bw_frontier_ops return bw_frontier_ops
def convert_consts_to_var(graph, const_names_list): const_var_names_pairs = [] ops_to_delete = [] with graph.as_default(): var_list = [] for name in const_names_list: #tensor = graph.get_tensor_by_name('{}:0'.format(name)) tensor = graph.get_operation_by_name(name).outputs[0] with tf.Session() as sess: t_value = sess.run(tensor) t_name = '{}_const_var'.format(name) var = tf.Variable(t_value, name=t_name) const_var_names_pairs.append((name, t_name)) var_list.append(var) for const_name, var_name in const_var_names_pairs: const_op = graph.get_operation_by_name(const_name) var_op = graph.get_operation_by_name('{}/read'.format(var_name)) ge.swap_outputs(ge.sgv(const_op), ge.sgv(var_op)) ops_to_delete.append(const_op) tf.compat.v1.variables_initializer(var_list, 'init_constvars') return delete_nodes(graph, ops_to_delete)
def recompute_tensor(target, known_values, preceding_op=None, copy_known_values=False): """Computes target tensor from known_values. If preceding_op is not None, adds necessary control dependencies such that newly created computation takes place after preceding_op. If copy_known_values is set, also copies known_values (for nicer graph visualization) """ assert is_computable(target, known_values) # position of target in parent op target_pos = list(target.op.outputs).index(target) if copy_known_values: computation = ge.get_backward_walk_ops(target) else: computation = ge.get_backward_walk_ops(target, stop_at_ts=known_values) # create copy of computation copied_sgv, info = ge.copy_with_input_replacements(ge.sgv(computation), {}) # find our target tensor in the new computation new_target_op = info._transformed_ops[target.op] new_target = new_target_op.outputs[target_pos] new_computation = list(info._transformed_ops.values()) # restrict computation to run after given op SAVE_ON_CONTROL_EDGES = True if SAVE_ON_CONTROL_EDGES: # only add "run_after" control dependencies to root of computation, # the rest automatically runs after because of data dependencies # TODO: more efficient implementation by walking back from new_target # instead of whole graph computation_graph = linearize_lib.get_graph(restrict_to=new_computation) # note, toposort order is reversed from networkx/mine convention computation_root = list(toposort.toposort(computation_graph))[-1] for op in computation_root: run_after(op, preceding_op) else: if preceding_op is not None: for op in info._transformed_ops.values(): run_after(op, preceding_op) return new_target
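# Minimal sketch (assumption: TF 1.x) of the copy step used in recompute_tensor above:
# duplicate a small computation with ge.copy_with_input_replacements and look up the copied
# tensor through the returned info object's transformed() accessor. Tensor names are
# illustrative.
import tensorflow as tf
from tensorflow.contrib import graph_editor as ge

g = tf.Graph()
with g.as_default():
    x = tf.constant([1.0, 2.0], name="x")
    y = tf.square(x, name="y")
    target = tf.reduce_sum(y, name="target")

    ops_to_copy = ge.get_backward_walk_ops(target.op, inclusive=True, stop_at_ts=[x])
    _, info = ge.copy_with_input_replacements(ge.sgv(ops_to_copy), {})
    new_target = info.transformed(target)   # the copied counterpart of `target`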
def _add_swapin(self, swapout_op, dest_op, ts0): """Add a swapin operation to the graph. The swapin ops reads the output tensor of `swapout_op` and passes it to `dest_op`, replacing the input tensor `ts0` of `dest_op`. This method does an in-place modification to the graph. Example: the graph before and after this method invoked. ``` Before |ts0| -> (swapout_op) |ts0| -> (dest_op) After: |ts0| -> (swapout_op) -> (swapin_op) -> (dest_op) ``` Args: swapout_op: a `tf.Operation` that swapped out the tensor `ts0`. dest_op: a `tf.Operation` that will consume the output tensor of `swapout_op`. ts0: a `tf.Tensor` being the original input tensor of `dest_op`. Return: A `tf.Operation` newly added to the graph. """ with tf.device(self._cpu_device): swap_in = tf.identity(ts0, name="lms/swapin") # Connect: swap_out -> swap_in self._connect_ops(swapout_op, swap_in.op) # Connect: swap_in -> dest dest_svg = ge.sgv(dest_op, graph=self._graph) input_idx = dest_svg.input_index(ts0) self._connect_ops(swap_in.op, dest_op, remap_inputs=True, idx=input_idx) self._excl_ops.add(swap_in.op) self._log_info( "Consuming op {} (order {}) swaps in {}".format( dest_op.name, self._topo_sort.get_order(dest_op), ts0.name), 1) return swap_in.op
def get_op_input_var( operation: tf_compat.Operation, var_index: Union[str, int] = VAR_INDEX_FROM_TRAINABLE, ) -> tf_compat.Tensor: """ Get the input variable for an operation. Ex: the weight for a conv operator. See @get_op_var_index for proper values for var_index. :param operation: the operation to get the input variable for :param var_index: the index to guide which input to grab from the operation :return: the tensor input that represents the variable input for the operation """ if tf_contrib_err: raise tf_contrib_err op_sgv = graph_editor.sgv(operation) var_index = get_op_var_index(var_index, op_sgv.inputs) return op_sgv.inputs[var_index]
def _add_swapout(self, src_op, ts0): """Add a swapout operation to the graph to swap out the output tensor `ts0` of the operation `src_op`. This method does an in-place modification to the graph. Example: the graph before and after this method invoked. ``` Before (src_op) -> |ts0| -> (dest_op) After: (src_op) -> |ts0| -> (swapout_op) |ts0| -> (dest_op) ``` Args: src_op: a `tf.Operation` that produces the tensor `ts0`. ts0: a output `tf.Tensor` of `src_op` being swapped out. Return: A `tf.Operation` newly added to the graph. """ with tf.device(self._cpu_device): swap_out = tf.identity(ts0, name="lms/swapout") # Connect: src-node -> swap-out src_svg = ge.sgv(src_op, graph=self._graph) src_out_idx = src_svg.output_index(ts0) self._connect_ops(src_op, swap_out.op, remap_outputs=True, idx=src_out_idx) self._excl_ops.add(swap_out.op) self._log_info( "Tensor {} will be placed on {}".format(ts0.name, self._cpu_device), 1) return swap_out.op
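# Minimal sketch (assumption: TF 1.x) of the swap-out / swap-in rewiring shown in
# _add_swapout/_add_swapin above: put two CPU-resident tf.identity ops between a producer and
# a consumer, then splice them into the consumer's input slot with ge.connect. Names are
# illustrative.
import tensorflow as tf
from tensorflow.contrib import graph_editor as ge

g = tf.Graph()
with g.as_default():
    a = tf.constant([[1.0, 2.0]], name="a")
    ts0 = tf.nn.relu(a, name="src")              # tensor to be swapped out
    dest = tf.reduce_sum(ts0, name="dest")       # original consumer of ts0
    with tf.device("/cpu:0"):
        swap_out = tf.identity(ts0, name="lms_swapout")
        swap_in = tf.identity(swap_out, name="lms_swapin")
    # reconnect: src -> swapout -> swapin -> dest
    idx = ge.sgv(dest.op, graph=g).input_index(ts0)
    ge.connect(ge.sgv(swap_in.op), ge.sgv(dest.op).remap_inputs([idx]))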
def gradients(ys, xs, grad_ys=None, checkpoints='collection', **kwargs): ''' Authors: Tim Salimans & Yaroslav Bulatov memory efficient gradient implementation inspired by "Training Deep Nets with Sublinear Memory Cost" by Chen et al. 2016 (https://arxiv.org/abs/1604.06174) ys,xs,grad_ys,kwargs are the arguments to standard tensorflow tf.gradients (https://www.tensorflow.org/versions/r0.12/api_docs/python/train.html#gradients) 'checkpoints' can either be - a list consisting of tensors from the forward pass of the neural net that we should re-use when calculating the gradients in the backward pass all other tensors that do not appear in this list will be re-computed - a string specifying how this list should be determined. currently we support - 'speed': checkpoint all outputs of convolutions and matmuls. these ops are usually the most expensive, so checkpointing them maximizes the running speed (this is a good option if nonlinearities, concats, batchnorms, etc are taking up a lot of memory) - 'memory': try to minimize the memory usage (currently using a very simple strategy that identifies a number of bottleneck tensors in the graph to checkpoint) - 'collection': look for a tensorflow collection named 'checkpoints', which holds the tensors to checkpoint ''' # print("Calling memsaving gradients with", checkpoints) if not isinstance(ys, list): ys = [ys] if not isinstance(xs, list): xs = [xs] bwd_ops = ge.get_backward_walk_ops([y.op for y in ys], inclusive=True) debug_print("bwd_ops: %s", bwd_ops) # forward ops are all ops that are candidates for recomputation fwd_ops = ge.get_forward_walk_ops([x.op for x in xs], inclusive=True, within_ops=bwd_ops) debug_print("fwd_ops: %s", fwd_ops) # exclude ops with no inputs fwd_ops = [op for op in fwd_ops if op.inputs] # don't recompute xs, remove variables xs_ops = _to_ops(xs) fwd_ops = [op for op in fwd_ops if not op in xs_ops] fwd_ops = [op for op in fwd_ops if not '/assign' in op.name] fwd_ops = [op for op in fwd_ops if not '/Assign' in op.name] fwd_ops = [op for op in fwd_ops if not '/read' in op.name] ts_all = ge.filter_ts(fwd_ops, True) # get the tensors ts_all = [t for t in ts_all if '/read' not in t.name] ts_all = set(ts_all) - set(xs) - set(ys) # construct list of tensors to checkpoint during forward pass, if not # given as input if type(checkpoints) is not list: if checkpoints == 'collection': checkpoints = tf.get_collection('checkpoints') elif checkpoints == 'speed': # checkpoint all expensive ops to maximize running speed checkpoints = ge.filter_ts_from_regex(fwd_ops, 'conv2d|Conv|MatMul') elif checkpoints == 'memory': # remove very small tensors and some weird ops def fixdims( t ): # tf.Dimension values are not compatible with int, convert manually try: return [int(e if e.value is not None else 64) for e in t] except: return [0] # unknown shape ts_all = [ t for t in ts_all if np.prod(fixdims(t.shape)) > MIN_CHECKPOINT_NODE_SIZE ] ts_all = [t for t in ts_all if 'L2Loss' not in t.name] ts_all = [t for t in ts_all if 'entropy' not in t.name] ts_all = [t for t in ts_all if 'FusedBatchNorm' not in t.name] ts_all = [t for t in ts_all if 'Switch' not in t.name] ts_all = [t for t in ts_all if 'dropout' not in t.name] # DV: FP16_FIX - need to add 'Cast' layer here to make it work for FP16 ts_all = [t for t in ts_all if 'Cast' not in t.name] # filter out all tensors that are inputs of the backward graph with util.capture_ops() as bwd_ops: tf_gradients(ys, xs, grad_ys, **kwargs) bwd_inputs = [t for op in bwd_ops for t in op.inputs] # list of tensors in 
forward graph that is in input to bwd graph ts_filtered = list(set(bwd_inputs).intersection(ts_all)) debug_print("Using tensors %s", ts_filtered) # try two slightly different ways of getting bottlenecks tensors # to checkpoint for ts in [ts_filtered, ts_all]: # get all bottlenecks in the graph bottleneck_ts = [] for t in ts: b = set( ge.get_backward_walk_ops(t.op, inclusive=True, within_ops=fwd_ops)) f = set( ge.get_forward_walk_ops(t.op, inclusive=False, within_ops=fwd_ops)) # check that there are not shortcuts b_inp = set([inp for op in b for inp in op.inputs]).intersection(ts_all) f_inp = set([inp for op in f for inp in op.inputs]).intersection(ts_all) if not set(b_inp).intersection( f_inp) and len(b_inp) + len(f_inp) >= len(ts_all): bottleneck_ts.append(t) # we have a bottleneck! else: debug_print( "Rejected bottleneck candidate and ops %s", [t] + list(set(ts_all) - set(b_inp) - set(f_inp))) # success? or try again without filtering? if len(bottleneck_ts) >= np.sqrt( len(ts_filtered)): # yes, enough bottlenecks found! break if not bottleneck_ts: raise Exception( 'unable to find bottleneck tensors! please provide checkpoint nodes manually, or use checkpoints="speed".' ) # sort the bottlenecks bottlenecks_sorted_lists = tf_toposort(bottleneck_ts, within_ops=fwd_ops) sorted_bottlenecks = [ t for ts in bottlenecks_sorted_lists for t in ts ] # save an approximately optimal number ~ sqrt(N) N = len(ts_filtered) if len(bottleneck_ts) <= np.ceil(np.sqrt(N)): checkpoints = sorted_bottlenecks else: step = int(np.ceil(len(bottleneck_ts) / np.sqrt(N))) checkpoints = sorted_bottlenecks[step::step] else: raise Exception('%s is unsupported input for "checkpoints"' % (checkpoints, )) checkpoints = list(set(checkpoints).intersection(ts_all)) # at this point automatic selection happened and checkpoints is list of nodes assert isinstance(checkpoints, list) debug_print("Checkpoint nodes used: %s", checkpoints) # better error handling of special cases # xs are already handled as checkpoint nodes, so no need to include them xs_intersect_checkpoints = set(xs).intersection(set(checkpoints)) if xs_intersect_checkpoints: debug_print("Warning, some input nodes are also checkpoint nodes: %s", xs_intersect_checkpoints) ys_intersect_checkpoints = set(ys).intersection(set(checkpoints)) debug_print("ys: %s, checkpoints: %s, intersect: %s", ys, checkpoints, ys_intersect_checkpoints) # saving an output node (ys) gives no benefit in memory while creating # new edge cases, exclude them if ys_intersect_checkpoints: debug_print( "Warning, some output nodes are also checkpoints nodes: %s", format_ops(ys_intersect_checkpoints)) # remove initial and terminal nodes from checkpoints list if present checkpoints = list(set(checkpoints) - set(ys) - set(xs)) # check that we have some nodes to checkpoint # if not checkpoints: # raise Exception('no checkpoints nodes found or given as input! 
') # disconnect dependencies between checkpointed tensors checkpoints_disconnected = {} for x in checkpoints: if x.op and x.op.name is not None: grad_node = tf.stop_gradient(x, name=x.op.name + "_sg") else: grad_node = tf.stop_gradient(x) checkpoints_disconnected[x] = grad_node # partial derivatives to the checkpointed tensors and xs ops_to_copy = fast_backward_ops(seed_ops=[y.op for y in ys], stop_at_ts=checkpoints, within_ops=fwd_ops) debug_print("Found %s ops to copy within fwd_ops %s, seed %s, stop_at %s", len(ops_to_copy), fwd_ops, [r.op for r in ys], checkpoints) debug_print("ops_to_copy = %s", ops_to_copy) debug_print("Processing list %s", ys) copied_sgv, info = ge.copy_with_input_replacements(ge.sgv(ops_to_copy), {}) for origin_op, op in info._transformed_ops.items(): op._set_device(origin_op.node_def.device) copied_ops = info._transformed_ops.values() debug_print("Copied %s to %s", ops_to_copy, copied_ops) ge.reroute_ts(checkpoints_disconnected.values(), checkpoints_disconnected.keys(), can_modify=copied_ops) debug_print("Rewired %s in place of %s restricted to %s", checkpoints_disconnected.values(), checkpoints_disconnected.keys(), copied_ops) # get gradients with respect to current boundary + original x's copied_ys = [info._transformed_ops[y.op]._outputs[0] for y in ys] boundary = list(checkpoints_disconnected.values()) dv = tf_gradients(ys=copied_ys, xs=boundary + xs, grad_ys=grad_ys, **kwargs) debug_print("Got gradients %s", dv) debug_print("for %s", copied_ys) debug_print("with respect to %s", boundary + xs) inputs_to_do_before = [y.op for y in ys] if grad_ys is not None: inputs_to_do_before += grad_ys wait_to_do_ops = list(copied_ops) + [g.op for g in dv if g is not None] my_add_control_inputs(wait_to_do_ops, inputs_to_do_before) # partial derivatives to the checkpointed nodes # dictionary of "node: backprop" for nodes in the boundary d_checkpoints = { r: dr for r, dr in zip(checkpoints_disconnected.keys(), dv[:len(checkpoints_disconnected)]) } # partial derivatives to xs (usually the params of the neural net) d_xs = dv[len(checkpoints_disconnected):] # incorporate derivatives flowing through the checkpointed nodes checkpoints_sorted_lists = tf_toposort(checkpoints, within_ops=fwd_ops) for ts in checkpoints_sorted_lists[::-1]: debug_print("Processing list %s", ts) checkpoints_other = [r for r in checkpoints if r not in ts] checkpoints_disconnected_other = [ checkpoints_disconnected[r] for r in checkpoints_other ] # copy part of the graph below current checkpoint node, stopping at # other checkpoints nodes ops_to_copy = fast_backward_ops(within_ops=fwd_ops, seed_ops=[r.op for r in ts], stop_at_ts=checkpoints_other) debug_print("Found %s ops to copy within %s, seed %s, stop_at %s", len(ops_to_copy), fwd_ops, [r.op for r in ts], checkpoints_other) debug_print("ops_to_copy = %s", ops_to_copy) if not ops_to_copy: # we're done! 
break copied_sgv, info = ge.copy_with_input_replacements( ge.sgv(ops_to_copy), {}) for origin_op, op in info._transformed_ops.items(): op._set_device(origin_op.node_def.device) copied_ops = info._transformed_ops.values() debug_print("Copied %s to %s", ops_to_copy, copied_ops) ge.reroute_ts(checkpoints_disconnected_other, checkpoints_other, can_modify=copied_ops) debug_print("Rewired %s in place of %s restricted to %s", checkpoints_disconnected_other, checkpoints_other, copied_ops) # gradient flowing through the checkpointed node boundary = [info._transformed_ops[r.op]._outputs[0] for r in ts] substitute_backprops = [d_checkpoints[r] for r in ts] dv = tf_gradients(boundary, checkpoints_disconnected_other + xs, grad_ys=substitute_backprops, **kwargs) debug_print("Got gradients %s", dv) debug_print("for %s", boundary) debug_print("with respect to %s", checkpoints_disconnected_other + xs) debug_print("with boundary backprop substitutions %s", substitute_backprops) inputs_to_do_before = [d_checkpoints[r].op for r in ts] wait_to_do_ops = list(copied_ops) + [g.op for g in dv if g is not None] my_add_control_inputs(wait_to_do_ops, inputs_to_do_before) # partial derivatives to the checkpointed nodes for r, dr in zip(checkpoints_other, dv[:len(checkpoints_other)]): if dr is not None: if d_checkpoints[r] is None: d_checkpoints[r] = dr else: d_checkpoints[r] += dr def _unsparsify(x): if not isinstance(x, tf.IndexedSlices): return x assert x.dense_shape is not None, "memory_saving_gradients encountered sparse gradients of unknown shape" indices = x.indices while indices.shape.ndims < x.values.shape.ndims: indices = tf.expand_dims(indices, -1) return tf.scatter_nd(indices, x.values, x.dense_shape) # partial derivatives to xs (usually the params of the neural net) d_xs_new = dv[len(checkpoints_other):] for j in range(len(xs)): if d_xs_new[j] is not None: if d_xs[j] is None: d_xs[j] = _unsparsify(d_xs_new[j]) else: d_xs[j] += _unsparsify(d_xs_new[j]) return d_xs
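# Hypothetical usage sketch for the memory-saving gradients() defined above. The module name
# `memory_saving_gradients` and the toy model are illustrative assumptions, not from the
# original code; assumes TF 1.x.
import tensorflow as tf
from memory_saving_gradients import gradients  # assumed module name for the function above

x = tf.placeholder(tf.float32, [None, 64])
w = tf.Variable(tf.random_normal([64, 64]))
h = tf.nn.relu(tf.matmul(x, w))
loss = tf.reduce_sum(tf.nn.relu(tf.matmul(h, w)))

# checkpoint conv/matmul outputs and recompute the rest during the backward pass
grads = gradients(loss, [w], checkpoints='speed')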
# Select the correct op from the forward walk to connect to for fw_op in fw_ops: if fw_op.type not in not_types: next_op = fw_op break if next_op is None: raise ValueError('No suitable next op found to connect to. Try looking at the graph or full list of forward ops') # Add placeholder and variable add_op = tf.add(var, delta_placeholder) # TODO - might be neater if these were created in the same scope as the variable; also might solve issue with connecting add ops within while loop # Connect add_op output to next op input # Create subgraph 1 (outputs) sgv0 = ge.sgv(add_op.op) # Create subgraph 2 (inputs) sgv1 = ge.sgv(next_op).remap_inputs([1]) # Connect ge.connect(sgv0, sgv1) # TODO - sort out error with tf.while loops; may not be possible: try the assign_add method first # Define parameter update ops update = 0.01*delta_placeholder new_update_op = tf.assign_sub(var, update) if gamma_update_op is not None: gamma_update_op = tf.group(new_update_op, gamma_update_op) else: gamma_update_op = new_update_op
def gradients(ys, xs, # pylint: disable: too-many-statements, too-many-branches grad_ys=None, checkpoints='collection', **kwargs): ''' Authors: Tim Salimans & Yaroslav Bulatov memory efficient gradient implementation inspired by "Training Deep Nets with Sublinear Memory Cost" by Chen et al. 2016 (https://arxiv.org/abs/1604.06174) ys,xs,grad_ys,kwargs are the arguments to standard tensorflow tf.gradients (https://www.tensorflow.org/versions/r0.12/api_docs/python/train.html#gradients) 'checkpoints' can either be - a list consisting of tensors from the forward pass of the neural net that we should re-use when calculating the gradients in the backward pass all other tensors that do not appear in this list will be re-computed - a string specifying how this list should be determined. currently we support - 'speed': checkpoint all outputs of convolutions and matmuls. these ops are usually the most expensive, so checkpointing them maximizes the running speed (this is a good option if nonlinearities, concats, batchnorms, etc are taking up a lot of memory) - 'memory': try to minimize the memory usage (currently using a very simple strategy that identifies a number of bottleneck tensors in the graph to checkpoint) - 'collection': look for a tensorflow collection named 'checkpoints', which holds the tensors to checkpoint ''' # print("Calling memsaving gradients with", checkpoints) if not isinstance(ys, list): ys = [ys] if not isinstance(xs, list): xs = [xs] bwd_ops = ge.get_backward_walk_ops([y.op for y in ys], inclusive=True) debug_print("bwd_ops: {}".format(bwd_ops)) # forward ops are all ops that are candidates for recomputation fwd_ops = ge.get_forward_walk_ops([x.op for x in xs], inclusive=True, within_ops=bwd_ops) debug_print("fwd_ops: {}".format(fwd_ops)) # exclude ops with no inputs fwd_ops = [op for op in fwd_ops if op.inputs] # don't recompute xs, remove variables xs_ops = _to_ops(xs) fwd_ops = [op for op in fwd_ops if op not in xs_ops] fwd_ops = [op for op in fwd_ops if '/assign' not in op.name] fwd_ops = [op for op in fwd_ops if '/Assign' not in op.name] fwd_ops = [op for op in fwd_ops if '/read' not in op.name] ts_all = ge.filter_ts(fwd_ops, True) # get the tensors ts_all = [t for t in ts_all if '/read' not in t.name] ts_all = set(ts_all) - set(xs) - set(ys) # construct list of tensors to checkpoint during forward pass, if not # given as input if type(checkpoints) is not list: if checkpoints == 'collection': checkpoints = tf.get_collection('checkpoints') elif checkpoints == 'speed': # checkpoint all expensive ops to maximize running speed checkpoints = ge.filter_ts_from_regex(fwd_ops, 'conv2d|Conv|MatMul') elif checkpoints == 'memory': # remove very small tensors and some weird ops def fixdims(t): # tf.Dimension values are not compatible with int, convert manually try: return [int(e if e.value is not None else 64) for e in t] except: return [0] # unknown shape ts_all = [t for t in ts_all if np.prod(fixdims(t.shape)) > MIN_CHECKPOINT_NODE_SIZE] ts_all = [t for t in ts_all if 'L2Loss' not in t.name] ts_all = [t for t in ts_all if 'entropy' not in t.name] ts_all = [t for t in ts_all if 'FusedBatchNorm' not in t.name] ts_all = [t for t in ts_all if 'Switch' not in t.name] ts_all = [t for t in ts_all if 'dropout' not in t.name] # DV: FP16_FIX - need to add 'Cast' layer here to make it work for FP16 ts_all = [t for t in ts_all if 'Cast' not in t.name] # filter out all tensors that are inputs of the backward graph with util.capture_ops() as bwd_ops: tf_gradients(ys, xs, grad_ys, **kwargs) 
bwd_inputs = [t for op in bwd_ops for t in op.inputs] # list of tensors in forward graph that is in input to bwd graph ts_filtered = list(set(bwd_inputs).intersection(ts_all)) debug_print("Using tensors {}".format(ts_filtered)) # try two slightly different ways of getting bottlenecks tensors # to checkpoint for ts in [ts_filtered, ts_all]: # get all bottlenecks in the graph bottleneck_ts = [] for t in ts: b = set(ge.get_backward_walk_ops(t.op, inclusive=True, within_ops=fwd_ops)) f = set(ge.get_forward_walk_ops(t.op, inclusive=False, within_ops=fwd_ops)) # check that there are not shortcuts b_inp = set([inp for op in b for inp in op.inputs]).intersection(ts_all) f_inp = set([inp for op in f for inp in op.inputs]).intersection(ts_all) if not set(b_inp).intersection(f_inp) and len(b_inp)+len(f_inp) >= len(ts_all): bottleneck_ts.append(t) # we have a bottleneck! else: debug_print("Rejected bottleneck candidate and ops {}".format( [t] + list(set(ts_all) - set(b_inp) - set(f_inp)))) # success? or try again without filtering? if len(bottleneck_ts) >= np.sqrt(len(ts_filtered)): # enough bottlenecks found! break if not bottleneck_ts: raise Exception('unable to find bottleneck tensors! please provide checkpoint ' 'nodes manually, or use checkpoints="speed".') # sort the bottlenecks bottlenecks_sorted_lists = tf_toposort(bottleneck_ts, within_ops=fwd_ops) sorted_bottlenecks = [t for ts in bottlenecks_sorted_lists for t in ts] # save an approximately optimal number ~ sqrt(N) N = len(ts_filtered) if len(bottleneck_ts) <= np.ceil(np.sqrt(N)): checkpoints = sorted_bottlenecks else: step = int(np.ceil(len(bottleneck_ts) / np.sqrt(N))) checkpoints = sorted_bottlenecks[step::step] else: raise Exception('%s is unsupported input for "checkpoints"' % (checkpoints,)) checkpoints = list(set(checkpoints).intersection(ts_all)) # at this point automatic selection happened and checkpoints is list of nodes assert isinstance(checkpoints, list) debug_print("Checkpoint nodes used: {}".format(checkpoints)) # better error handling of special cases # xs are already handled as checkpoint nodes, so no need to include them xs_intersect_checkpoints = set(xs).intersection(set(checkpoints)) if xs_intersect_checkpoints: debug_print("Warning, some input nodes are also checkpoint nodes: {}".format( xs_intersect_checkpoints)) ys_intersect_checkpoints = set(ys).intersection(set(checkpoints)) debug_print("ys: {}, checkpoints:{}, intersect: {}".format( ys, checkpoints, ys_intersect_checkpoints)) # saving an output node (ys) gives no benefit in memory while creating # new edge cases, exclude them if ys_intersect_checkpoints: debug_print("Warning, some output nodes are also checkpoints nodes: {}".format( format_ops(ys_intersect_checkpoints))) # remove initial and terminal nodes from checkpoints list if present checkpoints = list(set(checkpoints) - set(ys) - set(xs)) # check that we have some nodes to checkpoint if not checkpoints: raise Exception('no checkpoints nodes found or given as input! 
') # disconnect dependencies between checkpointed tensors checkpoints_disconnected = {} for x in checkpoints: if x.op and x.op.name is not None: grad_node = tf.stop_gradient(x, name=x.op.name+"_sg") else: grad_node = tf.stop_gradient(x) checkpoints_disconnected[x] = grad_node # partial derivatives to the checkpointed tensors and xs ops_to_copy = fast_backward_ops(seed_ops=[y.op for y in ys], stop_at_ts=checkpoints, within_ops=fwd_ops) debug_print("Found {} ops to copy within fwd_ops {}, seed {}, stop_at {}".format( len(ops_to_copy), fwd_ops, [r.op for r in ys], checkpoints)) debug_print("ops_to_copy = {}".format(ops_to_copy)) debug_print("Processing list {}".format(ys)) _, info = ge.copy_with_input_replacements(ge.sgv(ops_to_copy), {}) for origin_op, op in info._transformed_ops.items(): op._set_device(origin_op.node_def.device) copied_ops = info._transformed_ops.values() debug_print("Copied {} to {}".format(ops_to_copy, copied_ops)) ge.reroute_ts(checkpoints_disconnected.values(), checkpoints_disconnected.keys(), can_modify=copied_ops) debug_print("Rewired {} in place of {} restricted to {}".format( checkpoints_disconnected.values(), checkpoints_disconnected.keys(), copied_ops)) # get gradients with respect to current boundary + original x's copied_ys = [info._transformed_ops[y.op]._outputs[0] for y in ys] boundary = list(checkpoints_disconnected.values()) dv = tf_gradients(ys=copied_ys, xs=boundary+xs, grad_ys=grad_ys, **kwargs) debug_print("Got gradients {}".format(dv)) debug_print("for %s", copied_ys) debug_print("with respect to {}".format(boundary+xs)) inputs_to_do_before = [y.op for y in ys] if grad_ys is not None: inputs_to_do_before += grad_ys wait_to_do_ops = list(copied_ops) + [g.op for g in dv if g is not None] my_add_control_inputs(wait_to_do_ops, inputs_to_do_before) # partial derivatives to the checkpointed nodes # dictionary of "node: backprop" for nodes in the boundary d_checkpoints = {r: dr for r, dr in zip(checkpoints_disconnected.keys(), dv[:len(checkpoints_disconnected)])} # partial derivatives to xs (usually the params of the neural net) d_xs = dv[len(checkpoints_disconnected):] # incorporate derivatives flowing through the checkpointed nodes checkpoints_sorted_lists = tf_toposort(checkpoints, within_ops=fwd_ops) for ts in checkpoints_sorted_lists[::-1]: debug_print("Processing list {}".format(ts)) checkpoints_other = [r for r in checkpoints if r not in ts] checkpoints_disconnected_other = [checkpoints_disconnected[r] for r in checkpoints_other] # copy part of the graph below current checkpoint node, stopping at # other checkpoints nodes ops_to_copy = fast_backward_ops(within_ops=fwd_ops, seed_ops=[r.op for r in ts], stop_at_ts=checkpoints_other) debug_print("Found {} ops to copy within {}, seed {}, stop_at {}".format( len(ops_to_copy), fwd_ops, [r.op for r in ts], checkpoints_other)) debug_print("ops_to_copy = {}".format(ops_to_copy)) if not ops_to_copy: # we're done! 
break _, info = ge.copy_with_input_replacements(ge.sgv(ops_to_copy), {}) for origin_op, op in info._transformed_ops.items(): op._set_device(origin_op.node_def.device) copied_ops = info._transformed_ops.values() debug_print("Copied {} to {}".format(ops_to_copy, copied_ops)) ge.reroute_ts(checkpoints_disconnected_other, checkpoints_other, can_modify=copied_ops) debug_print("Rewired %s in place of %s restricted to %s", checkpoints_disconnected_other, checkpoints_other, copied_ops) # gradient flowing through the checkpointed node boundary = [info._transformed_ops[r.op]._outputs[0] for r in ts] substitute_backprops = [d_checkpoints[r] for r in ts] dv = tf_gradients(boundary, checkpoints_disconnected_other+xs, grad_ys=substitute_backprops, **kwargs) debug_print("Got gradients {}".format(dv)) debug_print("for {}".format(boundary)) debug_print("with respect to {}".format(checkpoints_disconnected_other+xs)) debug_print("with boundary backprop substitutions {}".format(substitute_backprops)) inputs_to_do_before = [d_checkpoints[r].op for r in ts] wait_to_do_ops = list(copied_ops) + [g.op for g in dv if g is not None] my_add_control_inputs(wait_to_do_ops, inputs_to_do_before) # partial derivatives to the checkpointed nodes for r, dr in zip(checkpoints_other, dv[:len(checkpoints_other)]): if dr is not None: if d_checkpoints[r] is None: d_checkpoints[r] = dr else: d_checkpoints[r] += dr def _unsparsify(var_x): if not isinstance(var_x, tf.IndexedSlices): return var_x assert var_x.dense_shape is not None, \ "memory_saving_gradients encountered sparse gradients of unknown shape" indices = var_x.indices while indices.shape.ndims < var_x.values.shape.ndims: indices = tf.expand_dims(indices, -1) return tf.scatter_nd(indices, var_x.values, var_x.dense_shape) # partial derivatives to xs (usually the params of the neural net) d_xs_new = dv[len(checkpoints_other):] for j in range(len(xs)): if d_xs_new[j] is not None: if d_xs[j] is None: d_xs[j] = _unsparsify(d_xs_new[j]) else: d_xs[j] += _unsparsify(d_xs_new[j]) return d_xs
def apply(self, new_inputs, update_colocation_groups=True): assert len(new_inputs) == len(self.inputs) g = tf.get_default_graph() # todo: make that member variable new_inputs2 = [] # replace variable inputs with their read endpoint for input in new_inputs: if isinstance(input, tf.Variable): new_inputs2.append(input.read_value()) else: new_inputs2.append(input) new_inputs = new_inputs2 replacements = OrderedDict() for old_input_ts, new_input_ts in zip(self.inputs, new_inputs): # shape/dtype checks if isinstance(old_input_ts, (list, tuple)): reference_ts = old_input_ts[0] else: reference_ts = old_input_ts assert reference_ts.get_shape() == new_input_ts.get_shape() assert reference_ts.dtype == new_input_ts.dtype # Variable with multiple read endpoints, replace all of them with # new input tensor if isinstance(old_input_ts, (list, tuple)): for sub_input in old_input_ts: replacements[sub_input] = new_input_ts # regular Tensor else: replacements[old_input_ts] = new_input_ts # sanity checks # copying Variables is confusing because they don't get added # to GLOBAL_VARIABLES collection hence escape global initialization # therefore forbit it for op in self.ops: if op.type.startswith('Variable'): # 'VariableV2' or 'Variable' assert False, "Can't copy variables" # TODO: remove this def summarize_ts(ts): from collections import Counter type_counter = Counter() ops = set([tensor.op for tensor in ts]) print Counter([op.type for op in ops]).most_common(10) sgv = ge.sgv(self.ops) # import pdb; pdb.set_trace() copied_sgv, info = ge.copy_with_input_replacements(sgv, replacements) # converting between Python bytes and unicode def to_bytes(s): return s.encode('ascii') def from_bytes(s): return s.decode('ascii') # fix colocation constraints to point to copied ops # see https://github.com/tensorflow/tensorflow/issues/9925 if update_colocation_groups: new_ops = [info._transformed_ops[op] for op in self.ops] for new_op in new_ops: assert len(new_op.colocation_groups()) == 1 colocation_group = new_op.colocation_groups()[0] assert colocation_group.startswith(b'loc:@') colocated_with_name = from_bytes(colocation_group[len(b'loc:@'):]) # if there were no colocation constraints, the op gets colocated with # itself (default colocation group), ignore that constraint if colocated_with_name == new_op.name: continue colocation_op = g.get_operation_by_name(colocated_with_name) if colocation_op in info._transformed_ops: new_colocation_op = info._transformed_ops[colocation_op] else: assert colocation_op in self.input_ops colocation_op_idx = self.input_ops.index(colocation_op) new_colocation_op = new_inputs[colocation_op_idx].op # overwrite existing _class attribute with new colocation constraints new_colocation_groups = [b'loc:@'+to_bytes(new_colocation_op.name)] new_op.node_def.attr["_class"].CopyFrom(attr_value_pb2.AttrValue( list=attr_value_pb2.AttrValue.ListValue(s=new_colocation_groups))) # place new ops on device from current device context device = get_current_device() if device: for op in info._transformed_ops.values(): op._set_device(device) new_outputs = [] for old_output_ts in self.outputs: new_output_op = info._transformed_ops[old_output_ts.op] new_output_ts = new_output_op.outputs[0] new_outputs.append(new_output_ts) return new_outputs
def get_ops_and_inputs_by_name_or_regex( var_names: List[str], graph: tf_compat.Graph = None, ) -> List[Tuple[tf_compat.Operation, tf_compat.Tensor]]: """ Get tuples of operations and the inputs for inputs of operations that match a regex pattern in the list params. :param var_names: List of full names or regex patterns to match variable names by. :param graph: the graph to get the prunable operations from. If not supplied, then will use the default graph :return: a list of (operation, parameter) pairs for parameters that match a regex pattern in var_names. If the wildcards '.' or '.*' are provided as regex patterns, then will match on all prunable layers and return variables using get_op_input_var """ if tf_contrib_err: raise tf_contrib_err if not graph: graph = tf_compat.get_default_graph() prunable_ops_and_inputs = [] if "re:.*" in var_names or "re:." in var_names: # wildcard cases ops = get_prunable_ops(graph) for _, op in ops: prunable_ops_and_inputs.append((op, get_op_input_var(op))) else: for var in tf_compat.global_variables(): if any_str_or_regex_matches_tensor_name(var.name, var_names): var_tens = graph.get_tensor_by_name(var.name) # get all the read ops for the var read_ops = [ read_op for read_op in graph_editor.get_consuming_ops(var_tens) if "/read" == read_op.name[-5:] ] # filter for /read ops read_tensors = { read_tensor for read_op in read_ops for read_tensor in graph_editor.sgv(read_op).outputs } # gets ops that read from read_tensors and filters any ops # that were created by mask_ks consuming_ops_with_input = [ (consuming_op, read_tensor) for read_tensor in read_tensors for consuming_op in graph_editor.get_consuming_ops(read_tensor) ] for op, inp in consuming_ops_with_input: if "_nm_ks" not in op.name: prunable_ops_and_inputs.append((op, inp)) else: nm_ks_consuming_ops_with_input = [ (consuming_op, inp) for output_tens in graph_editor.sgv(op).outputs for consuming_op in graph_editor.get_consuming_ops( output_tens ) if "_nm_ks" not in consuming_op.name ] prunable_ops_and_inputs += nm_ks_consuming_ops_with_input # Check that all var_names values have a match _validate_all_params_found(var_names, prunable_ops_and_inputs) return prunable_ops_and_inputs
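# Minimal sketch (assumption: TF 1.x) of the lookup pattern used above: locate a variable's
# /read op and the ops consuming its output with ge.get_consuming_ops. Names are illustrative.
import tensorflow as tf
from tensorflow.contrib import graph_editor as ge

g = tf.Graph()
with g.as_default():
    x = tf.placeholder(tf.float32, [None, 3], name="x")
    w = tf.Variable(tf.ones([3, 2]), name="w")
    out = tf.matmul(x, w, name="out")

    var_tens = g.get_tensor_by_name("w:0")
    read_ops = [op for op in ge.get_consuming_ops(var_tens) if op.name.endswith("/read")]
    read_tensors = {t for op in read_ops for t in ge.sgv(op).outputs}
    consumers = [op for t in read_tensors for op in ge.get_consuming_ops(t)]
    # consumers -> [the MatMul op named "out"]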
fact_levels = [32, 30, 25, 20, 15, 10, 5]
for lvl in fact_levels:
    logEntry(lvl)

    # load the original graph
    graph = load_graph("../model/pb_files/rnn-deep-250-32-2.pb")
    W1 = graph.get_tensor_by_name('prefix/w_in:0')
    matmul = graph.get_tensor_by_name('prefix/MatMul:0')
    bias = graph.get_tensor_by_name('prefix/b_in:0')
    add = graph.get_tensor_by_name('prefix/add:0')
    reshape = graph.get_tensor_by_name('prefix/Reshape:0')

    # remove all connections from matmul
    ge.detach(ge.sgv(matmul.op))

    with tf.Session(graph=graph) as sess:
        os.system("mkdir /flush1/raj034/RNN/model/test_cases/" + breath + "/" + quant + "/fact_" + str(lvl))
        # for op in sess.graph.get_operations():
        #     print(op.name)
        W = W1.eval()
        u, s, v, ss = svd_compress_gs(W, lvl)
        logEntry("structural_similarity == > " + str(ss))
        u1 = tf.matmul(reshape, u, name="prefix/u1")
        s1 = tf.matmul(u1, s, name="prefix/s1")
        v1 = tf.matmul(s1, v, name="prefix/v1")
        ge.connect(ge.sgv(v1.op), ge.sgv(add.op).remap_inputs([0]))
def minimize(self, loss, var_list=None, global_step=None): orig_graph_view = None trainable_vars = var_list if var_list != None else tf.trainable_variables( ) if self.inputs is not None: seed_ops = [t.op for t in self.inputs] result = list(seed_ops) wave = set(seed_ops) while wave: # stolen from grap_editor.select new_wave = set() for op in wave: for new_t in op.outputs: if new_t == loss: continue for new_op in new_t.consumers(): #if new_op not in result and is_within(new_op): if new_op not in result: new_wave.add(new_op) for op in new_wave: if op not in result: result.append(op) wave = new_wave orig_graph_view = ge.sgv(result) else: orig_graph_view = ge.sgv(self.work_graph) self.global_step_tensor = tf.Variable( 0, name='global_step', trainable=False) if global_step is None else global_step # Perturbations deltas = {} n_perturbations = {} p_perturbations = {} with tf.name_scope("Perturbator"): self.c_t = tf.div( self.c, tf.pow( tf.add(tf.cast(self.global_step_tensor, tf.float32), tf.constant(1, dtype=tf.float32)), self.gamma), name="SPSA_ct") # self.c_t = 0.00 #MOD for var in trainable_vars: self.num_params += self._mul_dims(var.get_shape()) var_name = var.name.split(':')[0] random = Bernoulli(tf.fill(var.get_shape(), 0.5), dtype=tf.float32) deltas[var] = tf.subtract(tf.constant(1, dtype=tf.float32), tf.scalar_mul( tf.constant(2, dtype=tf.float32), random.sample(1)[0]), name="SPSA_delta") c_t_delta = tf.scalar_mul(tf.reshape(self.c_t, []), deltas[var]) n_perturbations[var_name + '/read:0'] = tf.subtract( var, c_t_delta, name="perturb_n") p_perturbations[var_name + '/read:0'] = tf.add( var, c_t_delta, name="perturb_p") # print("{} parameters".format(self.num_params)) # Evaluator with tf.name_scope("Evaluator"): _, self.ninfo = self._clone_model(orig_graph_view, n_perturbations, 'N_Eval') _, self.pinfo = self._clone_model(orig_graph_view, p_perturbations, 'P_Eval') # Weight Updater optimizer_ops = [] with tf.control_dependencies([loss]): with tf.name_scope('Updater'): a_t = self.a / (tf.pow( tf.add(tf.cast(self.global_step_tensor, tf.float32), tf.constant(1, dtype=tf.float32)), self.alpha)) # a_t = 0.00 #MOD for var in trainable_vars: l_pos = self.pinfo.transformed(loss) l_neg = self.ninfo.transformed(loss) # print( "l_pos: ", l_pos) # print( "l_neg: ", l_neg) ghat = (l_pos - l_neg) / (tf.constant(2, dtype=tf.float32) * self.c_t * deltas[var]) optimizer_ops.append(tf.assign_sub(var, a_t * ghat)) grp = control_flow_ops.group(*optimizer_ops) with tf.control_dependencies([grp]): tf.assign_add(self.global_step_tensor, tf.constant(1, dtype=self.global_step_tensor.dtype)) return grp
def test_bypass(self): """Test for ge.bypass.""" ge.bypass(ge.sgv(self.f.op).remap_inputs([0])) self.assertTrue(ge.matcher("^foo/bar/h$").input_ops("^foo/c$", "foo/bar/g$") (self.h.op))