def testGradientAtZero(self):
  with self.test_session():
    logits = constant_op.constant([0.0, 0.0], dtype=dtypes.float64)
    targets = constant_op.constant([0.0, 1.0], dtype=dtypes.float64)
    # Modern TF requires named arguments here; positional calls raise an error.
    loss = nn_impl.sigmoid_cross_entropy_with_logits(labels=targets,
                                                     logits=logits)
    grads = gradients_impl.gradients(loss, logits)[0].eval()
  self.assertAllClose(grads, [0.5, -0.5])
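# The expected gradients follow from the closed form of the loss: TensorFlow
# computes max(x, 0) - x * z + log(1 + exp(-|x|)), whose derivative with
# respect to the logit x is sigmoid(x) - z; at x = 0 this is 0.5 - z, i.e.
# 0.5 for target 0 and -0.5 for target 1. A standalone NumPy sketch
# (illustrative names, not part of the test suite) checking the same numbers
# with a central difference:
import numpy as np

def sigmoid_xent(x, z):
  # Numerically stable form used by TensorFlow:
  # max(x, 0) - x * z + log(1 + exp(-|x|))
  return np.maximum(x, 0) - x * z + np.log1p(np.exp(-np.abs(x)))

x, z = np.array([0.0, 0.0]), np.array([0.0, 1.0])
eps = 1e-6
numeric_grad = (sigmoid_xent(x + eps, z) - sigmoid_xent(x - eps, z)) / (2 * eps)
analytic_grad = 1.0 / (1.0 + np.exp(-x)) - z  # sigmoid(x) - z
print(numeric_grad)   # ~[0.5, -0.5]
print(analytic_grad)  # [0.5, -0.5]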
def testConstructionNamed(self):
  with self.test_session():
    logits, targets, _ = self._Inputs()
    loss = nn_impl.sigmoid_cross_entropy_with_logits(labels=targets,
                                                     logits=logits,
                                                     name="mylogistic")
  self.assertEqual("mylogistic", loss.op.name)
def binary_cross_entropy(labels, logits, name=None):
  """Computes the binary cross entropy between `labels` and `logits`.

  This is a safe version: the logits are passed through `ensure_finite`
  to guard against non-finite values (and hence `log(0)`) in the loss.
  """
  return nn_impl.sigmoid_cross_entropy_with_logits(
      logits=ensure_finite(logits), labels=labels, name=name)
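# A minimal usage sketch. `ensure_finite` below is a hypothetical stand-in
# (its real definition is not shown in the snippet above), and the public
# tf.nn.sigmoid_cross_entropy_with_logits stands in for nn_impl:
import tensorflow as tf

def ensure_finite(t, clip_value=1e9):
  # Hypothetical helper: zero out non-finite entries, then clip.
  t = tf.where(tf.math.is_finite(t), t, tf.zeros_like(t))
  return tf.clip_by_value(t, -clip_value, clip_value)

labels = tf.constant([0.0, 1.0, 1.0])
logits = tf.constant([-2.0, float("inf"), 3.0])
loss = tf.nn.sigmoid_cross_entropy_with_logits(
    labels=labels, logits=ensure_finite(logits))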
def testGradient(self):
  sizes = [4, 2]
  with self.test_session():
    logits, targets, _ = self._Inputs(sizes=sizes)
    loss = nn_impl.sigmoid_cross_entropy_with_logits(labels=targets,
                                                     logits=logits)
    err = gradient_checker.compute_gradient_error(logits, sizes, loss, sizes)
  print("logistic loss gradient err = ", err)
  self.assertLess(err, 1e-7)
def testGradient(self):
  sizes = [4, 2]
  with self.cached_session():
    logits, targets, _ = self._Inputs(sizes=sizes)
    loss = nn_impl.sigmoid_cross_entropy_with_logits(
        labels=targets, logits=logits)
    err = gradient_checker.compute_gradient_error(logits, sizes, loss, sizes)
  print("logistic loss gradient err = ", err)
  self.assertLess(err, 1e-7)
def testLogisticOutput(self):
  for use_gpu in [True, False]:
    for dtype in [dtypes.float32, dtypes.float16]:
      with self.cached_session(use_gpu=use_gpu):
        logits, targets, losses = self._Inputs(dtype=dtype)
        loss = nn_impl.sigmoid_cross_entropy_with_logits(
            labels=targets, logits=logits)
        np_loss = np.array(losses).astype(np.float32)
        tf_loss = self.evaluate(loss)
      self.assertAllClose(np_loss, tf_loss, atol=0.001)
def testLogisticOutputMultiDim(self):
  for use_gpu in [True, False]:
    for dtype in [dtypes.float32, dtypes.float16]:
      with self.test_session(use_gpu=use_gpu):
        logits, targets, losses = self._Inputs(dtype=dtype, sizes=[2, 2, 2])
        loss = nn_impl.sigmoid_cross_entropy_with_logits(
            labels=targets, logits=logits)
        np_loss = np.array(losses).astype(np.float32)
        tf_loss = loss.eval()
      self.assertAllClose(np_loss, tf_loss, atol=0.001)
def elementwise_loss(labels, logits, mask):
  return sigmoid_cross_entropy_with_logits(labels=labels, logits=logits) * mask
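# For example, with a 0/1 mask the loss at padded positions is zeroed out and
# can be excluded from the average. A small sketch, assuming elementwise_loss
# and its sigmoid_cross_entropy_with_logits import are in scope:
import tensorflow as tf

labels = tf.constant([[1.0, 0.0, 0.0]])
logits = tf.constant([[2.0, -1.0, 5.0]])
mask = tf.constant([[1.0, 1.0, 0.0]])  # third position is padding

per_elem = elementwise_loss(labels, logits, mask)          # padded entry -> 0.0
mean_loss = tf.reduce_sum(per_elem) / tf.reduce_sum(mask)  # average over real entries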
def testConstructionNamed(self):
  with self.cached_session():
    logits, targets, _ = self._Inputs()
    loss = nn_impl.sigmoid_cross_entropy_with_logits(
        labels=targets, logits=logits, name="mylogistic")
  self.assertEqual("mylogistic", loss.op.name)
def testShapeError(self):
  with self.assertRaisesRegexp(ValueError, "must have the same shape"):
    nn_impl.sigmoid_cross_entropy_with_logits(labels=[1, 2, 3],
                                              logits=[[2, 1]])
def build_distributed_graph():
  batch_size = 4
  shape_0 = [batch_size, 5]
  shape_1 = [batch_size, 6]
  maxval = int(0x7FFF)
  server0 = server_lib.Server.create_local_server()
  server1 = server_lib.Server.create_local_server()
  cluster_def = cluster_pb2.ClusterDef()
  job = cluster_def.job.add()
  job.name = 'worker'
  job.tasks[0] = server0.target[len('grpc://'):]
  job.tasks[1] = server1.target[len('grpc://'):]

  config = config_pb2.ConfigProto(
      cluster_def=cluster_def,
      experimental=config_pb2.ConfigProto.Experimental(
          share_session_state_in_clusterspec_propagation=True,
      ),
  )
  config.allow_soft_placement = False

  with ops.device('/job:worker/task:0'):
    feat_0 = random_ops.random_uniform(shape_0, maxval=maxval,
                                       dtype=dtypes.int64)
    feat_0 = array_ops.reshape(feat_0, (-1,))
    feat_1 = random_ops.random_uniform(shape_1, maxval=maxval,
                                       dtype=dtypes.int64)
    feat_1 = array_ops.reshape(feat_1, (-1,))
    var_0 = deo.get_variable(
        name='sp_var_0',
        devices=['/job:worker/task:1'],
        initializer=init_ops.random_normal_initializer(0, 0.005),
    )
    var_1 = deo.get_variable(
        name='sp_var_1',
        devices=['/job:worker/task:1'],
        initializer=init_ops.random_normal_initializer(0, 0.005),
    )
    var_list = [var_0, var_1]
    _, tw_0 = deo.embedding_lookup(
        params=var_0,
        ids=feat_0,
        name='sp_emb_0',
        return_trainable=True,
    )
    _, tw_1 = deo.embedding_lookup(
        params=var_1,
        ids=feat_1,
        name='sp_emb_1',
        return_trainable=True,
    )
    collapse_0 = array_ops.reshape(tw_0, (batch_size, -1))
    collapse_1 = array_ops.reshape(tw_1, (batch_size, -1))
    logits_0 = math_ops.reduce_sum(collapse_0, axis=1)
    logits_1 = math_ops.reduce_sum(collapse_1, axis=1)
    logits = math_ops.add(logits_0, logits_1)
    labels = array_ops.zeros((batch_size,), dtype=dtypes.float32)
    loss = math_ops.reduce_mean(
        nn_impl.sigmoid_cross_entropy_with_logits(
            logits=logits,
            labels=labels,
        ))
  optimizers = get_multiple_optimizers()
  return server0, server1, config, var_list, optimizers, loss
def common_run_context(self, var_list, opt_list, name):
  save_dir = os.path.join(self.get_temp_dir(), 'save_restore')
  save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), 'restrict')
  batch_size = 2
  sample_length = 3
  emb_domain_list = list()
  tws = list()

  for _v in var_list:
    ids = random_ops.random_uniform((batch_size, sample_length),
                                    maxval=1000000,
                                    dtype=_v.key_dtype)
    ids = array_ops.reshape(ids, (-1,))
    _, tw = deo.embedding_lookup(_v, ids, return_trainable=True)
    tws.append(tw)
    _collapse = array_ops.reshape(tw, (batch_size, -1))
    _logits = math_ops.reduce_sum(_collapse, axis=1)
    _logits = math_ops.cast(_logits, dtypes.float32)
    emb_domain_list.append(_logits)
  logits = math_ops.add_n(emb_domain_list)

  labels = array_ops.zeros((batch_size,), dtype=dtypes.float32)
  loss = math_ops.reduce_mean(
      nn_impl.sigmoid_cross_entropy_with_logits(
          logits=logits,
          labels=labels,
      ))

  _train_ops = list()
  for _opt in opt_list:
    _train_ops.append(_opt.minimize(loss))
  train_op = control_flow_ops.group(_train_ops)

  restrictor = dvr.VariableRestrictor(var_list=var_list,
                                      optimizer_list=opt_list)
  policies = list(itertools.chain(*restrictor.policy_group.values()))
  tstp_vars = [policy.tstp_var for policy in policies]
  slot_vars = list()
  for tw in tws:
    for opt in opt_list:
      slot_vars += select_slot_vars(tw, opt)

  update_op = restrictor.update()
  threshold = int(batch_size * sample_length * 1.5)
  factor = 1.2
  restrict_op = restrictor.restrict(threshold=threshold, factor=factor)
  saver = saver_lib.Saver()

  with self.session(config=default_config,
                    use_gpu=test_util.is_gpu_available()) as sess:
    self.evaluate(variables.global_variables_initializer())
    n, MAX_ITER = 0, 1000
    while n < MAX_ITER:
      sess.run([train_op, update_op])
      n += 1  # advance the counter so MAX_ITER actually bounds the loop
      if all(
          self.evaluate(var.size()) > threshold * factor
          for var in var_list):
        break
    rt_save_path = saver.save(sess, save_path)
    self.assertAllEqual(rt_save_path, save_path)
    sess.close()

  with self.session(config=default_config,
                    use_gpu=test_util.is_gpu_available()) as sess:
    self.evaluate(variables.global_variables_initializer())
    saver.restore(sess, save_path)
    s1 = self.evaluate([var.size() for var in var_list])
    s2 = self.evaluate([tv.size() for tv in tstp_vars])
    s3 = self.evaluate([sv.size() for sv in slot_vars])
    self.assertAllGreater(s1, threshold * factor)
    self.assertAllGreater(s2, threshold * factor)
    if s3:
      self.assertAllGreater(s3, threshold * factor)

    saver.save(sess, save_path)
    sess.run(restrict_op)
    s1 = self.evaluate([var.size() for var in var_list])
    s2 = self.evaluate([tv.size() for tv in tstp_vars])
    s3 = self.evaluate([sv.size() for sv in slot_vars])
    self.assertAllLess(s1, threshold * factor + 1)
    self.assertAllLess(s2, threshold * factor + 1)
    if s3:
      self.assertAllLess(s3, threshold * factor + 1)
    sess.close()
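# For reference, the bounds in these assertions come from
# threshold = int(2 * 3 * 1.5) = 9 and factor = 1.2: training loops until
# every variable holds more than 9 * 1.2 = 10.8 keys (so at least 11), and
# after restrict() every tracked size must be below 10.8 + 1 = 11.8
# (so at most 11).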
def common_run_context(self, var_list, opt_list, name):
  batch_size = 2
  sample_length = 3
  emb_domain_list = list()
  tws = list()

  cluster = ps_worker_cluster(ps_num=2)
  ps_servers, worker_servers, cluster_def = cluster
  config = config_pb2.ConfigProto(
      cluster_def=cluster_def,
      experimental=config_pb2.ConfigProto.Experimental(
          share_session_state_in_clusterspec_propagation=True,
      ),
      allow_soft_placement=False,
      inter_op_parallelism_threads=2,
      intra_op_parallelism_threads=2,
      gpu_options=config_pb2.GPUOptions(allow_growth=True),
  )

  dev_placement = device_setter.replica_device_setter(
      ps_tasks=2,
      ps_device='/job:ps',
      worker_device='/job:worker',
      cluster=cluster_def,
  )

  with ops.device(dev_placement):
    shared_var_0 = deo.get_variable('distributed_sp_var_0',
                                    initializer=0.0,
                                    devices=['/job:worker/task:0'],
                                    dim=8)
    shared_var_1 = deo.get_variable('distributed_sp_var_1',
                                    initializer=0.0,
                                    devices=['/job:worker/task:0'],
                                    dim=4)
    opt_list = get_multiple_optimizers()
    distributed_var_list = [shared_var_0, shared_var_1]

    for _v in distributed_var_list:
      ids = random_ops.random_uniform((batch_size, sample_length),
                                      maxval=1000000,
                                      dtype=_v.key_dtype)
      ids = array_ops.reshape(ids, (-1,))
      _, tw = deo.embedding_lookup(_v, ids, return_trainable=True)
      tws.append(tw)
      _collapse = array_ops.reshape(tw, (batch_size, -1))
      _logits = math_ops.reduce_sum(_collapse, axis=1)
      _logits = math_ops.cast(_logits, dtypes.float32)
      emb_domain_list.append(_logits)
    logits = math_ops.add_n(emb_domain_list)

    labels = array_ops.zeros((batch_size,), dtype=dtypes.float32)
    loss = math_ops.reduce_mean(
        nn_impl.sigmoid_cross_entropy_with_logits(
            logits=logits,
            labels=labels,
        ))

    _train_ops = list()
    for _opt in opt_list:
      _train_ops.append(_opt.minimize(loss))
    train_op = control_flow_ops.group(_train_ops)

    restrictor = dvr.VariableRestrictor(var_list=distributed_var_list,
                                        optimizer_list=opt_list)
    update_op = restrictor.update()
    threshold = int(batch_size * sample_length * 1.5)
    factor = 1.2
    restrict_op = restrictor.restrict(threshold=threshold, factor=factor)

  policies = list(itertools.chain(*restrictor.policy_group.values()))
  tstp_vars = [policy.tstp_var for policy in policies]
  slot_vars = list()
  for tw in tws:
    for opt in opt_list:
      slot_vars += select_slot_vars(tw, opt)

  with session.Session(worker_servers[0].target, config=config) as sess:
    sess.run(variables.global_variables_initializer())
    n, MAX_ITER = 0, 1000
    while n < MAX_ITER:
      sess.run([train_op, update_op])
      n += 1  # advance the counter so MAX_ITER actually bounds the loop
      if all(
          sess.run(var.size()) > threshold * factor
          for var in distributed_var_list):
        break

    s1 = sess.run([var.size() for var in distributed_var_list])
    s2 = sess.run([tv.size() for tv in tstp_vars])
    s3 = sess.run([sv.size() for sv in slot_vars])
    self.assertAllGreater(s1, threshold * factor)
    self.assertAllGreater(s2, threshold * factor)
    if s3:
      self.assertAllGreater(s3, threshold * factor)

    sess.run(restrict_op)
    s1 = sess.run([var.size() for var in distributed_var_list])
    s2 = sess.run([tv.size() for tv in tstp_vars])
    s3 = sess.run([sv.size() for sv in slot_vars])
    self.assertAllLess(s1, threshold * factor + 1)
    self.assertAllLess(s2, threshold * factor + 1)
    if s3:
      self.assertAllLess(s3, threshold * factor + 1)
    sess.close()
def TFNCELoss(X, target_word, L):
  tf.compat.v1.disable_eager_execution()
  in_embed = tf.compat.v1.placeholder(tf.float32, shape=X.shape)
  in_bias = tf.compat.v1.placeholder(tf.float32, shape=L.b.flatten().shape)
  in_weights = tf.compat.v1.placeholder(tf.float32, shape=L.W.transpose().shape)
  in_target_word = tf.compat.v1.placeholder(tf.int64)
  in_neg_samples = tf.compat.v1.placeholder(tf.int32)
  in_target_prob = tf.compat.v1.placeholder(tf.float32)
  in_neg_samp_prob = tf.compat.v1.placeholder(tf.float32)

  feed = {
      in_embed: X,
      in_weights: L.W.transpose(),
      in_target_word: target_word,
      in_bias: L.b.flatten(),
      in_neg_samples: L.derived_variables["noise_samples"][0],
      in_target_prob: L.derived_variables["noise_samples"][1],
      in_neg_samp_prob: L.derived_variables["noise_samples"][2],
  }

  nce_unreduced = tf.nn.nce_loss(
      weights=in_weights,
      biases=in_bias,
      labels=in_target_word,
      inputs=in_embed,
      sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob),
      num_sampled=L.num_negative_samples,
      num_classes=L.n_classes,
  )
  loss = tf.reduce_sum(nce_unreduced)

  dLdW = tf.gradients(loss, [in_weights])[0]
  dLdb = tf.gradients(loss, [in_bias])[0]
  dLdX = tf.gradients(loss, [in_embed])[0]

  # _compute_sampled_logits is TensorFlow's internal helper (from
  # tensorflow.python.ops.nn_impl); it is re-used here to expose the
  # intermediate sampled logits/labels that nce_loss builds internally.
  sampled_logits, sampled_labels = _compute_sampled_logits(
      weights=in_weights,
      biases=in_bias,
      labels=in_target_word,
      inputs=in_embed,
      sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob),
      num_sampled=L.num_negative_samples,
      num_classes=L.n_classes,
      num_true=1,
      subtract_log_q=True,
  )
  sampled_losses = sigmoid_cross_entropy_with_logits(labels=sampled_labels,
                                                     logits=sampled_logits)

  with tf.compat.v1.Session() as session:
    session.run(tf.compat.v1.global_variables_initializer())
    (
        _final_loss,
        _nce_unreduced,
        _dLdW,
        _dLdb,
        _dLdX,
        _sampled_logits,
        _sampled_labels,
        _sampled_losses,
    ) = session.run(
        [
            loss,
            nce_unreduced,
            dLdW,
            dLdb,
            dLdX,
            sampled_logits,
            sampled_labels,
            sampled_losses,
        ],
        feed_dict=feed,
    )
  tf.compat.v1.reset_default_graph()

  return {
      "final_loss": _final_loss,
      "nce_unreduced": _nce_unreduced,
      "dLdW": _dLdW,
      "dLdb": _dLdb,
      "dLdX": _dLdX,
      "out_logits": _sampled_logits,
      "out_labels": _sampled_labels,
      "sampled_loss": _sampled_losses,
  }
def cost_function(labels, logits, num_classes):
  # num_classes is unused in this reduction; it is kept for interface parity.
  sampled_losses = sigmoid_cross_entropy_with_logits(labels=labels,
                                                     logits=logits,
                                                     name="sampled_losses")
  return _sum_rows(sampled_losses)
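# _sum_rows is TensorFlow's internal helper that reduces the
# [batch_size, num_true + num_sampled] matrix of element-wise losses to one
# loss per example; pairing sigmoid_cross_entropy_with_logits with a row-sum
# is exactly the final step of tf.nn.nce_loss. An equivalent NumPy sketch
# (illustrative only):
import numpy as np

def nce_per_example(sampled_logits, sampled_labels):
  # Numerically stable element-wise sigmoid cross entropy:
  # max(x, 0) - x * z + log(1 + exp(-|x|))
  x, z = sampled_logits, sampled_labels
  losses = np.maximum(x, 0) - x * z + np.log1p(np.exp(-np.abs(x)))
  # Row-sum over the num_true + num_sampled columns, like _sum_rows.
  return losses.sum(axis=1)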