def testPartitioners(self): partitioners = { "gamma": tf.fixed_size_partitioner(num_shards=2), "beta": tf.fixed_size_partitioner(num_shards=2), } inputs = tf.placeholder(tf.float32, shape=[None, 10]) ln = snt.LayerNorm(partitioners=partitioners) self.assertEqual(ln.partitioners, partitioners) ln(inputs) self.assertEqual(type(ln.gamma), variables.PartitionedVariable) self.assertEqual(type(ln.beta), variables.PartitionedVariable)
def testRecoverPartitionedVariableMap(self):
  with tf.variable_scope("test"):
    partitioner = tf.fixed_size_partitioner(3)
    tf.get_variable(
        initializer=tf.ones([11, 5]),
        name="partitioned_variable",
        partitioner=partitioner)
    tf.get_variable(
        initializer=tf.ones([11, 5]), name="normal_variable")

  all_vars = tf.global_variables()
  all_vars_dict = {var.op.name[5:]: var for var in all_vars}
  self.assertEqual(set(all_vars_dict.keys()), set([
      "partitioned_variable/part_0",
      "partitioned_variable/part_1",
      "partitioned_variable/part_2",
      "normal_variable"]))
  self.assertEqual(len(all_vars_dict), 4)

  var_map = native_module.recover_partitioned_variable_map(all_vars_dict)
  self.assertEqual(set(var_map.keys()),
                   set(["partitioned_variable", "normal_variable"]))
  # Verify order of the partitioned variable list
  self.assertAllEqual(
      [v.op.name for v in var_map["partitioned_variable"]],
      [
          "test/partitioned_variable/part_0",
          "test/partitioned_variable/part_1",
          "test/partitioned_variable/part_2",
      ])
def TestSuccess(self, connectivity, partitioning, fused, use_resource):
  params = {
      'trainable': True,
      'normalizer_fn': layers.batch_norm,
      'normalizer_params': {
          'scale': True,
          'fused': fused
      }
  }

  partitioner = tf.fixed_size_partitioner(2) if partitioning else None
  with tf.variable_scope(
      tf.get_variable_scope(),
      partitioner=partitioner,
      use_resource=use_resource):
    with tf.contrib.framework.arg_scope(
        [layers.conv2d, layers.separable_conv2d], **params):
      build_model()

  sess = tf.Session()
  saver = tf.train.Saver()
  saver.restore(sess, os.path.join(FLAGS.test_tmpdir, CKPT_FILE_NAME))

  mapper = self.createMapper(connectivity)
  conv = get_op('conv1/Conv2D')
  sep_conv = get_op('sep_conv/separable_conv2d')
  with sess.as_default():
    self.assertAllClose(CONV1_GAMMA, mapper.get_gamma(conv).eval())
    self.assertAllClose(SEP_CONV_GAMMA, mapper.get_gamma(sep_conv).eval())
def testNoBatchNorm(self, connectivity, partitioning):
  partitioner = tf.fixed_size_partitioner(2) if partitioning else None
  with tf.variable_scope(tf.get_variable_scope(), partitioner=partitioner):
    build_model()

  mapper = self.createMapper(connectivity)
  conv = get_op('conv1/Conv2D')
  self.assertEqual(None, mapper.get_gamma(conv))
def testPartitioners(self):
  if tf.executing_eagerly():
    self.skipTest("Partitioned variables are not supported in eager mode.")

  inputs = tf.ones(dtype=tf.float32, shape=[self.batch_size, self.in_size])
  prev_state = tf.ones(
      dtype=tf.float32, shape=[self.batch_size, self.hidden_size])

  with self.assertRaisesRegexp(KeyError, "Invalid partitioner keys.*"):
    snt.VanillaRNN(name="rnn",
                   hidden_size=self.hidden_size,
                   partitioners={"invalid": None})

  err = "Partitioner for 'w' is not a callable function"
  with self.assertRaisesRegexp(TypeError, err):
    snt.VanillaRNN(name="rnn",
                   hidden_size=self.hidden_size,
                   partitioners={"in_to_hidden": {"w": tf.zeros([10, 10])}})

  # Nested partitioners.
  valid_partitioners = {
      "in_to_hidden": {
          "w": tf.fixed_size_partitioner(num_shards=2),
          "b": tf.fixed_size_partitioner(num_shards=2),
      },
      "hidden_to_hidden": {
          "w": tf.fixed_size_partitioner(num_shards=2),
          "b": tf.fixed_size_partitioner(num_shards=2),
      }
  }

  vanilla_rnn = snt.VanillaRNN(name="rnn",
                               hidden_size=self.hidden_size,
                               partitioners=valid_partitioners)
  vanilla_rnn(inputs, prev_state)

  self.assertEqual(type(vanilla_rnn.in_to_hidden_linear.w),
                   variables.PartitionedVariable)
  self.assertEqual(type(vanilla_rnn.in_to_hidden_linear.b),
                   variables.PartitionedVariable)
  self.assertEqual(type(vanilla_rnn.hidden_to_hidden_linear.w),
                   variables.PartitionedVariable)
  self.assertEqual(type(vanilla_rnn.hidden_to_hidden_linear.b),
                   variables.PartitionedVariable)
def testPartitioners(self, offset, scale):
  partitioners = {}

  if scale:
    partitioners["gamma"] = tf.fixed_size_partitioner(num_shards=2)
  if offset:
    partitioners["beta"] = tf.fixed_size_partitioner(num_shards=2)

  inputs_shape = [10, 10]
  inputs = tf.placeholder(tf.float32, shape=[None] + inputs_shape)
  bn = snt.BatchNorm(offset=offset, scale=scale, partitioners=partitioners)
  self.assertEqual(bn.partitioners, partitioners)
  bn(inputs, is_training=True)

  if scale:
    self.assertEqual(type(bn.gamma), variables.PartitionedVariable)
  if offset:
    self.assertEqual(type(bn.beta), variables.PartitionedVariable)
def testFixedSizePartitioner(self):
  with self.test_session():
    partitioner = tf.fixed_size_partitioner(5, axis=0)
    with tf.variable_scope("root", partitioner=partitioner):
      v0 = tf.get_variable("v0", dtype=tf.float32, shape=(10, 10))
      v0_list = v0._get_variable_list()
      v0_part = v0._get_partitions()
      self.assertEqual(len(v0_list), 5)
      self.assertAllEqual(v0_part, (5, 1))
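# For reference, a minimal standalone sketch (assuming TensorFlow 1.x) of what
# the test above exercises: tf.fixed_size_partitioner splits a variable into a
# fixed number of shards along one axis, and each shard appears in the graph as
# a separate "<name>/part_k" variable.
import tensorflow as tf

partitioner = tf.fixed_size_partitioner(num_shards=5, axis=0)
with tf.variable_scope("root", partitioner=partitioner):
  v0 = tf.get_variable("v0", dtype=tf.float32, shape=(10, 10))

# The shards are ordinary variables named root/v0/part_0 ... root/v0/part_4,
# each holding a [2, 10] slice of the full [10, 10] tensor.
print([v.op.name for v in tf.global_variables()])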
def module_fn():
  """A module summing one normal and one partitioned variable."""
  partitioner = tf.fixed_size_partitioner(partitions)
  var_1 = tf.get_variable(
      initializer=tf.ones(shape),
      name="partitioned_variable",
      partitioner=partitioner)
  var_2 = tf.get_variable(
      initializer=tf.ones(shape), name="normal_variable")
  hub.add_signature(outputs=var_1 + var_2)
def module_with_variables():
  tf.get_variable(
      name="weights",
      shape=[3],
      initializer=tf.zeros_initializer())
  tf.get_variable(
      name="partition",
      shape=[4],
      initializer=tf.zeros_initializer(),
      partitioner=tf.fixed_size_partitioner(3))
def module_with_variables():
  tf.get_variable(
      name="weights",
      shape=[3],
      initializer=tf.zeros_initializer())
  tf.get_variable(
      name="partition",
      shape=[4],
      initializer=tf.zeros_initializer(),
      partitioner=tf.fixed_size_partitioner(3))
  hub.add_signature(outputs=tf.constant(1.0))
def testPartitioners(self): partitioners = { "w": tf.fixed_size_partitioner(num_shards=2), "b": tf.fixed_size_partitioner(num_shards=2), } alex_net = snt.nets.AlexNetMini( partitioners=partitioners, name="alexnet1") input_shape = [alex_net._min_size, alex_net._min_size, 3] inputs = tf.placeholder(tf.float32, shape=[None] + input_shape) alex_net(inputs) for conv_module in alex_net.conv_modules: self.assertEqual(type(conv_module.w), variables.PartitionedVariable) self.assertEqual(type(conv_module.b), variables.PartitionedVariable) for linear_module in alex_net.linear_modules: self.assertEqual(type(linear_module.w), variables.PartitionedVariable) self.assertEqual(type(linear_module.b), variables.PartitionedVariable)
def testPartitioners(self, offset, scale):
  partitioners = {}

  if scale:
    partitioners["gamma"] = tf.fixed_size_partitioner(num_shards=2)
  if offset:
    partitioners["beta"] = tf.fixed_size_partitioner(num_shards=2)

  inputs_shape = [10, 10]
  inputs = tf.placeholder(tf.float32, shape=[None] + inputs_shape)
  bn = snt.BatchNormV2(
      offset=offset, scale=scale, partitioners=partitioners)
  self.assertEqual(bn.partitioners, partitioners)
  bn(inputs, is_training=True)

  if scale:
    self.assertLen(tf.global_variables("batch_norm/gamma"), 2)
  if offset:
    self.assertLen(tf.global_variables("batch_norm/beta"), 2)
def setUp(self):
  super(MLPTest, self).setUp()
  self.output_sizes = [11, 13, 17]
  self.batch_size = 5
  self.input_size = 7
  self.module_name = "mlp"
  self.initializers = {
      "w": tf.truncated_normal_initializer(stddev=1.0),
  }
  self.regularizers = {
      "w": tf.contrib.layers.l1_regularizer(scale=0.1),
  }
  self.partitioners = {
      "w": tf.fixed_size_partitioner(num_shards=2),
  }
def test_return_all_variables_from_checkpoint_with_partition(self):
  with tf.Graph().as_default():
    partitioner = tf.fixed_size_partitioner(2)
    variables = [
        tf.get_variable(
            name='weights', shape=(2, 2), partitioner=partitioner),
        tf.Variable([1.0, 2.0], name='biases')
    ]
    checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt')
    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver(variables)
    with self.test_session() as sess:
      sess.run(init_op)
      saver.save(sess, checkpoint_path)
    out_variables = variables_helper.get_variables_available_in_checkpoint(
        variables, checkpoint_path)
  self.assertItemsEqual(out_variables, variables)
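# A hedged aside on why the test above works: tf.train.Saver records the
# shards of a partitioned variable under the single unpartitioned name, so the
# checkpoint key is "weights", not "weights/part_0". A minimal sketch (TF1.x
# assumed; the /tmp path is illustrative):
import tensorflow as tf

w = tf.get_variable("weights", shape=(2, 2),
                    partitioner=tf.fixed_size_partitioner(2))
saver = tf.train.Saver()
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  path = saver.save(sess, "/tmp/partition_demo/model.ckpt")

# Expected to list ("weights", [2, 2]) as one logical entry.
print(tf.train.list_variables(path))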
def testInvalidDicts(self):
  batch_size = 3
  # Mistake seen in the wild - https://github.com/deepmind/sonnet/issues/74
  # Should actually be {'hidden_to_hidden': {'w': some_initializers(), ...}}
  initializers = {"hidden_to_hidden": tf.truncated_normal_initializer(0, 1)}
  vanilla_rnn = snt.VanillaRNN(hidden_size=23, initializers=initializers)
  with self.assertRaisesRegexp(TypeError, "Expected a dict"):
    vanilla_rnn(tf.zeros([batch_size, 4], dtype=tf.float32),
                vanilla_rnn.zero_state(batch_size, dtype=tf.float32))

  # Error: should be a dict mapping strings to partitioners/regularizers.
  partitioners = tf.fixed_size_partitioner(num_shards=16)
  with self.assertRaisesRegexp(TypeError, "Expected a dict"):
    snt.LSTM(hidden_size=42, partitioners=partitioners)

  regularizers = tf.contrib.layers.l1_regularizer(scale=0.5)
  with self.assertRaisesRegexp(TypeError, "Expected a dict"):
    snt.GRU(hidden_size=108, regularizers=regularizers)
def testConcatOpGetRegularizer(self, use_batch_norm, use_partitioner):
  sc = self._batch_norm_scope() if use_batch_norm else []
  partitioner = tf.fixed_size_partitioner(2) if use_partitioner else None
  with tf.contrib.framework.arg_scope(sc):
    with tf.variable_scope(tf.get_variable_scope(), partitioner=partitioner):
      final_op = op_regularizer_stub.build_model()

  op_reg_manager = orm.OpRegularizerManager(
      [final_op], op_regularizer_stub.MOCK_REG_DICT)
  expected_alive = op_regularizer_stub.expected_alive()
  expected = np.logical_or(expected_alive['conv4'],
                           expected_alive['concat'])
  with self.test_session():
    conv_reg = op_reg_manager.get_regularizer(_get_op('conv4/Conv2D'))
    self.assertAllEqual(expected, conv_reg.alive_vector.eval())

    relu_reg = op_reg_manager.get_regularizer(_get_op('conv4/Relu'))
    self.assertAllEqual(expected, relu_reg.alive_vector.eval())
def testSimpleOpGetRegularizer(self, use_batch_norm, use_partitioner, scope):
  # Tests the alive pattern of the conv and relu ops.
  # use_batch_norm: A Boolean. Indicates if batch norm should be used.
  # use_partitioner: A Boolean. Indicates if a fixed_size_partitioner should
  #   be used.
  # scope: A String with the scope to test.
  sc = self._batch_norm_scope() if use_batch_norm else []
  partitioner = tf.fixed_size_partitioner(2) if use_partitioner else None
  with tf.contrib.framework.arg_scope(sc):
    with tf.variable_scope(tf.get_variable_scope(), partitioner=partitioner):
      final_op = op_regularizer_stub.build_model()

  op_reg_manager = orm.OpRegularizerManager(
      [final_op], op_regularizer_stub.MOCK_REG_DICT)
  expected_alive = op_regularizer_stub.expected_alive()
  with self.test_session():
    conv_reg = op_reg_manager.get_regularizer(_get_op(scope + '/Conv2D'))
    self.assertAllEqual(expected_alive[scope], conv_reg.alive_vector.eval())

    relu_reg = op_reg_manager.get_regularizer(_get_op(scope + '/Relu'))
    self.assertAllEqual(expected_alive[scope], relu_reg.alive_vector.eval())
def build_embedding(params, num_shards):
  feature_conf = params['feature_conf']
  feature_list_conf = params['feature_list']
  feature_list = [
      feature_list_conf[key]
      for key in sorted(feature_list_conf, reverse=False)
  ]
  model_conf = params['model_conf']
  vocabulary_conf = params['vocabulary_conf']
  embed_dim = model_conf['embed_dim']
  first_order = int(model_conf['first_order'])
  partitioner = tf.fixed_size_partitioner(
      num_shards) if num_shards > 1 else None

  table = OrderedDict()
  sparse = []
  deep = OrderedDict()
  multi = OrderedDict()
  model_struct = defaultdict(list)
  numeric = []
  dense = []
  dense_tag = []
  wide_dim, deep_dim, deep_num, cate_num, con_num, all_num, con_deep_num, con_bias_num = 0, 0, 0, 0, 0, 0, 0, 0

  for feature in feature_list:
    if not feature in feature_conf:
      continue
    conf = feature_conf[feature]
    if conf['ignore']:
      continue
    f_type, f_tran, f_param = conf['type'], conf['transform'], conf['parameter']
    if 'group' in conf:
      for struct in conf['group']:
        model_struct[struct].append(feature)
    f_multi = conf['multi'] if 'multi' in conf else {
        'num': 1, 'same': True, 'combiner': 'none'
    }
    feature_name = f_param['name'] if 'name' in f_param else feature
    feature_embed_dim = f_param['embed_dim'] if 'embed_dim' in f_param else embed_dim
    feature_scope = f_param['scope'] if 'scope' in f_param else 'embedding'
    with tf.variable_scope(feature_scope, reuse=tf.AUTO_REUSE,
                           partitioner=partitioner) as scope:
      if f_type == 'category':
        f_num, combiner = f_multi['num'], f_multi['combiner']
        default_value = f_param['default'] if 'default' in f_param else 0
        if combiner != 'none' and f_num >= 1:
          f_num = 1
        if f_tran == 'vocabulary_list':
          vocabulary = vocabulary_conf[feature]
          vocabulary = ['DEFAULT'] + vocabulary
          table.update({
              feature: lookup.index_table_from_tensor(
                  mapping=tf.constant(vocabulary),
                  default_value=default_value)
          })
          f_dim = len(vocabulary) * f_num
          f_size = len(vocabulary)
          fill_value = f_param['fill'] if 'fill' in f_param else ''  # 'DEFAULT'
        elif f_tran == 'tabled':
          f_dim = f_param['size'] * f_num
          f_size = f_param['size']
          fill_value = f_param['fill'] if 'fill' in f_param else ''  # '0'
        else:
          assert False, 'only support category features with vocabulary or tabled'
        if 'onehot' in conf['style']:
          sparse.append(feature)
          if f_num >= 1:
            wide_dim += f_dim * f_num
          else:
            wide_dim += f_dim * (-f_num)
        if 'embedding' in conf['style']:
          deep.update({
              feature: tf.get_variable(
                  initializer=tf.random.normal(
                      [f_size, feature_embed_dim + first_order], 0.0, 0.1),
                  name='{}_embedding'.format(feature_name))
          })
          if f_num >= 1:
            deep_dim += (feature_embed_dim + first_order) * f_num
            deep_num += f_num
          else:
            deep_dim += (feature_embed_dim + first_order) * (-f_num)
            deep_num += -f_num
            f_multi['same'] = False
          dense_tag += [0] * abs(f_num)
        cate_num += abs(f_num)
        all_num += abs(f_num)
        tail_value = f_param['tail'] if 'tail' in f_param else f_size
      elif f_type == 'numeric':
        f_size = 1
        f_num = f_multi['num']
        numeric.append(feature)
        if 'value' in conf['style']:
          dense.append(feature)
        if 'embedding' in conf['style']:
          if f_num >= 1:
            if f_multi['combiner'] == 'none':
              if f_multi['same']:
                deep.update({
                    feature: tf.get_variable(
                        initializer=tf.random.normal(
                            [1, embed_dim + first_order], 0.0, 0.1),
                        name='{}'.format(feature_name))
                })
              else:
                deep.update({
                    feature: tf.get_variable(
                        initializer=tf.random.normal(
                            [f_num, embed_dim + first_order], 0.0, 0.1),
                        name='{}'.format(feature_name))
                })
              con_num += f_num
              all_num += f_num
              con_deep_num += f_num
              deep_num += f_num
              deep_dim += (embed_dim + first_order) * f_num
              dense_tag += [1] * f_num
            else:
              con_num += 1
              all_num += 1
              if f_multi['same']:
                deep.update({
                    feature: tf.get_variable(
                        initializer=tf.random.normal(
                            [1, embed_dim + first_order], 0.0, 0.1),
                        name='{}'.format(feature_name))
                })
              else:
                deep.update({
                    feature: tf.get_variable(
                        initializer=tf.random.normal(
                            [f_num, embed_dim + first_order], 0.0, 0.1),
                        name='{}'.format(feature_name))
                })
              con_deep_num += 1
              deep_num += 1
              deep_dim += embed_dim + first_order
              dense_tag.append(1)
          else:
            if f_multi['combiner'] == 'none':
              f_num = -f_num
              if f_multi['same']:
                deep.update({
                    feature: tf.get_variable(
                        initializer=tf.random.normal(
                            [1, embed_dim + first_order], 0.0, 0.1),
                        name='{}_embedding'.format(feature_name))
                })
              else:
                deep.update({
                    feature: tf.get_variable(
                        initializer=tf.random.normal(
                            [f_num, embed_dim + first_order], 0.0, 0.1),
                        name='{}_embedding'.format(feature))
                })
              con_num += f_num
              all_num += f_num
              con_deep_num += f_num
              deep_num += f_num
              deep_dim += (embed_dim + first_order) * f_num
              dense_tag += [1] * f_num
            else:
              f_num -= f_num
              con_num += 1
              all_num += 1
              if f_multi['same']:
                deep.update({
                    feature: tf.get_variable(
                        initializer=tf.random.normal(
                            [1, embed_dim + first_order], 0.0, 0.1),
                        name='{}_embedding'.format(feature))
                })
              else:
                deep.update({
                    feature: tf.get_variable(
                        initializer=tf.random.normal(
                            [f_num, embed_dim + first_order], 0.0, 0.1),
                        name='{}_embedding'.format(feature))
                })
              con_deep_num += 1
              deep_num += 1
              deep_dim += embed_dim + first_order
              dense_tag.append(1)
        f_dim = -1
        default_value = 0
        fill_value = f_param['fill'] if 'fill' in f_param else ''  # '0'
        tail_value = f_param['tail'] if 'tail' in f_param else 0
      else:
        assert False, "can't handle this type now: {}".format(f_type)
      multi.update({
          feature: (f_type, f_multi['num'], f_size, f_multi['combiner'],
                    f_multi['same'], default_value, fill_value, tail_value)
      })

  dims = {
      'deep_num': deep_num,
      'deep_dim': deep_dim,
      'wide_dim': wide_dim,
      'con_num': con_num,
      'cate_num': cate_num,
      's_embed_size': embed_dim,
      'cate_deep_num': deep_num - con_deep_num,
      'd_embed_size': embed_dim,
      'all_num': all_num,
      'dense_tag': dense_tag,
      'con_deep_num': con_deep_num
  }
  columns = {
      'table': table,
      'sparse': sparse,
      'deep': deep,
      'dense': dense,
      'numeric': numeric,
      'dense_tag': dense_tag,
      'multi': multi
  }
  # dense_tag = tf.constant(dense_tag)
  return model_struct, columns, dims
def train():
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  print('PS hosts are: %s' % ps_hosts)
  print('Worker hosts are: %s' % worker_hosts)

  server = tf.train.Server({'ps': ps_hosts, 'worker': worker_hosts},
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_id)
  if FLAGS.job_name == 'ps':
    server.join()

  is_chief = (FLAGS.task_id == 0)
  if is_chief:
    if tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

  device_setter = tf.train.replica_device_setter(ps_tasks=len(ps_hosts))
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
    partitioner = tf.fixed_size_partitioner(len(ps_hosts), axis=0)
    with tf.variable_scope('partitioned_space', partitioner=partitioner):
      with tf.device(device_setter):
        global_step = tf.Variable(0, trainable=False)
        decay_steps = 50000 * 350.0 / FLAGS.batch_size
        batch_size = tf.placeholder(dtype=tf.int32, shape=(),
                                    name='batch_size')
        images, labels = cifar10.distorted_inputs(batch_size)
        inputs = tf.reshape(images, [-1, _HEIGHT, _WIDTH, _DEPTH])
        labels = tf.one_hot(labels, 10, 1, 0)
        # network_fn = nets_factory.get_network_fn('alexnet_v2', num_classes=10)
        # (logits, _) = network_fn(inputs)
        # with slim.arg_scope(alexnet.alexnet_v2_arg_scope(weight_decay=0.0)):
        (logits, _) = alexnet.alexnet_v2(inputs, num_classes=10,
                                         is_training=True)
        cross_entropy = tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=labels)
        loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
            [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(
            INITIAL_LEARNING_RATE * len(worker_hosts), global_step,
            decay_steps, LEARNING_RATE_DECAY_FACTOR, staircase=True)
        opt = tf.train.GradientDescentOptimizer(lr)

        # Track the moving averages of all trainable variables.
        exp_moving_averager = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=len(worker_hosts),
            total_num_replicas=len(worker_hosts),
            variable_averages=exp_moving_averager,
            variables_to_average=variables_to_average)

        naive_grads = opt.compute_gradients(loss)
        grads = [(tf.scalar_mul(
            tf.cast(batch_size / FLAGS.batch_size, tf.float32), grad), var)
                 for grad, var in naive_grads]
        apply_gradients_op = opt.apply_gradients(grads,
                                                 global_step=global_step)
        with tf.control_dependencies([apply_gradients_op]):
          train_op = tf.identity(loss, name='train_op')

        chief_queue_runners = [opt.get_chief_queue_runner()]
        init_tokens_op = opt.get_init_tokens_op()

        saver = tf.train.Saver()
        sv = tf.train.Supervisor(
            is_chief=is_chief,
            logdir=FLAGS.train_dir,
            init_op=tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer()),
            summary_op=None,
            global_step=global_step,
            saver=saver,
            recovery_wait_secs=1,
            save_model_secs=60)
        tf.logging.info('%s Supervisor' % datetime.now())

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)
        sess_config.gpu_options.allow_growth = True
        sess = sv.prepare_or_wait_for_session(server.target,
                                              config=sess_config)

        queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        sv.start_queue_runners(sess, queue_runners)
        sv.start_queue_runners(sess, chief_queue_runners)
        sess.run(init_tokens_op)

        # Train CIFAR-10 for a number of steps.
        time0 = time.time()
        batch_size_num = FLAGS.batch_size
        for step in range(FLAGS.max_steps):
          start_time = time.time()
          run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
          run_metadata = tf.RunMetadata()
          num_batches_per_epoch = (
              NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size_num)
          decay_steps_num = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
          _, loss_value, gs = sess.run(
              [train_op, loss, global_step],
              feed_dict={batch_size: batch_size_num},
              options=run_options,
              run_metadata=run_metadata)
          b = time.time()
          if step % 1 == 0:
            duration = time.time() - start_time
            num_examples_per_step = batch_size_num
            examples_per_sec = num_examples_per_step / duration
            sec_per_batch = float(duration)
            format_str = (
                "time: " + str(time.time()) +
                '; %s: step %d (global_step %d), loss = %.2f '
                '(%.1f examples/sec; %.3f sec/batch)')
            tf.logging.info(format_str %
                            (datetime.now(), step, gs, loss_value,
                             examples_per_sec, sec_per_batch))
def model_fn(features, labels, mode, params, config):
  visit_items_index = features["visit_items_index"]  # num * 5
  continuous_features_value = features["continuous_features_value"]  # num * 16
  next_visit_item_index = labels  # num
  keep_prob = params["keep_prob"]
  embedding_size = params["embedding_size"]
  item_num = params["item_num"]
  learning_rate = params["learning_rate"]
  top_k = params["top_k"]

  # Initialize the item embeddings.
  initializer = tf.initializers.random_uniform(
      minval=-0.5 / embedding_size, maxval=0.5 / embedding_size)
  partitioner = tf.fixed_size_partitioner(num_shards=embedding_size)
  item_embedding = tf.get_variable("item_embedding",
                                   [item_num, embedding_size],
                                   tf.float32,
                                   initializer=initializer,
                                   partitioner=partitioner)
  visit_items_embedding = tf.nn.embedding_lookup(
      item_embedding, visit_items_index)  # num * 5 * embedding_size
  visit_items_average_embedding = tf.reduce_mean(
      visit_items_embedding, axis=1)  # num * embedding_size
  input_embedding = tf.concat(
      [visit_items_average_embedding, continuous_features_value],
      1)  # num * (embedding_size + 16)

  kernel_initializer_1 = tf.initializers.random_normal(mean=0.0, stddev=0.1)
  bias_initializer_1 = tf.initializers.random_normal(mean=0.0, stddev=0.1)
  layer_1 = tf.layers.dense(input_embedding, 64, activation=tf.nn.relu,
                            kernel_initializer=kernel_initializer_1,
                            bias_initializer=bias_initializer_1,
                            name="layer_1")
  layer_dropout_1 = tf.nn.dropout(layer_1, keep_prob=keep_prob,
                                  name="layer_dropout_1")

  kernel_initializer_2 = tf.initializers.random_normal(mean=0.0, stddev=0.1)
  bias_initializer_2 = tf.initializers.random_normal(mean=0.0, stddev=0.1)
  layer_2 = tf.layers.dense(layer_dropout_1, 32, activation=tf.nn.relu,
                            kernel_initializer=kernel_initializer_2,
                            bias_initializer=bias_initializer_2,
                            name="layer_2")
  layer_dropout_2 = tf.nn.dropout(layer_2, keep_prob=keep_prob,
                                  name="layer_dropout_2")

  # User vector, num * embedding_size.
  kernel_initializer_3 = tf.initializers.random_normal(mean=0.0, stddev=0.1)
  bias_initializer_3 = tf.initializers.random_normal(mean=0.0, stddev=0.1)
  user_vector = tf.layers.dense(layer_dropout_2, embedding_size,
                                activation=tf.nn.relu,
                                kernel_initializer=kernel_initializer_3,
                                bias_initializer=bias_initializer_3,
                                name="user_vector")

  if mode == tf.estimator.ModeKeys.TRAIN:
    # Training.
    output_embedding = tf.nn.embedding_lookup(
        item_embedding, next_visit_item_index)  # num * embedding_size
    logits = tf.matmul(user_vector, output_embedding,
                       transpose_a=False, transpose_b=True)  # num * num
    yhat = tf.nn.softmax(logits)  # num * num
    cross_entropy = tf.reduce_mean(
        -tf.log(tf.matrix_diag_part(yhat) + 1e-16))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train = optimizer.minimize(cross_entropy,
                               global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=cross_entropy,
                                      train_op=train)

  if mode == tf.estimator.ModeKeys.EVAL:
    # Evaluation.
    output_embedding = tf.nn.embedding_lookup(
        item_embedding, next_visit_item_index)  # num * embedding_size
    logits = tf.matmul(user_vector, output_embedding,
                       transpose_a=False, transpose_b=True)  # num * num
    yhat = tf.nn.softmax(logits)  # num * num
    cross_entropy = tf.reduce_mean(
        -tf.log(tf.matrix_diag_part(yhat) + 1e-16))
    return tf.estimator.EstimatorSpec(mode, loss=cross_entropy)

  if mode == tf.estimator.ModeKeys.PREDICT:
    # Prediction.
    logits_predict = tf.matmul(user_vector, item_embedding,
                               transpose_a=False,
                               transpose_b=True)  # num * item_num
    yhat_predict = tf.nn.softmax(logits_predict)  # num * item_num
    _, indices = tf.nn.top_k(yhat_predict, k=top_k, sorted=True)
    index = tf.identity(indices, name="index")  # num * top_k
    predictions = {
        "user_vector": user_vector,
        "index": index
    }
    export_outputs = {
        "prediction": tf.estimator.export.PredictOutput(predictions)
    }
    return tf.estimator.EstimatorSpec(mode, predictions=predictions,
                                      export_outputs=export_outputs)
def build_inference(self, x, flag="train"): # 设置regularizer,本别对应网络的四个部分 regularizer1 = self.param_dict[ "regulerizer1"] if flag == "train" else None regularizer2 = self.param_dict[ "regulerizer2"] if flag == "train" else None regularizer3 = self.param_dict[ "regulerizer3"] if flag == "train" else None regularizer4 = self.param_dict[ "regulerizer4"] if flag == "train" else None is_train = True if flag == "train" else False # 先获取需要的参数 hash_size = self.param_dict['hash_size'] no_hash = self.param_dict["no_hash"] embed_size = self.param_dict["embed_size"] # 根据配置获取激活函数 act_fn = self.get_activation_func(is_train) # 是否启用mini-batch aware regularization is_mba_reg = self.param_dict["is_mba_reg"] lambda_reg_mba = self.param_dict["lambda_reg_mba"] is_action_mba_reg = self.param_dict["is_action_mba_reg"] # 将输入划分 x_feature = x[:, :-3] x_action_lists = x[:, -3:] # 先将稀疏特征转换成indice x_sparse = [] for i in range(len(hash_size)): if i in no_hash: # 这部分特征本身可以直接作为indice,不需要转化 x_i = tf.string_to_number(x_feature[:, i], tf.int32) x_sparse.append(x_i) else: # 这部分特征可以通过哈希函数来转化成index x_i = tf.string_to_hash_bucket_strong( input=x_feature[:, i], num_buckets=hash_size[i], key=[679362, 964545], name="sparse_feature_{}".format(i)) x_sparse.append(x_i) # 将稀疏数据转换成embedding向量 x_embed = [] w_action_embed = [] x_action = [] indice_sku_cate_brand = [] sku_cate_brand_index = self.param_dict["sku_cate_brand_index"] for i in range(len(embed_size)): if embed_size[i] != -1: with tf.variable_scope("embedding_{}".format(i)): if hash_size[i] <= 500000: weights = self.get_weight_variable( [hash_size[i], embed_size[i]], regularizer1, self.param_dict["initializer_embedding_w"]( [hash_size[i], embed_size[i]])) elif hash_size[i] > 500000 and hash_size[i] <= 5000000: weights = self.get_weight_variable( [hash_size[i], embed_size[i]], regularizer1, self.param_dict["initializer_embedding_w"]( [hash_size[i], embed_size[i]]), partitioner=tf.fixed_size_partitioner(5, 0)) elif hash_size[i] > 5000000 and hash_size[i] <= 10000000: weights = self.get_weight_variable( [hash_size[i], embed_size[i]], regularizer1, self.param_dict["initializer_embedding_w"]( [hash_size[i], embed_size[i]]), partitioner=tf.fixed_size_partitioner(10, 0)) elif hash_size[i] > 10000000 and hash_size[i] <= 15000000: weights = self.get_weight_variable( [hash_size[i], embed_size[i]], regularizer1, self.param_dict["initializer_embedding_w"]( [hash_size[i], embed_size[i]]), partitioner=tf.fixed_size_partitioner(15, 0)) elif hash_size[i] > 15000000 and hash_size[i] <= 20000000: weights = self.get_weight_variable( [hash_size[i], embed_size[i]], regularizer1, self.param_dict["initializer_embedding_w"]( [hash_size[i], embed_size[i]]), partitioner=tf.fixed_size_partitioner(20, 0)) else: weights = self.get_weight_variable( [hash_size[i], embed_size[i]], regularizer1, self.param_dict["initializer_embedding_w"]( [hash_size[i], embed_size[i]]), partitioner=tf.fixed_size_partitioner(30, 0)) x_i = tf.nn.embedding_lookup(weights, x_sparse[i]) if i in sku_cate_brand_index: # skuid, cateid, brandid对应的embedding向量 w_action_embed.append(weights) x_action.append(x_i) indice_sku_cate_brand.append(x_sparse[i]) if is_train and is_mba_reg and not is_action_mba_reg: # 计算mba self.calculate_mini_batch_aware_reg( weights, x_sparse[i], lambda_reg_mba) else: if is_train and is_mba_reg: # 计算mba self.calculate_mini_batch_aware_reg( weights, x_sparse[i], lambda_reg_mba) else: x_i = tf.one_hot(x_sparse[i], depth=hash_size[i]) x_embed.append(x_i) # if i in sku_cate_brand_index: # skuid, cateid, 
brandid对应的embedding向量 # with tf.variable_scope("embedding_{}".format(i)): # weights = self.get_weight_variable([hash_size[i], embed_size[i]], regularizer1, # self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]), # partitioner=tf.fixed_size_partitioner(20, 0)) # w_action_embed.append(weights) # x_i = tf.nn.embedding_lookup(weights, x_sparse[i]) # if is_train and is_mba_reg and not is_action_mba_reg: # # 计算mba # self.calculate_mini_batch_aware_reg(weights, x_sparse[i], lambda_reg_mba) # # indice_sku_cate_brand.append(x_sparse[i]) # x_embed.append(x_i) # x_action.append(x_i) # else: # if embed_size[i] != -1: # with tf.variable_scope("embedding_{}".format(i)): # if i == 0: # weights = self.get_weight_variable([hash_size[i], embed_size[i]], regularizer1, # self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]), # partitioner=tf.fixed_size_partitioner(20, 0)) # else: # weights = self.get_weight_variable([hash_size[i], embed_size[i]], regularizer1, # self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]])) # x_i = tf.nn.embedding_lookup(weights, x_sparse[i]) # if is_train and is_mba_reg: # # 计算mba # self.calculate_mini_batch_aware_reg(weights, x_sparse[i], lambda_reg_mba) # # x_embed.append(x_i) # else: # x_i = tf.one_hot(x_sparse[i], depth=hash_size[i]) # x_embed.append(x_i) x_embed = tf.concat(x_embed, 1) # 对浏览行为建模,构建DIN with tf.name_scope("user_behaviours"): x_browse_skus_list = tf.reshape(x_action_lists[:, 0], [ -1, ]) x_browse_cates_list = tf.reshape(x_action_lists[:, 1], [ -1, ]) x_browse_brand_list = tf.reshape(x_action_lists[:, 2], [ -1, ]) browse_lists = [ x_browse_skus_list, x_browse_cates_list, x_browse_brand_list ] browse_names = ['skus', 'cates', 'brands'] browse_nums = self.param_dict["browse_nums"] x_action_list_embeds = [] sum_poolings = [] x_action_list_masks = [] for i in range(len(browse_names)): # for i in [0]: with tf.name_scope("user_browse_{}_embedding".format( browse_names[i])): browse_w_embed = w_action_embed[i] # x_ad_embedded = x_action[i] x_browse_action = browse_lists[ i] # shape of x_browse_action is [?,] x_browse_action_list = tf.string_split( x_browse_action, "#") x_browse_action_list_indices = tf.sparse_to_dense( x_browse_action_list.indices, # x_browse_action_list.dense_shape, [x_browse_action_list.dense_shape[0], browse_nums[i]], tf.string_to_hash_bucket_strong( x_browse_action_list.values, num_buckets=browse_w_embed.get_shape()[0].value, key=[679362, 964545], name="sparse_user_browse_{}".format( browse_names[i])), -1) indice_mask = tf.reshape( tf.not_equal(x_browse_action_list_indices, -1), [-1, browse_nums[i]]) x_action_list_masks.append(indice_mask) x_action_list_embed = tf.reshape( tf.nn.embedding_lookup(browse_w_embed, x_browse_action_list_indices), [ -1, browse_nums[i], browse_w_embed.get_shape()[1].value ]) if is_train and is_action_mba_reg: # 计算mba indice_action = tf.concat([ tf.string_to_hash_bucket_strong( x_browse_action_list.values, num_buckets=browse_w_embed.get_shape() [0].value, key=[679362, 964545]), indice_sku_cate_brand[i] ], 0) self.calculate_mini_batch_aware_reg( browse_w_embed, indice_action, lambda_reg_mba) x_action_list_embeds.append(x_action_list_embed) with tf.name_scope("activation_unit"): act_unit_hidden_layers = self.param_dict[ "act_unit_hidden_layers"] action_indexs = self.param_dict["action_indexs"] # for i in range(len(x_action_list_embeds)): for i in action_indexs: x_action_list_embed = x_action_list_embeds[i] x_ad_embedded = x_action[i] indice_mask = 
x_action_list_masks[i] # 外积:笛卡尔积矩阵拉平向量 # out_product_list = tf.map_fn(lambda action_emb: tf.reshape(tf.matmul(tf.expand_dims(action_emb, 2), tf.expand_dims(x_ad_embedded, 1)), [-1, x_ad_embedded.shape[1].value ** 2]), # tf.transpose(x_action_list_embed, [1, 0, 2])) # 近似外积:向量相减再concat向量点积 x_action_list_embed_new = tf.transpose( x_action_list_embed, [1, 0, 2]) concat_list = [ tf.concat([ x_action_list_embed_new[ii], x_action_list_embed_new[ii] - x_ad_embedded, x_action_list_embed_new[ii] * x_ad_embedded, x_ad_embedded ], 1) for ii in range(x_action_list_embed_new.shape[0].value) ] act_unit_in = concat_list[0].shape[1].value act_in = concat_list with tf.variable_scope("activation_unit_{}_list".format( browse_names[i])): for ii in range(len(act_unit_hidden_layers)): weights_act_unit = self.get_weight_variable( [act_unit_in, act_unit_hidden_layers[ii]], regularizer3, self.param_dict["initializer_act_unit_w"]( [act_unit_in, act_unit_hidden_layers[ii]]), name='_act_unit_w_{}'.format(ii)) biases_act_unit = tf.get_variable( "biases_{}_act_unit".format(ii), [act_unit_hidden_layers[ii]], initializer=tf.constant_initializer(0.0), dtype=tf.float32) act_out = list( map( lambda act_in_i: act_fn( tf.matmul(act_in_i[0], weights_act_unit ) + biases_act_unit, name="act_func_{}_{}".format( ii, act_in_i[1])), zip(act_in, range(len(act_in))))) # act_out = [tf.expand_dims(act_fn(tf.matmul(act_in[ii], weights_act_unit) + biases_act_unit, name="act_func_{}_{}".format(i, ii)), 0) # for ii in range(act_in.shape[0].value)] act_in = act_out act_unit_in = act_in[0].shape[1].value act_output_in = act_in act_output_unit = act_unit_in weights_act_unit_output = self.get_weight_variable( [act_output_unit, 1], regularizer3, self.param_dict["initializer_act_unit_w"]( [act_output_unit, 1]), name='_act_unit_output_w') biases_act_unit_output = tf.get_variable( "biases_act_unit_output", [1], initializer=tf.constant_initializer(0.0), dtype=tf.float32) act_output_out = tf.concat( list( map( lambda act_output_i: tf.expand_dims( tf.matmul(act_output_i, weights_act_unit_output) + biases_act_unit_output, 0), act_output_in)), 0) # act_output_out = tf.concat([tf.expand_dims(tf.matmul(act_output_in[iii], weights_act_unit_output) + biases_act_unit_output, 0) for iii in range(act_output_in.shape[0].value)], 0) active_weight_score = tf.transpose(act_output_out, [1, 0, 2]) # 将空缺行为的权重设置为0.0 padding = tf.zeros_like(active_weight_score) active_weight_score_t = tf.where( tf.expand_dims(indice_mask, 2), active_weight_score, padding) with tf.name_scope("weight_sum_pooling"): sum_pooling = tf.reduce_sum( x_action_list_embed * active_weight_score_t, 1) sum_poolings.append(sum_pooling) x_deep_in = tf.concat([x_embed, tf.concat(sum_poolings, 1)], 1) # 构建deep模块 with tf.name_scope("deep_network"): deep_layers = self.param_dict["deep_layers"] for i in range(len(deep_layers)): with tf.variable_scope("dnn_layer_{}".format(i)): weights = self.get_weight_variable( [x_deep_in.shape[1].value, deep_layers[i]], regularizer2, self.param_dict["initializer_dnn_w"]( [x_deep_in.shape[1].value, deep_layers[i]])) biases = tf.get_variable( "biases", [deep_layers[i]], initializer=tf.constant_initializer(0.0), dtype=tf.float32) layer_i = act_fn(tf.matmul(x_deep_in, weights) + biases, name="deep_mlp_{}".format(i)) x_deep_in = layer_i # 构建输出模块full connect x_fc_in = x_deep_in with tf.name_scope("fc_layers"): fc_layers = self.param_dict['fc_layers'] for i in range(len(fc_layers)): with tf.variable_scope("fc_layers_{}".format(i)): weights = self.get_weight_variable( 
[x_fc_in.shape[1].value, fc_layers[i]], regularizer4, self.param_dict["initializer_fc_w"]( [x_fc_in.shape[1].value, fc_layers[i]])) biases = tf.get_variable( "biases", [fc_layers[i]], initializer=tf.constant_initializer(0.0), dtype=tf.float32) layer_i = tf.nn.sigmoid( tf.matmul(x_fc_in, weights) + biases) x_fc_in = layer_i logit = x_fc_in return logit
def train():
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  print('PS hosts are: %s' % ps_hosts)
  print('Worker hosts are: %s' % worker_hosts)

  server = tf.train.Server({'ps': ps_hosts, 'worker': worker_hosts},
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_id)
  if FLAGS.job_name == 'ps':
    server.join()

  is_chief = (FLAGS.task_id == 0)
  if is_chief:
    if tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

  device_setter = tf.train.replica_device_setter(ps_tasks=len(ps_hosts))
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
    partitioner = tf.fixed_size_partitioner(len(ps_hosts), axis=0)
    with tf.variable_scope('root', partitioner=partitioner):
      with tf.device(device_setter):
        global_step = tf.Variable(0, trainable=False)
        decay_steps = 50000 * 350.0 / FLAGS.batch_size
        batch_size = tf.placeholder(dtype=tf.int32, shape=(),
                                    name='batch_size')
        images, labels = cifar10.distorted_inputs(batch_size)
        re = tf.shape(images)[0]
        inputs = tf.reshape(images, [-1, _HEIGHT, _WIDTH, _DEPTH])
        labels = tf.one_hot(labels, 10, 1, 0)
        network_fn = nets_factory.get_network_fn('vgg_16', num_classes=10)
        (logits, _) = network_fn(inputs)
        cross_entropy = tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=labels)
        loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
            [tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        train_op = cifar10.train(loss, global_step)

        sv = tf.train.Supervisor(
            is_chief=is_chief,
            logdir=FLAGS.train_dir,
            init_op=tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer()),
            summary_op=None,
            global_step=global_step,
            saver=None,
            recovery_wait_secs=1,
            save_model_secs=60)
        tf.logging.info('%s Supervisor' % datetime.now())

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)
        sess_config.gpu_options.allow_growth = True

        # Get a session.
        sess = sv.prepare_or_wait_for_session(server.target,
                                              config=sess_config)

        # Start the queue runners.
        queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        sv.start_queue_runners(sess, queue_runners)

        # Train CIFAR-10 for a number of steps.
        batch_size_num = FLAGS.batch_size
        for step in range(FLAGS.max_steps):
          start_time = time.time()
          run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
          run_metadata = tf.RunMetadata()
          num_batches_per_epoch = (
              NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size_num)
          decay_steps_num = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
          _, loss_value, gs = sess.run(
              [train_op, loss, global_step],
              feed_dict={batch_size: batch_size_num},
              options=run_options,
              run_metadata=run_metadata)
          duration = time.time() - start_time
          num_examples_per_step = batch_size_num
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)
          format_str = (
              "time: " + str(time.time()) +
              '; %s: step %d (global_step %d), loss = %.2f '
              '(%.1f examples/sec; %.3f sec/batch)')
          tf.logging.info(format_str %
                          (datetime.now(), step, gs, loss_value,
                           examples_per_sec, sec_per_batch))
# Assign variables to ps tasks in round-robin fashion.
W1 = tf.get_variable('weights_1', [784, 100])
b1 = tf.get_variable('biases_1', [100])
W2 = tf.get_variable('weights_2', [100, 10])
b2 = tf.get_variable('biases_2', [10])

greedy = tf.contrib.training.GreedyLoadBalancingStrategy(_)
with tf.device(tf.train.replica_device_setter(ps_tasks=3, ps_strategy=greedy)):
  # Assign variables to the least-loaded ps task instead of round-robin.
  W1 = tf.get_variable('weights_1', [784, 100])
  b1 = tf.get_variable('biases_1', [100])
  W2 = tf.get_variable('weights_2', [100, 10])
  b2 = tf.get_variable('biases_2', [10])

embedding = tf.get_variable('embedding', [1000000000, 20],
                            partitioner=tf.fixed_size_partitioner(3))
# Each PS task writes its shard in parallel; this is not the default.
saver = tf.train.Saver(sharded=True)

# Distributed code for a worker task.
cluster = tf.train.ClusterSpec({
    "workers": ["192.168.0.1:2222", ...],
    "ps": ["192.168.1.1:2222", ...]
})
# Cluster manager, e.g. Borg.
server = tf.train.Server(cluster, job_name="worker", task_index=0)
# The server represents a particular task.
with tf.Session(server.target) as sess:
  ...
  if is_chief and step % 1000 == 0:
    ...
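# A hedged sketch of the greedy placement used above (the `_` placeholder in
# the notes is left as-is): to my understanding, GreedyLoadBalancingStrategy
# takes the number of ps tasks plus a load function such as
# tf.contrib.training.byte_size_load_fn, so each variable is placed on the
# currently least-loaded parameter server.
import tensorflow as tf

greedy = tf.contrib.training.GreedyLoadBalancingStrategy(
    num_tasks=3, load_fn=tf.contrib.training.byte_size_load_fn)
with tf.device(tf.train.replica_device_setter(ps_tasks=3, ps_strategy=greedy)):
  w = tf.get_variable('weights', [784, 100])
  b = tf.get_variable('biases', [100])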
def bayesianesque_embeddings(features, labels, mode, params):
  """Note: Labels will be max lengths."""
  seq_len = features["lens"]
  raw_seqs = features["seqs"]
  pdrop = params["pdrop"] if mode == tf.estimator.ModeKeys.TRAIN else 0.0
  # TODO add partitioners to these
  mu_embed = tf.get_variable(
      "mean_embed",
      shape=[params["vocab_size"], params["embed_dim"]],
      dtype=tf.float32,
      partitioner=tf.fixed_size_partitioner(params["num_shards"]))
  mean_embedded_input = tf.nn.embedding_lookup(mu_embed, raw_seqs,
                                               partition_strategy="div")

  if not params["raw_word2vec"]:
    cov_embed = tf.get_variable(
        "cov_embed",
        shape=[params["vocab_size"], params["variance_size"],
               params["embed_dim"]],
        partitioner=tf.fixed_size_partitioner(params["num_shards"]))
    cov_embed_input = tf.nn.embedding_lookup(cov_embed, raw_seqs,
                                             partition_strategy="div")
    transformer_in = add_timing_signal_1d(mean_embedded_input)
    h = block(transformer_in, params["n_heads"], seq_len, pdrop,
              "trans_block")
    cov = mlp(h, "mlp", params["embed_dim"] * 2, pdrop,
              nx=params["variance_size"])  # batch, seq, cov_dim
    mean, variance = tf.nn.moments(cov, [0, 1])
    divergence_loss = tf.reduce_mean(tf.abs(mean) + tf.abs(variance - 1.0))
    # TODO minimise divergence
    embedding = mean_embedded_input + tf.reduce_sum(
        tf.expand_dims(cov, 3) * cov_embed_input,
        2)  # batch, seq_len, embed_dim
  else:
    embedding = mean_embedded_input
    divergence_loss = 0.0

  seq_len_with_pad = tf.shape(embedding)[1]
  embed_mask = tf.expand_dims(
      tf.sequence_mask(seq_len, seq_len_with_pad, dtype=tf.float32), -1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    embedding *= embed_mask
    embedding = tf.reduce_mean(embedding, axis=1)
    return tf.estimator.EstimatorSpec(mode=mode, predictions=embedding)

  if params["reweight"]:
    freqs = tf.gather(params["frequencies"], raw_seqs)
    weights = tf.sqrt(1 / freqs)
    embedding = class_reweighting(tf.expand_dims(weights, -1))(embedding)

  output_embed = tf.get_variable(
      "output_embed",
      shape=[params["vocab_size"], params["embed_dim"]],
      dtype=tf.float32,
      partitioner=tf.fixed_size_partitioner(params["num_shards"]))
  output_bias = tf.get_variable("output_bias",
                                shape=[params["vocab_size"]],
                                dtype=tf.float32)

  if params["reconstruction_loss"]:
    reconstruction_loss = decoder(
        embedding, raw_seqs,
        tf.reduce_mean(embedding * embed_mask, axis=1),
        output_embed, output_bias, params["n_heads"],
        params["num_decoder_blocks"], pdrop, seq_len,
        params["vocab_size"], params["sampled_softmax_size"])
  else:
    reconstruction_loss = 0.0

  window = params["window_size"]
  pf = tf.pad(raw_seqs, [[0, 0], [window, window]],
              constant_values=params["pad_id"])
  targets = tf.map_fn(
      lambda i: tf.concat(
          (pf[:, i - window:i], pf[:, i + 1:i + 1 + window]), axis=-1),
      tf.range(window, window + seq_len_with_pad),
      dtype=tf.int32)
  embedding = tf.reshape(embedding, shape=[-1, params["embed_dim"]])
  mask = tf.reshape(
      tf.sequence_mask(seq_len, seq_len_with_pad, dtype=tf.float32), [-1])
  # targets = tf.Print(targets, [tf.reshape(targets, shape=[-1, window * 2])[0], raw_seqs[0], mask], summarize=1000)
  loss = tf.nn.nce_loss(
      output_embed,
      output_bias,
      tf.reshape(targets, shape=[-1, window * 2]),
      embedding,
      params["sampled_softmax_size"],
      params["vocab_size"],
      num_true=window * 2,
      remove_accidental_hits=True,
      partition_strategy='div',
      name='nce_loss',
  )
  loss = tf.reduce_sum(loss * mask) / tf.reduce_sum(mask)
  tf.summary.scalar("Reconstruction", reconstruction_loss)
  tf.summary.scalar("MainLoss", loss)
  tf.summary.scalar("DivergenceLoss", divergence_loss)
  loss = loss + reconstruction_loss * params["reconstruction_weight"]

  if mode == tf.estimator.ModeKeys.TRAIN:
    lr = noam_lr(params.get("learning_rate", None), params["embed_dim"],
                 tf.train.get_or_create_global_step(),
                 params["warmup_steps"])
    optimizer = params.get("optimizer")
    train_op = tf.contrib.layers.optimize_loss(
        learning_rate=lr,
        loss=tf.reduce_mean(
            loss + divergence_loss * params["divergence_weight"]),
        global_step=tf.train.get_global_step(),
        optimizer=optimizer,
        clip_gradients=1.0,
        summaries=[
            "learning_rate",
            "loss",
            "gradients",
            "gradient_norm",
            "global_gradient_norm",
        ])
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                      train_op=train_op)

  assert mode == tf.estimator.ModeKeys.EVAL
  return tf.estimator.EstimatorSpec(mode=mode, loss=loss)
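# A hedged note on the partition_strategy="div" lookups above: with a
# fixed_size_partitioner, tf.nn.embedding_lookup accepts the sharded variable
# directly, and "div" assigns contiguous id ranges to the shards, so the same
# strategy must be used everywhere the table is read. A minimal sketch (TF1.x
# assumed):
import tensorflow as tf

embed = tf.get_variable(
    "embed", shape=[1000, 16],
    partitioner=tf.fixed_size_partitioner(num_shards=4))
ids = tf.constant([[1, 2], [3, 999]])
# Shape [2, 2, 16]; with "div", ids 0-249 live in shard 0, 250-499 in
# shard 1, and so on.
vectors = tf.nn.embedding_lookup(embed, ids, partition_strategy="div")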
def inception_v3(inputs,
                 dropout_keep_prob=0.8,
                 num_classes=1000,
                 is_training=True,
                 restore_logits=True,
                 scope=''):
  """Latest Inception from http://arxiv.org/abs/1512.00567.

  "Rethinking the Inception Architecture for Computer Vision"
  Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens,
  Zbigniew Wojna

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    dropout_keep_prob: dropout keep_prob.
    num_classes: number of predicted classes.
    is_training: whether is training or not.
    restore_logits: whether or not the logits layers should be restored.
      Useful for fine-tuning a model with different num_classes.
    scope: Optional scope for name_scope.

  Returns:
    a list containing 'logits', 'aux_logits' Tensors.
  """
  # end_points will collect relevant activations for external use, for example
  # summaries or losses.
  end_points = {}
  partitioner = tf.fixed_size_partitioner(2, axis=0)
  with tf.name_scope(scope, 'inception_v3', [inputs]):
    with scopes.arg_scope([ops.conv2d, ops.fc, ops.batch_norm, ops.dropout],
                          is_training=is_training):
      with scopes.arg_scope([ops.conv2d, ops.max_pool, ops.avg_pool],
                            stride=1, padding='VALID'):
        # 299 x 299 x 3
        end_points['conv0'] = ops.conv2d(inputs, 32, [3, 3], stride=2,
                                         scope='conv0')
        # 149 x 149 x 32
        end_points['conv1'] = ops.conv2d(end_points['conv0'], 32, [3, 3],
                                         scope='conv1')
        # 147 x 147 x 32
        end_points['conv2'] = ops.conv2d(end_points['conv1'], 64, [3, 3],
                                         padding='SAME', scope='conv2')
        # 147 x 147 x 64
        end_points['pool1'] = ops.max_pool(end_points['conv2'], [3, 3],
                                           stride=2, scope='pool1')
        # 73 x 73 x 64
        end_points['conv3'] = ops.conv2d(end_points['pool1'], 80, [1, 1],
                                         scope='conv3')
        # 73 x 73 x 80.
        end_points['conv4'] = ops.conv2d(end_points['conv3'], 192, [3, 3],
                                         scope='conv4')
        # 71 x 71 x 192.
        end_points['pool2'] = ops.max_pool(end_points['conv4'], [3, 3],
                                           stride=2, scope='pool2')
        # 35 x 35 x 192.
        net = end_points['pool2']
      # Inception blocks
      with scopes.arg_scope([ops.conv2d, ops.max_pool, ops.avg_pool],
                            stride=1, padding='SAME'):
        # mixed: 35 x 35 x 256.
        with tf.variable_scope('mixed_35x35x256a'):
          with tf.variable_scope('branch1x1'):
            branch1x1 = ops.conv2d(net, 64, [1, 1])
          with tf.variable_scope('branch5x5'):
            branch5x5 = ops.conv2d(net, 48, [1, 1])
            branch5x5 = ops.conv2d(branch5x5, 64, [5, 5])
          with tf.variable_scope('branch3x3dbl'):
            branch3x3dbl = ops.conv2d(net, 64, [1, 1])
            branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
            branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
          with tf.variable_scope('branch_pool'):
            branch_pool = ops.avg_pool(net, [3, 3])
            branch_pool = ops.conv2d(branch_pool, 32, [1, 1])
          net = tf.concat(axis=3, values=[branch1x1, branch5x5,
                                          branch3x3dbl, branch_pool])
          end_points['mixed_35x35x256a'] = net
        # mixed_1: 35 x 35 x 288.
        with tf.variable_scope('mixed_35x35x288a'):
          with tf.variable_scope('branch1x1'):
            branch1x1 = ops.conv2d(net, 64, [1, 1])
          with tf.variable_scope('branch5x5'):
            branch5x5 = ops.conv2d(net, 48, [1, 1])
            branch5x5 = ops.conv2d(branch5x5, 64, [5, 5])
          with tf.variable_scope('branch3x3dbl'):
            branch3x3dbl = ops.conv2d(net, 64, [1, 1])
            branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
            branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
          with tf.variable_scope('branch_pool'):
            branch_pool = ops.avg_pool(net, [3, 3])
            branch_pool = ops.conv2d(branch_pool, 64, [1, 1])
          net = tf.concat(axis=3, values=[branch1x1, branch5x5,
                                          branch3x3dbl, branch_pool])
          end_points['mixed_35x35x288a'] = net
        # mixed_2: 35 x 35 x 288.
        with tf.variable_scope('mixed_35x35x288b'):
          with tf.variable_scope('branch1x1'):
            branch1x1 = ops.conv2d(net, 64, [1, 1])
          with tf.variable_scope('branch5x5'):
            branch5x5 = ops.conv2d(net, 48, [1, 1])
            branch5x5 = ops.conv2d(branch5x5, 64, [5, 5])
          with tf.variable_scope('branch3x3dbl'):
            branch3x3dbl = ops.conv2d(net, 64, [1, 1])
            branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
            branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
          with tf.variable_scope('branch_pool'):
            branch_pool = ops.avg_pool(net, [3, 3])
            branch_pool = ops.conv2d(branch_pool, 64, [1, 1])
          net = tf.concat(axis=3, values=[branch1x1, branch5x5,
                                          branch3x3dbl, branch_pool])
          end_points['mixed_35x35x288b'] = net
        # mixed_3: 17 x 17 x 768.
        with tf.variable_scope('mixed_17x17x768a'):
          with tf.variable_scope('branch3x3'):
            branch3x3 = ops.conv2d(net, 384, [3, 3], stride=2,
                                   padding='VALID')
          with tf.variable_scope('branch3x3dbl'):
            branch3x3dbl = ops.conv2d(net, 64, [1, 1])
            branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
            branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3],
                                      stride=2, padding='VALID')
          with tf.variable_scope('branch_pool'):
            branch_pool = ops.max_pool(net, [3, 3], stride=2,
                                       padding='VALID')
          net = tf.concat(axis=3, values=[branch3x3, branch3x3dbl,
                                          branch_pool])
          end_points['mixed_17x17x768a'] = net
        # mixed4: 17 x 17 x 768.
        with tf.variable_scope('mixed_17x17x768b'):
          with tf.variable_scope('branch1x1'):
            branch1x1 = ops.conv2d(net, 192, [1, 1])
          with tf.variable_scope('branch7x7'):
            branch7x7 = ops.conv2d(net, 128, [1, 1])
            branch7x7 = ops.conv2d(branch7x7, 128, [1, 7])
            branch7x7 = ops.conv2d(branch7x7, 192, [7, 1])
          with tf.variable_scope('branch7x7dbl'):
            branch7x7dbl = ops.conv2d(net, 128, [1, 1])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 128, [7, 1])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 128, [1, 7])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 128, [7, 1])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
          with tf.variable_scope('branch_pool'):
            branch_pool = ops.avg_pool(net, [3, 3])
            branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
          net = tf.concat(axis=3, values=[branch1x1, branch7x7,
                                          branch7x7dbl, branch_pool])
          end_points['mixed_17x17x768b'] = net
        # mixed_5: 17 x 17 x 768.
        with tf.variable_scope('mixed_17x17x768c'):
          with tf.variable_scope('branch1x1'):
            branch1x1 = ops.conv2d(net, 192, [1, 1])
          with tf.variable_scope('branch7x7'):
            branch7x7 = ops.conv2d(net, 160, [1, 1])
            branch7x7 = ops.conv2d(branch7x7, 160, [1, 7])
            branch7x7 = ops.conv2d(branch7x7, 192, [7, 1])
          with tf.variable_scope('branch7x7dbl'):
            branch7x7dbl = ops.conv2d(net, 160, [1, 1])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [1, 7])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
          with tf.variable_scope('branch_pool'):
            branch_pool = ops.avg_pool(net, [3, 3])
            branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
          net = tf.concat(axis=3, values=[branch1x1, branch7x7,
                                          branch7x7dbl, branch_pool])
          end_points['mixed_17x17x768c'] = net
        # mixed_6: 17 x 17 x 768.
        with tf.variable_scope('mixed_17x17x768d'):
          with tf.variable_scope('branch1x1'):
            branch1x1 = ops.conv2d(net, 192, [1, 1])
          with tf.variable_scope('branch7x7'):
            branch7x7 = ops.conv2d(net, 160, [1, 1])
            branch7x7 = ops.conv2d(branch7x7, 160, [1, 7])
            branch7x7 = ops.conv2d(branch7x7, 192, [7, 1])
          with tf.variable_scope('branch7x7dbl'):
            branch7x7dbl = ops.conv2d(net, 160, [1, 1])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [1, 7])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
          with tf.variable_scope('branch_pool'):
            branch_pool = ops.avg_pool(net, [3, 3])
            branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
          net = tf.concat(axis=3, values=[branch1x1, branch7x7,
                                          branch7x7dbl, branch_pool])
          end_points['mixed_17x17x768d'] = net
        # mixed_7: 17 x 17 x 768.
        with tf.variable_scope('mixed_17x17x768e'):
          with tf.variable_scope('branch1x1'):
            branch1x1 = ops.conv2d(net, 192, [1, 1])
          with tf.variable_scope('branch7x7'):
            branch7x7 = ops.conv2d(net, 192, [1, 1])
            branch7x7 = ops.conv2d(branch7x7, 192, [1, 7])
            branch7x7 = ops.conv2d(branch7x7, 192, [7, 1])
          with tf.variable_scope('branch7x7dbl'):
            branch7x7dbl = ops.conv2d(net, 192, [1, 1])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [7, 1])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [7, 1])
            branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
          with tf.variable_scope('branch_pool'):
            branch_pool = ops.avg_pool(net, [3, 3])
            branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
          net = tf.concat(axis=3, values=[branch1x1, branch7x7,
                                          branch7x7dbl, branch_pool])
          end_points['mixed_17x17x768e'] = net
        # Auxiliary Head logits
        aux_logits = tf.identity(end_points['mixed_17x17x768e'])
        with tf.variable_scope('aux_logits'):
          aux_logits = ops.avg_pool(aux_logits, [5, 5], stride=3,
                                    padding='VALID')
          aux_logits = ops.conv2d(aux_logits, 128, [1, 1], scope='proj')
          # Shape of feature map before the final layer.
          shape = aux_logits.get_shape()
          aux_logits = ops.conv2d(aux_logits, 768, shape[1:3], stddev=0.01,
                                  padding='VALID')
          aux_logits = ops.flatten(aux_logits)
          aux_logits = ops.fc(aux_logits, num_classes, activation=None,
                              stddev=0.001, restore=restore_logits)
          end_points['aux_logits'] = aux_logits
        # mixed_8: 8 x 8 x 1280.
        # Note that the scope below is not changed to not void previous
        # checkpoints.
        # (TODO) Fix the scope when appropriate.
        with tf.variable_scope('mixed_17x17x1280a'):
          with tf.variable_scope('branch3x3'):
            branch3x3 = ops.conv2d(net, 192, [1, 1])
            branch3x3 = ops.conv2d(branch3x3, 320, [3, 3], stride=2,
                                   padding='VALID')
          with tf.variable_scope('branch7x7x3'):
            branch7x7x3 = ops.conv2d(net, 192, [1, 1])
            branch7x7x3 = ops.conv2d(branch7x7x3, 192, [1, 7])
            branch7x7x3 = ops.conv2d(branch7x7x3, 192, [7, 1])
            branch7x7x3 = ops.conv2d(branch7x7x3, 192, [3, 3], stride=2,
                                     padding='VALID')
          with tf.variable_scope('branch_pool'):
            branch_pool = ops.max_pool(net, [3, 3], stride=2,
                                       padding='VALID')
          net = tf.concat(axis=3, values=[branch3x3, branch7x7x3,
                                          branch_pool])
          end_points['mixed_17x17x1280a'] = net
        # mixed_9: 8 x 8 x 2048.
        with tf.variable_scope('mixed_8x8x2048a'):
          with tf.variable_scope('branch1x1'):
            branch1x1 = ops.conv2d(net, 320, [1, 1])
          with tf.variable_scope('branch3x3'):
            branch3x3 = ops.conv2d(net, 384, [1, 1])
            branch3x3 = tf.concat(
                axis=3, values=[ops.conv2d(branch3x3, 384, [1, 3]),
                                ops.conv2d(branch3x3, 384, [3, 1])])
          with tf.variable_scope('branch3x3dbl'):
            branch3x3dbl = ops.conv2d(net, 448, [1, 1])
            branch3x3dbl = ops.conv2d(branch3x3dbl, 384, [3, 3])
            branch3x3dbl = tf.concat(
                axis=3, values=[ops.conv2d(branch3x3dbl, 384, [1, 3]),
                                ops.conv2d(branch3x3dbl, 384, [3, 1])])
          with tf.variable_scope('branch_pool'):
            branch_pool = ops.avg_pool(net, [3, 3])
            branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
          net = tf.concat(axis=3, values=[branch1x1, branch3x3,
                                          branch3x3dbl, branch_pool])
          end_points['mixed_8x8x2048a'] = net
        # mixed_10: 8 x 8 x 2048.
        with tf.variable_scope('mixed_8x8x2048b'):
          with tf.variable_scope('branch1x1'):
            branch1x1 = ops.conv2d(net, 320, [1, 1])
          with tf.variable_scope('branch3x3'):
            branch3x3 = ops.conv2d(net, 384, [1, 1])
            branch3x3 = tf.concat(
                axis=3, values=[ops.conv2d(branch3x3, 384, [1, 3]),
                                ops.conv2d(branch3x3, 384, [3, 1])])
          with tf.variable_scope('branch3x3dbl'):
            branch3x3dbl = ops.conv2d(net, 448, [1, 1])
            branch3x3dbl = ops.conv2d(branch3x3dbl, 384, [3, 3])
            branch3x3dbl = tf.concat(
                axis=3, values=[ops.conv2d(branch3x3dbl, 384, [1, 3]),
                                ops.conv2d(branch3x3dbl, 384, [3, 1])])
          with tf.variable_scope('branch_pool'):
            branch_pool = ops.avg_pool(net, [3, 3])
            branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
          net = tf.concat(axis=3, values=[branch1x1, branch3x3,
                                          branch3x3dbl, branch_pool])
          end_points['mixed_8x8x2048b'] = net
        # Final pooling and prediction
        with tf.variable_scope('logits'):
          shape = net.get_shape()
          net = ops.avg_pool(net, shape[1:3], padding='VALID', scope='pool')
          # 1 x 1 x 2048
          net = ops.dropout(net, dropout_keep_prob, scope='dropout')
          net = ops.flatten(net, scope='flatten')
          # 2048
          logits = ops.fc(net, num_classes, activation=None,
                          scope='logits', restore=restore_logits)
          # 1000
          end_points['logits'] = logits
          end_points['predictions'] = tf.nn.softmax(logits,
                                                    name='predictions')
      return logits, end_points
def create_emb_for_encoder_and_decoder(share_vocab,
                                       src_vocab_size,
                                       tgt_vocab_size,
                                       src_embed_size,
                                       tgt_embed_size,
                                       dtype=tf.float32,
                                       num_enc_partitions=0,
                                       num_dec_partitions=0,
                                       src_vocab_file=None,
                                       tgt_vocab_file=None,
                                       src_embed_file=None,
                                       tgt_embed_file=None,
                                       use_char_encode=False,
                                       scope=None):
  if num_enc_partitions <= 1:
    enc_partitioner = None
  else:
    enc_partitioner = tf.fixed_size_partitioner(num_enc_partitions)

  if num_dec_partitions <= 1:
    dec_partitioner = None
  else:
    dec_partitioner = tf.fixed_size_partitioner(num_dec_partitions)

  if src_embed_file and enc_partitioner:
    raise ValueError(
        "Can't set num_enc_partitions > 1 when using pretrained encoder "
        "embedding")
  if tgt_embed_file and dec_partitioner:
    raise ValueError(
        "Can't set num_dec_partitions > 1 when using pretrained decoder "
        "embedding")

  with tf.variable_scope(scope or "embeddings", dtype=dtype,
                         partitioner=enc_partitioner) as scope:
    # Share embedding
    if share_vocab:
      if src_vocab_size != tgt_vocab_size:
        raise ValueError("Share embedding but different src/tgt vocab sizes"
                         " %d vs. %d" % (src_vocab_size, tgt_vocab_size))
      assert src_embed_size == tgt_embed_size
      utils.print_out("# Use the same embedding for source and target")
      vocab_file = src_vocab_file or tgt_vocab_file
      embed_file = src_embed_file or tgt_embed_file

      embedding_encoder = _create_or_load_embed(
          "embedding_share", vocab_file, embed_file,
          src_vocab_size, src_embed_size, dtype)
      embedding_decoder = embedding_encoder
    else:
      if not use_char_encode:
        with tf.variable_scope("encoder", partitioner=enc_partitioner):
          embedding_encoder = _create_or_load_embed(
              "embedding_encoder", src_vocab_file, src_embed_file,
              src_vocab_size, src_embed_size, dtype)
      else:
        embedding_encoder = None

      with tf.variable_scope("decoder", partitioner=dec_partitioner):
        embedding_decoder = _create_or_load_embed(
            "embedding_decoder", tgt_vocab_file, tgt_embed_file,
            tgt_vocab_size, tgt_embed_size, dtype)

  return embedding_encoder, embedding_decoder
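# A hedged usage sketch for the helper above (the argument values are
# illustrative): num_enc_partitions/num_dec_partitions map directly onto
# tf.fixed_size_partitioner, and must stay at 0 or 1 whenever a pretrained
# embedding file is supplied.
embedding_encoder, embedding_decoder = create_emb_for_encoder_and_decoder(
    share_vocab=False,
    src_vocab_size=32000, tgt_vocab_size=32000,
    src_embed_size=512, tgt_embed_size=512,
    num_enc_partitions=4, num_dec_partitions=4)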
def __call__(self, inputs, state, scope=None): """Run one step of LSTM. Args: inputs: input Tensor, 2D, batch x num_units. state: if `state_is_tuple` is False, this must be a state Tensor, `2-D, batch x state_size`. If `state_is_tuple` is True, this must be a tuple of state Tensors, both `2-D`, with column sizes `c_state` and `m_state`. scope: VariableScope for the created subgraph; defaults to "lstm_cell". Returns: A tuple containing: - A `2-D, [batch x output_dim]`, Tensor representing the output of the LSTM after reading `inputs` when previous state was `state`. Here output_dim is: num_proj if num_proj was set, num_units otherwise. - Tensor(s) representing the new state of LSTM after reading `inputs` when the previous state was `state`. Same type and shape(s) as `state`. Raises: ValueError: If input size cannot be inferred from inputs via static shape inference. """ num_proj = self._num_units if self._num_proj is None else self._num_proj if self._state_is_tuple: (c_prev, h_prev) = state else: c_prev = tf.slice(state, begin=[0, 0], size=[-1, self._num_units]) h_prev = tf.slice(state, begin=[0, self._num_units], size=[-1, num_proj]) dtype = inputs.dtype input_size = inputs.get_shape().with_rank(2)[1] if input_size.value is None: raise ValueError("Could not infer input size from inputs.get_shape()[-1]") with tf.variable_scope(scope or "lstm_cell", initializer=self._initializer, reuse=self._reuse) as unit_scope: if self._num_unit_shards is not None: unit_scope.set_partitioner( tf.fixed_size_partitioner(self._num_unit_shards)) # i = input_gate, g = new_input, f = forget_gate, o = output_gate lstm_matrix = tf.contrib.rnn._linear([inputs, h_prev], 4 * self._num_units, bias=True) i, g, f, o = tf.split(value=lstm_matrix, num_or_size_splits=4, axis=1) # Diagonal connections if self._use_peepholes: # tf.variable_scope and tf.get_variable are used together as a pair with tf.variable_scope(unit_scope) as projection_scope: if self._num_unit_shards is not None: projection_scope.set_partitioner(None) w_f_diag = tf.get_variable("w_f_diag", shape=[self._num_units], dtype=dtype) w_i_diag = tf.get_variable("w_i_diag", shape=[self._num_units], dtype=dtype) w_o_diag = tf.get_variable("w_o_diag", shape=[self._num_units], dtype=dtype) if self._use_peepholes: c = (tf.sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev + tf.sigmoid(i + w_i_diag * c_prev) * tf.tanh(g)) else: c = (tf.sigmoid(f + self._forget_bias) * c_prev + tf.sigmoid(i) * tf.tanh(g)) if self._cell_clip is not None: c = tf.clip_by_value(c, -self._cell_clip, self._cell_clip) if self._use_peepholes: h = tf.sigmoid(o + w_o_diag * c) * tf.tanh(c) else: h = tf.sigmoid(o) * tf.tanh(c) if self._num_proj is not None: with tf.variable_scope("projection") as proj_scope: if self._num_proj_shards is not None: proj_scope.set_partitioner( tf.fixed_size_partitioner(self._num_proj_shards)) h = tf.contrib.rnn._linear(h, self._num_proj, bias=False) if self._proj_clip is not None: h = tf.clip_by_value(h, -self._proj_clip, self._proj_clip) new_state = (LSTMStateTuple(c, h) if self._state_is_tuple else tf.concat([c, h], 1)) return h, new_state
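The scope juggling above (shard the large LSTM kernel, then clear the partitioner before creating the small peephole vectors) reduces to the following hedged sketch; names and sizes are illustrative, not taken from the cell.

import tensorflow as tf

def sharded_kernel_unsharded_peepholes(num_units=128, num_shards=4):
    with tf.variable_scope("lstm_cell") as unit_scope:
        unit_scope.set_partitioner(tf.fixed_size_partitioner(num_shards))
        # The 2-D kernel is large enough to be worth sharding.
        kernel = tf.get_variable("kernel", [4 * num_units, num_units])
        with tf.variable_scope(unit_scope) as peephole_scope:
            # 1-D per-unit vectors gain nothing from sharding.
            peephole_scope.set_partitioner(None)
            w_f_diag = tf.get_variable("w_f_diag", [num_units])
    return kernel, w_f_diag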
def train_criteo(model, cluster, task_id, nrank, args): def get_current_shard(data): part_size = data.shape[0] // nrank start = part_size * task_id end = start + part_size if task_id != nrank - 1 else data.shape[0] return data[start:end] if args.all: from models.load_data import process_all_criteo_data dense, sparse, all_labels = process_all_criteo_data() dense_feature = get_current_shard(dense[0]) sparse_feature = get_current_shard(sparse[0]) labels = get_current_shard(all_labels[0]) val_dense = get_current_shard(dense[1]) val_sparse = get_current_shard(sparse[1]) val_labels = get_current_shard(all_labels[1]) else: from models.load_data import process_sampled_criteo_data dense_feature, sparse_feature, labels = process_sampled_criteo_data() dense_feature = get_current_shard(dense_feature) sparse_feature = get_current_shard(sparse_feature) labels = get_current_shard(labels) batch_size = 128 worker_device = "/job:worker/task:%d/gpu:0" % (task_id) with tf.device(worker_device): dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13]) sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26]) y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1]) with tf.device(tf.compat.v1.train.replica_device_setter(cluster=cluster)): server_num = len(cluster.as_dict()['ps']) # print('this is server num:', server_num) embed_partitioner = tf.fixed_size_partitioner( server_num, 0) if server_num > 1 else None loss, y, opt = model(dense_input, sparse_input, y_, embed_partitioner) train_op = opt.minimize(loss) server = tf.train.Server(cluster, job_name="worker", task_index=task_id) init = tf.compat.v1.global_variables_initializer() sv = tf.train.Supervisor(is_chief=(task_id == 0), init_op=init, recovery_wait_secs=1) sess_config = tf.compat.v1.ConfigProto( allow_soft_placement=True, log_device_placement=False, device_filters=["/job:ps", "/job:worker/task:%d" % task_id]) sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) # sess.run(init) if task_id == 0: writer = tf.compat.v1.summary.FileWriter('logs/board', sess.graph) my_feed_dict = { dense_input: np.empty(shape=(batch_size, 13)), sparse_input: np.empty(shape=(batch_size, 26)), y_: np.empty(shape=(batch_size, 1)), } if args.all: raw_log_file = './logs/tf_dist_%s_%d.log' % (args.model, task_id) print('Processing all data, log to', raw_log_file) log_file = open(raw_log_file, 'w') iterations = dense_feature.shape[0] // batch_size total_epoch = 11 start_index = 0 for ep in range(total_epoch): # print("iters: %d" % (lp * 1000)) print("epoch %d" % ep) st_time = time.time() train_loss, train_acc, train_auc = [], [], [] for it in range(iterations // 10 + (ep % 10 == 9) * (iterations % 10)): my_feed_dict[dense_input][:] = dense_feature[ start_index:start_index + batch_size] my_feed_dict[sparse_input][:] = sparse_feature[ start_index:start_index + batch_size] my_feed_dict[y_][:] = labels[start_index:start_index + batch_size] start_index += batch_size if start_index + batch_size > dense_feature.shape[0]: start_index = 0 loss_val = sess.run([loss, y, y_, train_op], feed_dict=my_feed_dict) pred_val = loss_val[1] true_val = loss_val[2] acc_val = np.equal(true_val, pred_val > 0.5) train_loss.append(loss_val[0]) train_acc.append(acc_val) train_auc.append(metrics.roc_auc_score(true_val, pred_val)) tra_accuracy = np.mean(train_acc) tra_loss = np.mean(train_loss) tra_auc = np.mean(train_auc) en_time = time.time() train_time = en_time - st_time if args.val: val_loss, val_acc, val_auc = [], [], [] for it in range(val_dense.shape[0] 
// batch_size): local_st = it * batch_size my_feed_dict[dense_input][:] = val_dense[ local_st:local_st + batch_size] my_feed_dict[sparse_input][:] = val_sparse[ local_st:local_st + batch_size] my_feed_dict[y_][:] = val_labels[local_st:local_st + batch_size] loss_val = sess.run([loss, y, y_], feed_dict=my_feed_dict) pred_val = loss_val[1] true_val = loss_val[2] acc_val = np.equal(true_val, pred_val > 0.5) val_loss.append(loss_val[0]) val_acc.append(acc_val) val_auc.append(metrics.roc_auc_score(true_val, pred_val)) v_accuracy = np.mean(val_acc) v_loss = np.mean(val_loss) v_auc = np.mean(val_auc) printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, train_time: %.4f"\ % (tra_loss, tra_accuracy, tra_auc, v_loss, v_accuracy, v_auc, train_time) else: printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ % (tra_loss, tra_accuracy, tra_auc, train_time) print(printstr) log_file.write(printstr + '\n') log_file.flush() else: # here no val iteration = dense_feature.shape[0] // batch_size epoch = 10 for ep in range(epoch): print('epoch', ep) if ep == 5: start = time.time() ep_st = time.time() train_loss = [] train_acc = [] for idx in range(iteration): start_index = idx * batch_size my_feed_dict[dense_input][:] = dense_feature[ start_index:start_index + batch_size] my_feed_dict[sparse_input][:] = sparse_feature[ start_index:start_index + batch_size] my_feed_dict[y_][:] = labels[start_index:start_index + batch_size] loss_val = sess.run([loss, y, y_, train_op], feed_dict=my_feed_dict) pred_val = loss_val[1] true_val = loss_val[2] if pred_val.shape[1] == 1: # for criteo case acc_val = np.equal(true_val, pred_val > 0.5) else: acc_val = np.equal(np.argmax(pred_val, 1), np.argmax(true_val, 1)).astype(np.float) train_loss.append(loss_val[0]) train_acc.append(acc_val) tra_accuracy = np.mean(train_acc) tra_loss = np.mean(train_loss) ep_en = time.time() print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" % (tra_loss, tra_accuracy, ep_en - ep_st)) print("tensorflow: ", (time.time() - start))
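Stripped of the training loop, the placement logic in train_criteo amounts to the short sketch below (illustrative names): the shard count is taken from the number of ps tasks in the cluster, and replica_device_setter then places one slice per server.

import tensorflow as tf

def make_embed_partitioner(cluster):
    # One shard per parameter server; None when a single ps makes sharding moot.
    server_num = len(cluster.as_dict()["ps"])
    return tf.fixed_size_partitioner(server_num) if server_num > 1 else None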
def build_model(self): self.X = tf.placeholder(tf.int64, [None, None], name='input') # batch * sequence length self.Y = tf.placeholder(tf.int64, [None, None], name='output') # batch * sequence length self.seq_len = tf.placeholder(tf.int64, [None], name="seq_len") self.drop_out = tf.placeholder(tf.float32, name="drop_out") self.global_step = tf.Variable(0, name='global_step', trainable=False) self.batch_size = tf.shape(self.X)[0] with tf.variable_scope('gru_layer'): sigma = self.sigma if self.sigma != 0 else np.sqrt(6.0 / (self.n_items + self.rnn_size)) if self.init_as_normal: initializer = tf.random_normal_initializer(mean=0, stddev=sigma) else: initializer = tf.random_uniform_initializer(minval=-sigma, maxval=sigma) partitioner = tf.fixed_size_partitioner(num_shards=self.rnn_size) embedding = tf.get_variable('embedding', [self.n_items, self.rnn_size], tf.float32, initializer=initializer, partitioner=partitioner) softmax_W = tf.get_variable('softmax_w', [self.n_items, self.rnn_size], tf.float32, initializer=initializer, partitioner=partitioner) softmax_b = tf.get_variable('softmax_b', [self.n_items], tf.float32, initializer=tf.constant_initializer(0.0), partitioner=partitioner) cell = tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell(self.rnn_size) initial_state = cell.zero_state(self.batch_size, dtype=tf.float32) drop_cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=self.drop_out) inputs = tf.nn.embedding_lookup(embedding, self.X) # batch * sequence length * rnn_size output, state = tf.nn.dynamic_rnn(drop_cell, inputs, initial_state=initial_state, dtype=tf.float32, sequence_length=self.seq_len) self.output = output # batch * sequence length * rnn_size self.state = state # batch * rnn_size (state at the last time step) ''' Training: use other examples of the minibatch as negative samples. ''' self.sampled_W = tf.nn.embedding_lookup(softmax_W, self.Y, name='sampled_W') # batch * sequence length * rnn_size self.sampled_b = tf.nn.embedding_lookup(softmax_b, self.Y, name='sampled_b') # batch * sequence length # logits_train shape: batch * sequence length * batch self.logits_train = tf.transpose(tf.matmul(tf.transpose(output, [1, 0, 2]), tf.transpose(self.sampled_W, [1, 2, 0])), [1, 0, 2]) + tf.expand_dims(self.sampled_b, -1) self.yhat_train = self.final_activation(self.logits_train, "yhat_train") # batch * sequence length * batch self.cost_train = self.loss_function(self.yhat_train) ''' Prediction ''' output_shape = tf.shape(output) # output shape: batch * sequence length * rnn_size softmax_WT = tf.transpose(softmax_W) # rnn_size * n_items swt_shape = tf.shape(softmax_WT) re_softmax = tf.reshape(tf.tile(softmax_WT, [output_shape[0], 1]), [output_shape[0], swt_shape[0], swt_shape[1]]) self.logits_predict = tf.matmul(output, re_softmax) + softmax_b # batch * sequence length * n_items self.yhat_predict = self.final_activation(self.logits_predict, "yhat_predict") ''' Learning rate ''' self.lr = tf.maximum(1e-5, tf.train.exponential_decay(self.learning_rate, self.global_step, self.decay_steps, self.decay_rate, staircase=True)) ''' Try different optimizers. ''' # optimizer = tf.train.AdagradOptimizer(self.lr) optimizer = tf.train.AdamOptimizer(self.lr) # optimizer = tf.train.AdadeltaOptimizer(self.lr) # optimizer = tf.train.RMSPropOptimizer(self.lr) tvars = tf.trainable_variables() gvs = optimizer.compute_gradients(self.cost_train, tvars) if self.grad_cap > 0: capped_gvs = [(tf.clip_by_norm(grad, self.grad_cap), var) for (grad, var) in gvs] else: capped_gvs = gvs self.train_op = optimizer.apply_gradients(capped_gvs, global_step=self.global_step)
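Note that tf.nn.embedding_lookup accepts a PartitionedVariable like the tables above directly and stitches the sharded rows back together, so lookups need no special handling; a minimal sketch with illustrative shapes:

import tensorflow as tf

ids = tf.constant([[1, 2], [3, 4]], dtype=tf.int64)
with tf.variable_scope("emb", partitioner=tf.fixed_size_partitioner(2)):
    table = tf.get_variable("table", [100, 16], dtype=tf.float32)
vectors = tf.nn.embedding_lookup(table, ids)  # shape [2, 2, 16]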
def _make_model(target_words, context_words, mode): index_tensor = tf.constant(index) reverse_index = tf.contrib.lookup.HashTable( tf.contrib.lookup.KeyValueTensorInitializer( index_tensor, tf.constant(range(vocab_size - 1), dtype=tf.int64) ), vocab_size - 1 ) # tf.contrib.learn.Estimator.fit adds an additional dimension to input target_words_squeezed = tf.squeeze(target_words, squeeze_dims=[1]) target_indices = reverse_index.lookup(target_words_squeezed) with tf.device(tf.train.replica_device_setter()): with tf.variable_scope('nce', partitioner=tf.fixed_size_partitioner( num_partitions)): embeddings = tf.get_variable( 'embeddings', shape=[vocab_size, embedding_size], dtype=tf.float32, initializer=tf.random_uniform_initializer(-1.0, 1.0) ) if mode in [ModeKeys.TRAIN, ModeKeys.EVAL]: nce_weights = tf.get_variable( 'nce_weights', shape=[vocab_size, embedding_size], dtype=tf.float32, initializer=tf.truncated_normal_initializer( stddev=1.0 / math.sqrt(embedding_size) ) ) nce_biases = tf.get_variable( 'nce_biases', initializer=tf.zeros_initializer([vocab_size]), dtype=tf.float32 ) prediction_dict, loss, train_op = ({}, None, None) if mode in [ModeKeys.TRAIN, ModeKeys.EVAL]: context_indices = tf.expand_dims( reverse_index.lookup(context_words), 1) embedded = tf.nn.embedding_lookup(embeddings, target_indices) sampled_words = tf.nn.fixed_unigram_candidate_sampler( true_classes=context_indices, num_true=1, num_sampled=num_sampled, unique=True, range_max=vocab_size, distortion=0.75, unigrams=vocab_counts + [1] ) loss = tf.reduce_mean(tf.nn.nce_loss( nce_weights, nce_biases, embedded, context_indices, num_sampled, vocab_size, sampled_values=sampled_words )) tf.scalar_summary('loss', loss) if mode == ModeKeys.TRAIN: train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize( loss, global_step=tf.contrib.framework.get_global_step() ) if mode in [ModeKeys.EVAL, ModeKeys.INFER]: # Compute the cosine similarity between examples and embeddings. norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)) normalized_embeddings = embeddings / norm valid_embeddings = tf.nn.embedding_lookup( normalized_embeddings, target_indices) similarity = tf.matmul( valid_embeddings, normalized_embeddings, transpose_b=True) prediction_dict['values'], predictions = tf.nn.top_k( similarity, sorted=True, k=num_sim) index_tensor = tf.concat(0, [index_tensor, tf.constant(['UNK'])]) prediction_dict['predictions'] = tf.gather(index_tensor, predictions) return prediction_dict, loss, train_op
def _model_fn(inputs, context_words, mode): target_words = inputs['targets'] index_tensor = inputs['index'] reverse_index = tf.contrib.lookup.HashTable( tf.contrib.lookup.KeyValueTensorInitializer( index_tensor, tf.constant(range(1, args.vocab_size), dtype=tf.int64) ), 0 ) # tf.contrib.learn.Estimator.fit adds an additional dimension to input target_words_squeezed = tf.squeeze(target_words, squeeze_dims=[1]) target_indices = reverse_index.lookup(target_words_squeezed) with tf.device(tf.train.replica_device_setter()): with tf.variable_scope('nce', partitioner=tf.fixed_size_partitioner( args.num_partitions)): embeddings = tf.get_variable( 'embeddings', shape=[args.vocab_size, args.embedding_size], dtype=tf.float32, initializer=tf.random_uniform_initializer(-1.0, 1.0) ) if mode in [ModeKeys.TRAIN, ModeKeys.EVAL]: nce_weights = tf.get_variable( 'nce_weights', shape=[args.vocab_size, args.embedding_size], dtype=tf.float32, initializer=tf.truncated_normal_initializer( stddev=1.0 / math.sqrt(args.embedding_size) ) ) nce_biases = tf.get_variable( 'nce_biases', initializer=tf.zeros_initializer([args.vocab_size]), dtype=tf.float32 ) tensors, loss, train_op = ({}, None, None) if mode in [ModeKeys.TRAIN, ModeKeys.EVAL]: context_indices = tf.expand_dims( reverse_index.lookup(context_words), 1) # TODOs below filled in to mirror the completed variants of this model_fn; args.num_sampled and args.learning_rate are assumed hyperparameters embedded = tf.nn.embedding_lookup(embeddings, target_indices) loss = tf.reduce_mean(tf.nn.nce_loss( nce_weights, nce_biases, embedded, context_indices, args.num_sampled, args.vocab_size )) tf.scalar_summary('loss', loss) tf.scalar_summary('training/hptuning/metric', loss) if mode == ModeKeys.TRAIN: train_op = tf.train.GradientDescentOptimizer(args.learning_rate).minimize( loss, global_step=tf.contrib.framework.get_global_step() ) if mode == ModeKeys.INFER: # Compute the cosine similarity between examples and embeddings. norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)) normalized_embeddings = embeddings / norm valid_embeddings = tf.nn.embedding_lookup( normalized_embeddings, target_indices) similarity = tf.matmul( valid_embeddings, normalized_embeddings, transpose_b=True) tensors['values'], predictions = tf.nn.top_k( similarity, sorted=True, k=args.num_sim) index_tensor = tf.concat(0, [tf.constant(['UNK']), index_tensor]) tensors['predictions'] = tf.gather(index_tensor, predictions) return tensors, loss, train_op
def create_emb_for_encoder_and_decoder(share_vocab, src_vocab_size, tgt_vocab_size, src_embed_size, tgt_embed_size, word_embed, dtype=tf.float32, num_partitions=0, scope=None): """Create embedding matrix for both encoder and decoder. Args: share_vocab: A boolean. Whether to share embedding matrix for both encoder and decoder. src_vocab_size: An integer. The source vocab size. tgt_vocab_size: An integer. The target vocab size. src_embed_size: An integer. The embedding dimension for the encoder's embedding. tgt_embed_size: An integer. The embedding dimension for the decoder's embedding. word_embed: A string. Path prefix of a pretrained embedding pickle ("word2vec" or "glove"), or "None" for a default one-hot based embedding. dtype: dtype of the embedding matrix. Default to float32. num_partitions: number of partitions used for the embedding vars. scope: VariableScope for the created subgraph. Default to "embedding". Returns: embedding_encoder: Encoder's embedding matrix. embedding_decoder: Decoder's embedding matrix. Raises: ValueError: if use share_vocab but source and target have different vocab size. """ if num_partitions <= 1: partitioner = None else: # Note: num_partitions > 1 is required for distributed training due to # embedding_lookup tries to colocate single partition-ed embedding variable # with lookup ops. This may cause embedding variables being placed on worker # jobs. partitioner = tf.fixed_size_partitioner(num_partitions) with tf.variable_scope(scope or "embeddings", dtype=dtype, partitioner=partitioner) as scope: # Share embedding if share_vocab: if src_vocab_size != tgt_vocab_size: raise ValueError( "Share embedding but different src/tgt vocab sizes" " %d vs. %d" % (src_vocab_size, tgt_vocab_size)) utils.print_out("# Use the same embedding for source and target") if word_embed == "None": utils.print_out( "Using default word embedding. (one-hot based)", new_line=True) embedding = tf.get_variable("embedding_share", [src_vocab_size, src_embed_size], dtype) elif "word2vec" in word_embed or "glove" in word_embed: utils.print_out("Loading word embedding: %s" % word_embed, new_line=True) word2vec_emd = np.load(word_embed + ".pickle") embedding = tf.get_variable( name="embedding_share", shape=[src_vocab_size, src_embed_size], dtype=dtype, initializer=tf.constant_initializer(word2vec_emd), trainable=True) else: embedding = None embedding_encoder = embedding embedding_decoder = embedding else: with tf.variable_scope("encoder", partitioner=partitioner): embedding_encoder = tf.get_variable( "embedding_encoder", [src_vocab_size, src_embed_size], dtype) with tf.variable_scope("decoder", partitioner=partitioner): embedding_decoder = tf.get_variable( "embedding_decoder", [tgt_vocab_size, tgt_embed_size], dtype) return embedding_encoder, embedding_decoder
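The colocation note in the snippet above is the usual motivation for sharding these tables; a hedged sketch of the combination it implies (sizes are illustrative): slices go to ps tasks while the lookup stays on the worker.

import tensorflow as tf

def embeddings_on_ps(num_ps=3, vocab_size=10000, embed_size=128):
    # Variables (and their slices) are assigned to ps tasks by the setter.
    with tf.device(tf.train.replica_device_setter(ps_tasks=num_ps)):
        with tf.variable_scope(
                "embeddings", partitioner=tf.fixed_size_partitioner(num_ps)):
            return tf.get_variable("embedding", [vocab_size, embed_size])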
def train(): ps_hosts = FLAGS.ps_hosts.split(',') worker_hosts = FLAGS.worker_hosts.split(',') print('PS hosts are: %s' % ps_hosts) print('Worker hosts are: %s' % worker_hosts) configP = tf.ConfigProto() server = tf.train.Server({ 'ps': ps_hosts, 'worker': worker_hosts }, job_name=FLAGS.job_name, task_index=FLAGS.task_id, config=configP) batchSizeManager = BatchSizeManager(FLAGS.batch_size, len(worker_hosts)) if FLAGS.job_name == 'ps': rpcServer = batchSizeManager.create_rpc_server( ps_hosts[0].split(':')[0]) rpcServer.serve() server.join() rpcClient = batchSizeManager.create_rpc_client(ps_hosts[0].split(':')[0]) is_chief = (FLAGS.task_id == 0) if is_chief: if tf.gfile.Exists(FLAGS.train_dir): tf.gfile.DeleteRecursively(FLAGS.train_dir) tf.gfile.MakeDirs(FLAGS.train_dir) device_setter = tf.train.replica_device_setter(ps_tasks=len(ps_hosts)) with tf.device('/job:worker/task:%d' % FLAGS.task_id): with tf.device(device_setter): global_step = tf.Variable(0, trainable=False) decay_steps = 50000 * 350.0 / FLAGS.batch_size batch_size = tf.placeholder(dtype=tf.int32, shape=(), name='batch_size') images, labels = cifar10.distorted_inputs(batch_size) re = tf.shape(images)[0] with tf.variable_scope('root', partitioner=tf.fixed_size_partitioner( len(ps_hosts), axis=0)): network = resnet_model.cifar10_resnet_v2_generator( FLAGS.resnet_size, _NUM_CLASSES) inputs = tf.reshape(images, [-1, _HEIGHT, _WIDTH, _DEPTH]) # labels = tf.reshape(labels, [-1, _NUM_CLASSES]) print(labels.get_shape()) labels = tf.one_hot(labels, 10, 1, 0) print(labels.get_shape()) logits = network(inputs, True) print(logits.get_shape()) cross_entropy = tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=labels) # logits = cifar10.inference(images, batch_size) # loss = cifar10.loss(logits, labels, batch_size) loss = cross_entropy + _WEIGHT_DECAY * tf.add_n( [tf.nn.l2_loss(v) for v in tf.trainable_variables()]) # Decay the learning rate exponentially based on the number of steps. lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE, global_step, decay_steps, LEARNING_RATE_DECAY_FACTOR, staircase=True) opt = tf.train.GradientDescentOptimizer(lr) # Track the moving averages of all trainable variables. 
exp_moving_averager = tf.train.ExponentialMovingAverage( MOVING_AVERAGE_DECAY, global_step) variables_to_average = (tf.trainable_variables() + tf.moving_average_variables()) opt = tf.train.SyncReplicasOptimizer( opt, replicas_to_aggregate=len(worker_hosts), # replica_id=FLAGS.task_id, total_num_replicas=len(worker_hosts), variable_averages=exp_moving_averager, variables_to_average=variables_to_average) grads0 = opt.compute_gradients(loss) grads = [(tf.scalar_mul( tf.cast(batch_size / FLAGS.batch_size, tf.float32), grad), var) for grad, var in grads0] apply_gradients_op = opt.apply_gradients(grads, global_step=global_step) with tf.control_dependencies([apply_gradients_op]): train_op = tf.identity(loss, name='train_op') chief_queue_runners = [opt.get_chief_queue_runner()] init_tokens_op = opt.get_init_tokens_op() # saver = tf.train.Saver() sv = tf.train.Supervisor( is_chief=is_chief, logdir=FLAGS.train_dir, init_op=tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()), summary_op=None, global_step=global_step, # saver=saver, saver=None, recovery_wait_secs=1, save_model_secs=60) tf.logging.info('%s Supervisor' % datetime.now()) sess_config = tf.ConfigProto( allow_soft_placement=True, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1, log_device_placement=FLAGS.log_device_placement) sess_config.gpu_options.allow_growth = True # Get a session. sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) # sess.run(tf.global_variables_initializer()) # Start the queue runners. queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS) sv.start_queue_runners(sess, queue_runners) sv.start_queue_runners(sess, chief_queue_runners) sess.run(init_tokens_op) """Train CIFAR-10 for a number of steps.""" time0 = time.time() batch_size_num = FLAGS.batch_size for step in range(FLAGS.max_steps): start_time = time.time() run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() # batch_size_num = updated_batch_size_num if step <= 5: batch_size_num = FLAGS.batch_size if step >= 0: batch_size_num = int(step / 5) % 512 + 1 batch_size_num = 128 num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size_num decay_steps_num = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) # mgrads, images_, train_val, real, loss_value, gs = sess.run([grads, images, train_op, re, loss, global_step], feed_dict={batch_size: batch_size_num}, options=run_options, run_metadata=run_metadata) _, loss_value, gs = sess.run( [train_op, loss, global_step], feed_dict={batch_size: batch_size_num}, options=run_options, run_metadata=run_metadata) # _, loss_value, gs = sess.run([train_op, loss, global_step], feed_dict={batch_size: batch_size_num}) b = time.time() # tl = timeline.Timeline(run_metadata.step_stats) # last_batch_time = tl.get_local_step_duration('sync_token_q_Dequeue') # thread = threading2.Thread(target=get_computation_time, name="get_computation_time",args=(run_metadata.step_stats,step,)) # thread.start() c0 = time.time() # batch_size_num = rpcClient.update_batch_size(FLAGS.task_id, last_batch_time, available_cpu, available_memory, step, batch_size_num) batch_size_num = rpcClient.update_batch_size( FLAGS.task_id, 0, 0, 0, step, batch_size_num) if step % 1 == 0: duration = time.time() - start_time num_examples_per_step = batch_size_num examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) c = time.time() ## tf.logging.info("time statistics - batch_process_time: " + str( last_batch_time) + " - 
train_time: " + str(b-start_time) + " - get_batch_time: " + str(c0-b) + " - get_bs_time: " + str(c-c0) + " - accum_time: " + str(c-time0)) format_str = ( "time: " + str(time.time()) + '; %s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f sec/batch) - batch_size: ' + str(batch_size_num)) tf.logging.info(format_str % (datetime.now(), step, gs, loss_value, examples_per_sec, sec_per_batch))
def create_model(self, filename_queue): optimizer = FLAGS.optimizer hash_bucket_size = FLAGS.hash_bucket_size nloop = FLAGS.nloop with_dependency = FLAGS.with_dependency poisson = FLAGS.poisson if FLAGS.is_validation: poisson = 0.0 ps_count = self.get_cluster_ps_count() with tf.variable_scope("ads_lr_input"): reader = tf.TFRecordReader() with tf.variable_scope("ads_lr_model"): hash_table = tf.get_variable("HashTable", [hash_bucket_size, 3], initializer=tf.zeros_initializer, dtype=tf.int32, trainable=False, partitioner=tf.fixed_size_partitioner(ps_count)) W_weights = tf.get_variable("W_weights", [hash_bucket_size], initializer=tf.zeros_initializer, dtype=tf.float32, partitioner=tf.fixed_size_partitioner(ps_count)) global_step = tf.Variable(0, name="global_step", trainable=False) # --------------------------------- debug hash_table_list = list(hash_table) for v in hash_table_list: print("name:", v.name, "device:", v.device, "shape:", v.get_shape()) # --------------------------------- debug if optimizer == "AdaGrad": optimizer_op = tf.train.AdagradOptimizer(FLAGS.learning_rate) elif optimizer == "FTRL": optimizer_op = tf.train.FtrlOptimizer(learning_rate=FLAGS.learning_rate, l1_regularization_strength=FLAGS.l1, l2_regularization_strength=FLAGS.l2) else: raise ValueError("Unrecognized optimizer type: " + optimizer) # patch optimizer _apply_sparse_duplicate_indices if FLAGS.disable_sparse_grad_unique: optimizer_op._apply_sparse_duplicate_indices = optimizer_op._apply_sparse def one_mini_batch(batch_index, dependencies): # read one minibatch data with ops.control_dependencies(dependencies): _, batch_example = reader.read_up_to(filename_queue, FLAGS.batch_size) sample_labels, sample_weights, sample_guids, feature_indices, feature_values, feature_shape = raw_key_ops.parse_lr_samples(batch_example) sample_labels = tf.reshape(sample_labels, [-1, 1]) sample_weights = tf.reshape(sample_weights, [-1, 1]) sample_guids = tf.reshape(sample_guids, [-1, 1]) with tf.device("/cpu:0"): before_sigmoid = hash_embedding_ops.hash_embedding_lookup_sparse(hash_table, W_weights, feature_indices, feature_values, feature_shape, poisson=poisson) before_sigmoid = tf.reshape(before_sigmoid, [-1, 1]) pred = tf.nn.sigmoid(before_sigmoid) unweighted_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=before_sigmoid, labels=sample_labels) final_loss = tf.multiply(sample_weights, unweighted_loss) if FLAGS.use_reduce_sum: loss_fn = tf.reduce_sum(final_loss) else: loss_fn = tf.reduce_mean(final_loss) train_op = optimizer_op.minimize(loss_fn, global_step=global_step) if with_dependency: return [loss_fn], train_op, pred, loss_fn, sample_labels, sample_weights, sample_guids else: return [], train_op, pred, loss_fn, sample_labels, sample_weights, sample_guids dependencies = [] train_op_array = [] predictions = [] loss_fns = [] sample_labels_array = [] sample_weights_array = [] sample_guids_array = [] for i in range(nloop): dependency, train_op, prediction, loss_fn, sample_labels, sample_weights, sample_guids = one_mini_batch(i, None if i == 0 or not with_dependency else dependencies[-1]) dependencies.append(dependency) train_op_array.append(train_op) predictions.append(prediction) loss_fns.append(loss_fn) sample_labels_array.append(sample_labels) sample_weights_array.append(sample_weights) sample_guids_array.append(sample_guids) train_ops = tf.group(*train_op_array) weight_save_ops, weight_restore_ops = util.save_model_for_raw_key( FLAGS.init_model_dir, FLAGS.model_dir, "lr_weights", hash_table, W_weights, optimizer_op, 
FLAGS.output_optimizer_slots) # define custom saver for raw key self._save_op = tf.tuple(weight_save_ops) self._restore_op = tf.tuple(weight_restore_ops) if weight_restore_ops else [] self._labels = sample_labels_array self._weights = sample_weights_array self._train_op = train_ops self._predictions = predictions self._loss_fn = loss_fns self._global_step = global_step self._guids = sample_guids_array
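The debug loop in create_model relies on a PartitionedVariable being iterable; the same inspection in isolation (illustrative shapes):

import tensorflow as tf

with tf.variable_scope("dbg", partitioner=tf.fixed_size_partitioner(3)):
    w = tf.get_variable("w", [9, 2])
for part in w:  # yields dbg/w/part_0 .. dbg/w/part_2, each of shape [3, 2]
    print("name:", part.op.name, "device:", part.device,
          "shape:", part.get_shape())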
def train(): """Train Inception on a dataset for a number of steps.""" ps_hosts = FLAGS.ps_hosts.split(',') worker_hosts = FLAGS.worker_hosts.split(',') tf.logging.info('PS hosts are: %s' % ps_hosts) tf.logging.info('Worker hosts are: %s' % worker_hosts) cluster_spec = tf.train.ClusterSpec({ 'ps': ps_hosts, 'worker': worker_hosts }) server = tf.train.Server({ 'ps': ps_hosts, 'worker': worker_hosts }, job_name=FLAGS.job_name, task_index=FLAGS.task_id, protocol=FLAGS.protocol) batchSizeManager = BatchSizeManager(FLAGS.batch_size, len(worker_hosts)) if FLAGS.job_name == 'ps': if FLAGS.task_id == 0: rpcServer = batchSizeManager.create_rpc_server( ps_hosts[0].split(':')[0]) rpcServer.serve() server.join() dataset = ImagenetData(subset=FLAGS.subset) rpcClient = batchSizeManager.create_rpc_client(ps_hosts[0].split(':')[0]) assert dataset.data_files() # Only the chief checks for or creates train_dir. if FLAGS.task_id == 0: if not tf.gfile.Exists(FLAGS.train_dir): tf.gfile.MakeDirs(FLAGS.train_dir) num_workers = len(cluster_spec.as_dict()['worker']) num_parameter_servers = len(cluster_spec.as_dict()['ps']) if FLAGS.num_replicas_to_aggregate == -1: num_replicas_to_aggregate = num_workers else: num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate # Both should be greater than 0 in a distributed training. assert num_workers > 0 and num_parameter_servers > 0, ( ' num_workers and ' 'num_parameter_servers' ' must be > 0.') # Choose worker 0 as the chief. Note that any worker could be the chief # but there should be only one chief. is_chief = (FLAGS.task_id == 0) #batchSizeManager = BatchSizeManager(32, 4) # Ops are assigned to worker by default. tf.logging.info('cccc-num_parameter_servers:' + str(num_parameter_servers)) partitioner = tf.fixed_size_partitioner(num_parameter_servers, 0) device_setter = tf.train.replica_device_setter( ps_tasks=num_parameter_servers) slim = tf.contrib.slim with tf.device('/job:worker/task:%d' % FLAGS.task_id): with tf.variable_scope('root', partitioner=partitioner): # Variables and its related init/assign ops are assigned to ps. # with slim.arg_scope( # [slim.variables.variable, slim.variables.global_step], # device=slim.variables.VariableDeviceChooser(num_parameter_servers)): with tf.device(device_setter): # partitioner=partitioner): # Create a variable to count the number of train() calls. This equals the # number of updates applied to the variables. # global_step = slim.variables.global_step() global_step = tf.Variable(0, trainable=False) # Calculate the learning rate schedule. batch_size = tf.placeholder(dtype=tf.int32, shape=(), name='batch_size') num_batches_per_epoch = (dataset.num_examples_per_epoch() / FLAGS.batch_size) # Decay steps need to be divided by the number of replicas to aggregate. decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay / num_replicas_to_aggregate) # Decay the learning rate exponentially based on the number of steps. lr = tf.train.exponential_decay( FLAGS.initial_learning_rate, global_step, decay_steps, FLAGS.learning_rate_decay_factor, staircase=True) # Add a summary to track the learning rate. # tf.summary.scalar('learning_rate', lr) # Create an optimizer that performs gradient descent. images, labels = image_processing.distorted_inputs( dataset, batch_size, num_preprocess_threads=FLAGS.num_preprocess_threads) print(images.get_shape()) print(labels.get_shape()) # Number of classes in the Dataset label set plus 1. # Label 0 is reserved for an (unused) background class. 
# num_classes = dataset.num_classes() + 1 num_classes = dataset.num_classes() print(num_classes) # logits = inception.inference(images, num_classes, for_training=True) network_fn = nets_factory.get_network_fn( 'inception_v3', num_classes=num_classes) (logits, _) = network_fn(images) print(logits.get_shape()) # Add classification loss. # inception.loss(logits, labels, batch_size) # Gather all of the losses including regularization losses. labels = tf.one_hot(labels, 1000, 1, 0) cross_entropy = tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=labels) # losses = tf.get_collection(slim.losses.LOSSES_COLLECTION) # losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) total_loss = cross_entropy + _WEIGHT_DECAY * tf.add_n( [tf.nn.l2_loss(v) for v in tf.trainable_variables()]) # total_loss = tf.add_n(losses, name='total_loss') loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') loss_averages_op = loss_averages.apply(losses + [total_loss]) with tf.control_dependencies([loss_averages_op]): opt = tf.train.RMSPropOptimizer(lr, RMSPROP_DECAY, momentum=RMSPROP_MOMENTUM, epsilon=RMSPROP_EPSILON) grads0 = opt.compute_gradients(total_loss) grads = [(tf.scalar_mul( tf.cast(batch_size / FLAGS.batch_size, tf.float32), grad), var) for grad, var in grads0] total_loss = tf.identity(total_loss) exp_moving_averager = tf.train.ExponentialMovingAverage( MOVING_AVERAGE_DECAY, global_step) variables_averages_op = exp_moving_averager.apply( tf.trainable_variables()) apply_gradients_op = opt.apply_gradients( grads, global_step=global_step) with tf.control_dependencies( [apply_gradients_op, variables_averages_op]): train_op = tf.identity(total_loss, name='train_op') # Get chief queue_runners and init_tokens, which is used to synchronize # replicas. More details can be found in SyncReplicasOptimizer. # chief_queue_runners = [opt.get_chief_queue_runner()] # init_tokens_op = opt.get_init_tokens_op() # Create a saver. saver = tf.train.Saver() # Build the summary operation based on the TF collection of Summaries. # summary_op = tf.summary.merge_all() # Build an initialization operation to run below. init_op = tf.global_variables_initializer() # We run the summaries in the same thread as the training operations by # passing in None for summary_op to avoid a summary_thread being started. # Running summaries and training operations in parallel could run out of # GPU memory. sv = tf.train.Supervisor( is_chief=is_chief, logdir=FLAGS.train_dir, init_op=init_op, summary_op=None, global_step=global_step, recovery_wait_secs=1, saver=None, save_model_secs=FLAGS.save_interval_secs) tf.logging.info('%s Supervisor' % datetime.now()) sess_config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement) # Get a session. sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) # Start the queue runners. queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS) sv.start_queue_runners(sess, queue_runners) tf.logging.info('Started %d queues for processing input data.', len(queue_runners)) # if is_chief: # sv.start_queue_runners(sess, chief_queue_runners) # sess.run(init_tokens_op) # Train, checking for Nans. Concurrently run the summary operation at a # specified interval. Note that the summary_op and train_op never run # simultaneously in order to prevent running out of GPU memory. 
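The loss smoothing just above follows a standard moving-average pattern; a minimal sketch with a toy loss variable (names illustrative):

import tensorflow as tf

loss = tf.get_variable("loss_proxy", [], initializer=tf.ones_initializer())
loss_averages = tf.train.ExponentialMovingAverage(0.9, name="avg")
loss_averages_op = loss_averages.apply([loss])
# Make every step refresh the moving average before the loss is consumed.
with tf.control_dependencies([loss_averages_op]):
    step_loss = tf.identity(loss)
smoothed = loss_averages.average(loss)  # shadow variable, handy for summaries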
# next_summary_time = time.time() + FLAGS.save_summaries_secs step = 0 time0 = time.time() batch_size_num = FLAGS.batch_size while not sv.should_stop(): try: start_time = time.time() run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() my_images, loss_value, step = sess.run( [images, train_op, global_step], feed_dict={batch_size: batch_size_num}, options=run_options, run_metadata=run_metadata) b = time.time() # assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step > FLAGS.max_steps: break duration = time.time() - start_time c0 = time.time() # call rrsp mechanism to coordinate the synchronization order and update the batch size batch_size_num = rpcClient.update_batch_size( FLAGS.task_id, 0, 0, 0, step, batch_size_num) # ctf = tl.generate_chrome_trace_format() # with open("timeline.json", 'a') as f: # f.write(ctf) if step % 1 == 0: examples_per_sec = FLAGS.batch_size / float( duration) c = time.time() tf.logging.info("time statistics" + " - train_time: " + str(b - start_time) + " - get_batch_time: " + str(c0 - b) + " - get_bs_time: " + str(c - c0) + " - accum_time: " + str(c - time0) + " - batch_size: " + str(batch_size_num)) format_str = ( 'Worker %d: %s: step %d, loss = %.2f' '(%.1f examples/sec; %.3f sec/batch)') tf.logging.info( format_str % (FLAGS.task_id, datetime.now(), step, loss_value, examples_per_sec, duration)) # Determine if the summary_op should be run on the chief worker. # if is_chief and next_summary_time < time.time(): # tf.logging.info('Running Summary operation on the chief.') # summary_str = sess.run(summary_op) # sv.summary_computed(sess, summary_str) # tf.logging.info('Finished running Summary operation.') # Determine the next time for running the summary. # next_summary_time += FLAGS.save_summaries_secs except: if is_chief: tf.logging.info( 'Chief got exception while running!') raise # Stop the supervisor. This also waits for service threads to finish. sv.stop()
def get_all_embeddings(params, dtype=tf.float32, scope=None): if params["lang1_partitions"] <= 1: lang1_partitioner = None else: lang1_partitioner = tf.fixed_size_partitioner( params["lang1_partitions"]) if params["lang2_partitions"] <= 1: lang2_partitioner = None else: lang2_partitioner = tf.fixed_size_partitioner( params["lang2_partitions"]) encoder_embeddings = {} decoder_embeddings = {} lang1_emb_np, lang2_emb_np = None, None if params["lang1_embed_file"] and params["lang2_embed_file"]: lang1_emb_np = _create_pretrained_emb_from_txt( params["lang1_vocab_file"], params["lang1_embed_file"]) if params["lang1_embed_file"] == params["lang2_embed_file"]: lang2_emb_np = lang1_emb_np else: lang2_emb_np = _create_pretrained_emb_from_txt( params["lang2_vocab_file"], params["lang2_embed_file"]) if params["share_decpro_emb"]: if params["share_lang_emb"]: assert params["share_output_emb"] share_bias = tf.get_variable('share_projection/bias', [ params["lang1_vocab_size"], ], initializer=tf.zeros_initializer()) pro_embs = { params["lang1"]: share_bias, params["lang2"]: share_bias } else: pro_embs = { params["lang1"]: tf.get_variable('bias', [ params["lang1_vocab_size"], ], initializer=tf.zeros_initializer()), params["lang2"]: tf.get_variable('bias', [ params["lang2_vocab_size"], ], initializer=tf.zeros_initializer()) } else: if params["share_output_emb"]: assert params["share_lang_emb"] if params["pretrained_out"]: assert params["lang1_embed_file"] == params["lang2_embed_file"] misc_utils.print_out( "# Using pre-trained embedding to initialize shared projection kernel." ) share_proj_layer = tf.layers.Dense( params["lang1_vocab_size"], use_bias=True, kernel_initializer=tf.constant_initializer( lang1_emb_np.transpose()), name="share_projection") else: share_proj_layer = tf.layers.Dense(params["lang1_vocab_size"], use_bias=True, name="share_projection") pro_embs = { params["lang1"]: share_proj_layer, params["lang2"]: share_proj_layer } else: if params["pretrained_out"]: misc_utils.print_out( "# Using pre-trained embedding to initialize two projection kernels." ) pro_embs = { params["lang1"]: tf.layers.Dense(params["lang1_vocab_size"], use_bias=True, kernel_initializer=tf.constant_initializer( lang1_emb_np.transpose()), name="%s_projection" % params["lang1"]), params["lang2"]: tf.layers.Dense(params["lang2_vocab_size"], use_bias=True, kernel_initializer=tf.constant_initializer( lang2_emb_np.transpose()), name="%s_projection" % params["lang2"]) } else: pro_embs = { params["lang1"]: tf.layers.Dense(params["lang1_vocab_size"], use_bias=True, name="%s_projection" % params["lang1"]), params["lang2"]: tf.layers.Dense(params["lang2_vocab_size"], use_bias=True, name="%s_projection" % params["lang2"]) } with tf.variable_scope(scope or "all_embeddings", dtype=dtype) as scope: # encoder embeddings with tf.variable_scope("encoder", partitioner=lang1_partitioner): lang = "share" if params["share_lang_emb"] else params["lang1"] lang1_enc_embedding = _create_embed("%s_embedding" % lang, params["lang1_vocab_size"], params["hidden_size"], dtype, lang1_emb_np) if params["share_lang_emb"]: if params["lang1_vocab_size"] != params["lang2_vocab_size"]: raise ValueError( "Share embedding but different vocab sizes" " %d vs. 
%d" % (params["lang1_vocab_size"], params["lang2_vocab_size"])) assert params["lang1_vocab_size"] == params["lang2_vocab_size"] misc_utils.print_out( "# Use the same encoder embedding for both languages.") lang2_enc_embedding = lang1_enc_embedding else: with tf.variable_scope("encoder", partitioner=lang2_partitioner): lang2_enc_embedding = _create_embed( "%s_embedding" % params["lang2"], params["lang2_vocab_size"], params["hidden_size"], dtype, lang2_emb_np) encoder_embeddings[params["lang1"]] = lang1_enc_embedding encoder_embeddings[params["lang2"]] = lang2_enc_embedding # decoder embeddings if params["share_encdec_emb"]: misc_utils.print_out( "# Use the same embedding for encoder and decoder of each language." ) decoder_embeddings = encoder_embeddings else: with tf.variable_scope("decoder", partitioner=lang1_partitioner): lang = "share" if params["share_lang_emb"] else params["lang1"] lang1_dec_embedding = _create_embed("%s_embedding" % lang, params["lang1_vocab_size"], params["hidden_size"], dtype, lang1_emb_np) if params["share_lang_emb"]: misc_utils.print_out( "# Use the same decoder embedding for both languages.") lang2_dec_embedding = lang1_dec_embedding else: lang2_dec_embedding = _create_embed( "%s_embedding" % params["lang2"], params["lang2_vocab_size"], params["hidden_size"], dtype, lang2_emb_np) decoder_embeddings[params["lang1"]] = lang1_dec_embedding decoder_embeddings[params["lang2"]] = lang2_dec_embedding return encoder_embeddings, decoder_embeddings, pro_embs
def model_fn(features, labels, mode, params): init_learning_rate = params['learning_rate'] decay_steps = params['decay_steps'] decay_rate = params['decay_rate'] with tf.name_scope('user'): # shape: B (batch size) user_embedding = fc.input_layer(features, [user_id, age, gender]) with tf.name_scope('item'): item_buckets = 100 item_id = features['item_id'] item_id = tf.reshape(item_id, [-1, 1]) list_size = tf.shape(item_id)[0] item_id = tf.string_to_hash_bucket_fast(item_id, num_buckets=item_buckets) # if matrix is huge, it can be distributed # item_matrix = tf.get_variable(name='item_matrix', # shape=(100, 16), # initializer=tf.initializers.glorot_uniform()) if mode != tf.estimator.ModeKeys.PREDICT: ps_num = len(params['tf_config']['cluster']['ps']) item_matrix = tf.get_variable(name='item_matrix', shape=(100, 16), initializer=tf.initializers.glorot_uniform(), partitioner=tf.fixed_size_partitioner(num_shards=ps_num)) #1 else: item_matrix = tf.get_variable(name='item_matrix', shape=(100, 16), initializer=tf.initializers.glorot_uniform()) item_embedding = tf.nn.embedding_lookup(item_matrix, item_id, name='item_embedding') item_embedding = tf.squeeze(item_embedding, axis=1) with tf.name_scope('history'): # shape: B * T (sequence length) clicked_items = features['clicked_items_15d'] clicked_mask = tf.cast(tf.not_equal(clicked_items, '0'), tf.bool) clicked_items = tf.string_to_hash_bucket_fast(clicked_items, num_buckets=item_buckets) # shape: B * T * E clicked_embedding = tf.nn.embedding_lookup(item_matrix, clicked_items, name='clicked_embedding') if mode == tf.estimator.ModeKeys.PREDICT: user_embedding = tf.tile(user_embedding, [list_size, 1]) clicked_embedding = tf.tile(clicked_embedding, [list_size, 1, 1]) clicked_mask = tf.tile(clicked_mask, [list_size, 1]) # shape: B * E clicked_attention = attention(clicked_embedding, item_embedding, clicked_mask, [16, 8], 'clicked_attention') fc_inputs = tf.concat([user_embedding, item_embedding, clicked_attention], axis=-1, name='fc_inputs') with tf.name_scope('predictions'): logits = fc_layers(mode, net=fc_inputs, hidden_units=[64, 16, 1], dropout=0.3) predictions = tf.sigmoid(logits, name='predictions') if mode != tf.estimator.ModeKeys.PREDICT: labels = tf.reshape(labels, [-1, 1]) loss = tf.losses.sigmoid_cross_entropy(labels, logits) if mode == tf.estimator.ModeKeys.EVAL: metrics = { 'auc': tf.metrics.auc(labels=labels, predictions=predictions, num_thresholds=500) } for metric_name, op in metrics.items(): tf.summary.scalar(metric_name, op[1]) return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics) else: global_step = tf.train.get_global_step() learning_rate = exponential_decay(global_step, init_learning_rate, decay_steps, decay_rate) optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate) tf.summary.scalar('learning_rate', learning_rate) train_op = optimizer.minimize(loss=loss, global_step=global_step) return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op) else: predictions = { 'probability': tf.reshape(predictions, [1, -1]) } export_outputs = { 'predictions': tf.estimator.export.PredictOutput(predictions) } return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)
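A condensed sketch of the mode switch in model_fn above (ps_num is assumed to come from TF_CONFIG as in the snippet): sharding is applied only when parameter servers are present at training time. As far as TF 1.x checkpointing goes, a partitioned variable is generally stored under its base name, so an unpartitioned predict-time variable of the same name and shape can usually restore from the same checkpoint.

import tensorflow as tf

def get_item_matrix(mode, ps_num):
    use_sharding = mode != tf.estimator.ModeKeys.PREDICT and ps_num > 1
    partitioner = (tf.fixed_size_partitioner(num_shards=ps_num)
                   if use_sharding else None)
    return tf.get_variable(name="item_matrix", shape=(100, 16),
                           initializer=tf.initializers.glorot_uniform(),
                           partitioner=partitioner)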
def _model_fn(inputs, context_indices, mode): if mode == ModeKeys.INFER: sparse_index_tensor = tf.string_split( [tf.read_file(vocab_file)], delimiter='\n' ) index_tensor = tf.squeeze(tf.sparse_to_dense( sparse_index_tensor.indices, [1, vocab_size], sparse_index_tensor.values, default_value='UNK' )) reverse_index = tf.contrib.lookup.HashTable( tf.contrib.lookup.KeyValueTensorInitializer( index_tensor, tf.constant(range(vocab_size), dtype=tf.int64) ), 0 ) target_indices = reverse_index.lookup(inputs) else: target_indices = inputs with tf.device(tf.train.replica_device_setter()): with tf.variable_scope('nce', partitioner=tf.fixed_size_partitioner( num_partitions)): embeddings = tf.get_variable( 'embeddings', shape=[vocab_size, embedding_size], dtype=tf.float32, initializer=tf.random_uniform_initializer(-1.0, 1.0) ) if mode in [ModeKeys.TRAIN, ModeKeys.EVAL]: nce_weights = tf.get_variable( 'nce_weights', shape=[vocab_size, embedding_size], dtype=tf.float32, initializer=tf.truncated_normal_initializer( stddev=1.0 / math.sqrt(embedding_size) ) ) nce_biases = tf.get_variable( 'nce_biases', initializer=tf.zeros_initializer([vocab_size]), dtype=tf.float32 ) tensors, loss, train_op = ({}, None, None) if mode in [ModeKeys.TRAIN, ModeKeys.EVAL]: embedded = tf.nn.embedding_lookup(embeddings, target_indices) loss = tf.reduce_mean(tf.nn.nce_loss( nce_weights, nce_biases, embedded, context_indices, num_sampled, vocab_size )) tf.summary.scalar('loss', loss) tf.summary.scalar('training/hptuning/metric', loss) # Embedding Visualizer embedding_writer = tf.summary.FileWriter(output_path) config = projector.ProjectorConfig() embedding = config.embeddings.add() embedding.tensor_name = embeddings.name embedding.metadata_path = vocab_file projector.visualize_embeddings(embedding_writer, config) if mode == ModeKeys.TRAIN: train_op = tf.train.GradientDescentOptimizer( learning_rate ).minimize( loss, global_step=tf.contrib.framework.get_or_create_global_step() ) if mode == ModeKeys.INFER: # Compute the cosine similarity between examples and embeddings. norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)) normalized_embeddings = embeddings / norm valid_embeddings = tf.nn.embedding_lookup( normalized_embeddings, tf.squeeze(target_indices)) similarity = tf.matmul( valid_embeddings, normalized_embeddings, transpose_b=True) tensors['values'], predictions = tf.nn.top_k( similarity, sorted=True, k=num_sim) index_tensor = tf.concat(0, [tf.constant(['UNK']), index_tensor]) tensors['predictions'] = tf.gather(index_tensor, predictions) return tensors, loss, train_op
def build_inference(self, x, flag="train"): # 设置regularizer,本别对应网络的四个部分 regularizer1 = self.param_dict[ "regulerizer1"] if flag == "train" else None regularizer2 = self.param_dict[ "regulerizer2"] if flag == "train" else None regularizer3 = self.param_dict[ "regulerizer3"] if flag == "train" else None regularizer4 = self.param_dict[ "regulerizer4"] if flag == "train" else None is_train = True if flag == "train" else False # 先获取需要的参数 hash_size = self.param_dict['hash_size'] no_hash = self.param_dict["no_hash"] embed_size = self.param_dict["embed_size"] # browse_nums = self.param_dict["browse_nums"] # browse_nums = [20, 10, 10] # 根据配置获取激活函数 act_fn = self.get_activation_func(is_train) # 是否启用mini-batch aware regularization is_mba_reg = self.param_dict["is_mba_reg"] lambda_reg_mba = self.param_dict["lambda_reg_mba"] is_action_mba_reg = self.param_dict["is_action_mba_reg"] # 将输入划分 x_feature = x[:, :-3] x_action_lists = x[:, -3:] # 先将稀疏特征转换成indice x_sparse = [] for i in range(len(hash_size)): if i in no_hash: # 这部分特征本身可以直接作为indice,不需要转化 x_i = tf.string_to_number(x_feature[:, i], tf.int32) x_sparse.append(x_i) else: # 这部分特征可以通过哈希函数来转化成index x_i = tf.string_to_hash_bucket_strong( input=x_feature[:, i], num_buckets=hash_size[i], key=[679362, 964545], name="sparse_feature_{}".format(i)) x_sparse.append(x_i) # 将稀疏数据转换成embedding向量 x_embed = [] w_action_embed = [] x_action = [] indice_sku_cate_brand = [] sku_cate_brand_index = self.param_dict["sku_cate_brand_index"] for i in range(len(embed_size)): if i in sku_cate_brand_index: # skuid, cateid, brandid对应的embedding向量 with tf.variable_scope("embedding_{}".format(i)): weights = self.get_weight_variable( [hash_size[i], embed_size[i]], regularizer1, self.param_dict["initializer_embedding_w"]( [hash_size[i], embed_size[i]]), partitioner=tf.fixed_size_partitioner(10, 0)) w_action_embed.append(weights) x_i = tf.nn.embedding_lookup(weights, x_sparse[i]) if is_train and is_mba_reg and not is_action_mba_reg: # 计算mba self.calculate_mini_batch_aware_reg( weights, x_sparse[i], lambda_reg_mba) indice_sku_cate_brand.append(x_sparse[i]) x_embed.append(x_i) x_action.append(x_i) else: if embed_size[i] != -1: with tf.variable_scope("embedding_{}".format(i)): if i == 0: weights = self.get_weight_variable( [hash_size[i], embed_size[i]], regularizer1, self.param_dict["initializer_embedding_w"]( [hash_size[i], embed_size[i]]), partitioner=tf.fixed_size_partitioner(10, 0)) else: weights = self.get_weight_variable( [hash_size[i], embed_size[i]], regularizer1, self.param_dict["initializer_embedding_w"]( [hash_size[i], embed_size[i]])) x_i = tf.nn.embedding_lookup(weights, x_sparse[i]) if is_train and is_mba_reg: # 计算mba self.calculate_mini_batch_aware_reg( weights, x_sparse[i], lambda_reg_mba) x_embed.append(x_i) else: x_i = tf.one_hot(x_sparse[i], depth=hash_size[i]) x_embed.append(x_i) x_embed = tf.concat(x_embed, 1) x_deep_in = x_embed is_usingg_user_act_feature = self.param_dict[ "is_usingg_user_act_feature"] if is_usingg_user_act_feature: pooling_method = self.param_dict["pooling_method"] # 对浏览行为建模,构建行为embedding向量 with tf.name_scope("user_behaviours"): x_browse_skus_list = tf.reshape(x_action_lists[:, 0], [ -1, ]) x_browse_cates_list = tf.reshape(x_action_lists[:, 1], [ -1, ]) x_browse_brand_list = tf.reshape(x_action_lists[:, 2], [ -1, ]) browse_lists = [ x_browse_skus_list, x_browse_cates_list, x_browse_brand_list ] browse_names = ['skus', 'cates', 'brands'] x_action_list_embeds = [] for i in range(len(browse_names)): with tf.name_scope("user_browse_{}_embedding".format( 
browse_names[i])): browse_w_embed = w_action_embed[i] # x_ad_embedded = x_action[i] x_browse_action = browse_lists[ i] # shape of x_browse_action is [?,] x_browse_action_list = tf.string_split( x_browse_action, "#") x_browse_action_list_indices = tf.SparseTensor( x_browse_action_list.indices, tf.string_to_hash_bucket_strong( x_browse_action_list.values, num_buckets=browse_w_embed.get_shape() [0].value, key=[679362, 964545], name="sparse_user_browse_{}".format( browse_names[i])), x_browse_action_list.dense_shape, ) x_action_list_embed = tf.nn.embedding_lookup_sparse( browse_w_embed, sp_ids=x_browse_action_list_indices, sp_weights=None, combiner=pooling_method) if is_train and is_action_mba_reg: # Compute the mini-batch aware regularization term indice_action = tf.concat([ tf.string_to_hash_bucket_strong( x_browse_action_list.values, num_buckets=browse_w_embed.get_shape() [0].value, key=[679362, 964545]), indice_sku_cate_brand[i] ], 0) self.calculate_mini_batch_aware_reg( browse_w_embed, indice_action, lambda_reg_mba) x_action_list_embeds.append(x_action_list_embed) x_deep_in = tf.concat( [x_deep_in, tf.concat(x_action_list_embeds, 1)], 1) # Build the deep module with tf.name_scope("deep_network"): deep_layers = self.param_dict["deep_layers"] for i in range(len(deep_layers)): with tf.variable_scope("dnn_layer_{}".format(i)): weights = self.get_weight_variable( [x_deep_in.shape[1].value, deep_layers[i]], regularizer2, self.param_dict["initializer_dnn_w"]( [x_deep_in.shape[1].value, deep_layers[i]])) biases = tf.get_variable( "biases", [deep_layers[i]], initializer=tf.constant_initializer(0.0), dtype=tf.float32) layer_i = act_fn(tf.matmul(x_deep_in, weights) + biases, name="deep_mlp_{}".format(i)) x_deep_in = layer_i # Build the fully connected output module x_fc_in = x_deep_in with tf.name_scope("fc_layers"): fc_layers = self.param_dict['fc_layers'] for i in range(len(fc_layers)): with tf.variable_scope("fc_layers_{}".format(i)): weights = self.get_weight_variable( [x_fc_in.shape[1].value, fc_layers[i]], regularizer4, self.param_dict["initializer_fc_w"]( [x_fc_in.shape[1].value, fc_layers[i]])) biases = tf.get_variable( "biases", [fc_layers[i]], initializer=tf.constant_initializer(0.0), dtype=tf.float32) layer_i = tf.nn.sigmoid( tf.matmul(x_fc_in, weights) + biases) x_fc_in = layer_i logit = x_fc_in return logit
def calc_vectors(self): with tf.variable_scope("calc_vectors", values=tuple(six.itervalues(self._features))): # in this simple case, create ClkI x itemId seq model ClkI_sparse = self._features["ClkI"] ClkI_seq_len = get_sequence_length(ClkI_sparse) itemId_dense = self._features["item_id"] itemId_dense = tf.reshape(itemId_dense, shape=[-1]) print("itemId_dense: ", itemId_dense) # step-1, hash raw features itemId_hash = tf.string_to_hash_bucket_fast( itemId_dense, self._itemId_hash_bucket_size) ClkI_val_hash = tf.string_to_hash_bucket_fast( ClkI_sparse.values, self._itemId_hash_bucket_size) print("itemId_hash: ", itemId_hash) print("ClkI_val_hash: ", ClkI_val_hash) # step-1.1, sparse_to_dense ClkI_hash = tf.sparse_to_dense(sparse_indices=ClkI_sparse.indices, output_shape=ClkI_sparse.dense_shape, sparse_values=ClkI_val_hash) # step-2, embedding lookup with tf.variable_scope("embedding_tables", reuse=False): _itemId_emb_shape = [self._itemId_hash_bucket_size, self._itemId_embedding_size] self._itemId_emb_tab = tf.contrib.framework.model_variable( name="item_id_embedding/weights", shape=_itemId_emb_shape, dtype=tf.float32, initializer=tf.truncated_normal_initializer( mean=0., stddev=1./math.sqrt(self._itemId_hash_bucket_size)), trainable=True, collections=[self._root_scope], partitioner=tf.fixed_size_partitioner(10, axis=0)) with tf.name_scope("embedding_lookup", values=(self._itemId_emb_tab, itemId_hash, ClkI_hash)): itemId_emb = tf.nn.embedding_lookup(self._itemId_emb_tab, itemId_hash) ClkI_emb = tf.nn.embedding_lookup(self._itemId_emb_tab, ClkI_hash) self._itemId_emb = itemId_emb # step-3, process sequence_to_vector def _atten_fn_mlp(a, b): _hidden_units = self._hidden_units with tf.variable_scope("atten_fn_linear", reuse=tf.AUTO_REUSE, values=(a,b)) as atten_fn_scope: size_a = a.shape[1] size_b = b.shape[1] c = tf.matmul( tf.reshape(a, shape=[-1, size_a, 1]), tf.reshape(b, shape=[-1, 1, size_b]), name="similarity") c = tf.reshape(c, shape=[-1, size_a * size_b]) net = tf.concat([a,b,c], axis=1) for layer_id, num_units in enumerate(_hidden_units): with tf.variable_scope( "hidden_layer_%d" % layer_id, values=(net,)) as hidden_layer_scope: net = tf.contrib.layers.fully_connected( net, num_units, activation_fn=tf.nn.relu, variables_collections=[self._root_scope], scope=hidden_layer_scope) _output = tf.contrib.layers.fully_connected( net, 1, activation_fn=tf.exp, variables_collections=[self._root_scope], scope=atten_fn_scope) return _output atten_params = { "proc_type": "atten", "target_values": self._itemId_emb, "atten_fn": _atten_fn_mlp, "normalize_weights": False, } outputs, ClkI_vec = sequence_to_vector( Seq(emb=ClkI_emb, seq_len=ClkI_seq_len), atten_params) self._clki_vec = ClkI_vec self._vectors = { "seq2vec_ClkI_vec": self._clki_vec, "seq2vec_itemId_vec": self._itemId_emb, } return self._vectors
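contrib's model_variable used above accepts the same partitioner argument as tf.get_variable; a pared-down sketch (bucket size and embedding width are illustrative):

import math
import tensorflow as tf

item_emb = tf.contrib.framework.model_variable(
    name="item_id_embedding/weights",
    shape=[1000, 32],
    dtype=tf.float32,
    initializer=tf.truncated_normal_initializer(
        mean=0., stddev=1. / math.sqrt(1000)),
    partitioner=tf.fixed_size_partitioner(10, axis=0))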
def vgg_16(inputs,
           num_classes=1000,
           is_training=True,
           dropout_keep_prob=0.5,
           spatial_squeeze=True,
           scope='vgg_16',
           fc_conv_padding='VALID',
           global_pool=False):
  """Oxford Net VGG 16-Layers version D Example.

  Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 224x224.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes. If 0 or None, the logits layer is
      omitted and the input features to the logits layer are returned instead.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.
    fc_conv_padding: the type of padding to use for the fully connected layer
      that is implemented as a convolutional layer. Use 'SAME' padding if you
      are applying the network in a fully convolutional manner and want to
      get a prediction map downsampled by a factor of 32 as an output.
      Otherwise, the output prediction map will be (input / 32) - 6 in case of
      'VALID' padding.
    global_pool: Optional boolean flag. If True, the input to the
      classification layer is avgpooled to size 1x1, for any input size. (This
      is not part of the original VGG architecture.)

  Returns:
    net: the output of the logits layer (if num_classes is a non-zero integer),
      or the input to the logits layer (if num_classes is 0 or None).
    end_points: a dict of tensors with intermediate activations.
  """
  partitioner = tf.fixed_size_partitioner(2, axis=0)
  # with tf.variable_scope(scope, 'vgg_16', [inputs], partitioner=partitioner) as sc:
  with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    # Collect outputs for conv2d, fully_connected and max_pool2d.
    with slim.arg_scope(
        [slim.conv2d, slim.fully_connected, slim.max_pool2d],
        outputs_collections=end_points_collection):
      net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
      net = slim.max_pool2d(net, [2, 2], scope='pool1')
      net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
      net = slim.max_pool2d(net, [2, 2], scope='pool2')
      net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
      net = slim.max_pool2d(net, [2, 2], scope='pool3')
      net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
      net = slim.max_pool2d(net, [2, 2], scope='pool4')
      net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
      net = slim.max_pool2d(net, [2, 2], scope='pool5')

      # Use conv2d instead of fully_connected layers.
      net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding,
                        scope='fc6')
      net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                         scope='dropout6')
      net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
      # Convert end_points_collection into an end_points dict.
      end_points = slim.utils.convert_collection_to_dict(
          end_points_collection)
      if global_pool:
        net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='global_pool')
        end_points['global_pool'] = net
      if num_classes:
        net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                           scope='dropout7')
        net = slim.conv2d(net, num_classes, [1, 1],
                          activation_fn=None,
                          normalizer_fn=None,
                          scope='fc8')
        if spatial_squeeze:
          net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
        end_points[sc.name + '/fc8'] = net
      return net, end_points
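# The commented-out scope in vgg_16 above hints at the partitioned variant the
# author was trying. A small sketch of that idea, under the assumption that the
# body is unchanged: opening the outer scope with a partitioner makes every
# slim layer inside create 2-way sharded variables, because slim layers create
# their weights through tf.get_variable and so inherit the scope's partitioner.
import tensorflow as tf

with tf.variable_scope('vgg_16_demo',
                       partitioner=tf.fixed_size_partitioner(2, axis=0)):
    with tf.variable_scope('conv1'):
        # Split along axis 0 into shards of shape [2, 3, 3, 64] and [1, 3, 3, 64].
        w = tf.get_variable('weights', shape=[3, 3, 3, 64])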
def create_emb_for_encoder_and_decoder(share_vocab,
                                       src_vocab_size,
                                       tgt_vocab_size,
                                       src_embed_size,
                                       tgt_embed_size,
                                       dtype=tf.float32,
                                       num_partitions=0,
                                       src_vocab_file=None,
                                       tgt_vocab_file=None,
                                       src_embed_file=None,
                                       tgt_embed_file=None,
                                       scope=None):
  """Create embedding matrix for both encoder and decoder.

  Args:
    share_vocab: A boolean. Whether to share embedding matrix for encoder
      and decoder.
    src_vocab_size: An integer. The source vocab size.
    tgt_vocab_size: An integer. The target vocab size.
    src_embed_size: An integer. The embedding dimension for the encoder's
      embedding.
    tgt_embed_size: An integer. The embedding dimension for the decoder's
      embedding.
    dtype: dtype of the embedding matrix. Defaults to tf.float32.
    num_partitions: number of partitions used for the embedding vars.
    scope: VariableScope for the created subgraph. Defaults to "embedding".

  Returns:
    embedding_encoder: Encoder's embedding matrix.
    embedding_decoder: Decoder's embedding matrix.

  Raises:
    ValueError: if share_vocab is True but src_vocab_size and tgt_vocab_size
      differ.
  """
  if num_partitions <= 1:
    partitioner = None
  else:
    partitioner = tf.fixed_size_partitioner(num_partitions)

  if (src_embed_file or tgt_embed_file) and partitioner:
    raise ValueError(
        "Can't set num_partitions > 1 when using pretrained embeddings")

  with tf.variable_scope(
      scope or "embedding", dtype=dtype, partitioner=partitioner) as scope:
    if share_vocab:
      if src_vocab_size != tgt_vocab_size:
        raise ValueError("Share embedding but different src/tgt vocab sizes"
                         " %d vs. %d" % (src_vocab_size, tgt_vocab_size))
      assert src_embed_size == tgt_embed_size
      utils.print_out("Share embedding")
      vocab_file = src_vocab_file or tgt_vocab_file
      embed_file = src_embed_file or tgt_embed_file
      embedding_encoder = _create_or_load_embed(
          "embedding_share", vocab_file, embed_file,
          src_vocab_size, src_embed_size, dtype)
      embedding_decoder = embedding_encoder
    else:
      with tf.variable_scope("encoder", partitioner=partitioner):
        embedding_encoder = _create_or_load_embed(
            "embedding_encoder", src_vocab_file, src_embed_file,
            src_vocab_size, src_embed_size, dtype)
      with tf.variable_scope("decoder", partitioner=partitioner):
        embedding_decoder = _create_or_load_embed(
            "embedding_decoder", tgt_vocab_file, tgt_embed_file,
            tgt_vocab_size, tgt_embed_size, dtype)

  return embedding_encoder, embedding_decoder
def train(target, dataset, cluster_spec):
  """Train Inception on a dataset for a number of steps."""
  # Number of workers and parameter servers are inferred from the worker and
  # ps hosts strings.
  num_workers = len(cluster_spec.as_dict()['worker'])
  num_parameter_servers = len(cluster_spec.as_dict()['ps'])
  # If no value is given, num_replicas_to_aggregate defaults to the number of
  # workers.
  if FLAGS.num_replicas_to_aggregate == -1:
    num_replicas_to_aggregate = num_workers
  else:
    num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

  # Both must be greater than 0 for distributed training.
  assert num_workers > 0 and num_parameter_servers > 0, (
      'num_workers and num_parameter_servers must be > 0.')

  # Choose worker 0 as the chief. Note that any worker could be the chief
  # but there should be only one chief.
  is_chief = (FLAGS.task_id == 0)
  # batchSizeManager = BatchSizeManager(32, 4)

  # Ops are assigned to worker by default.
  tf.logging.info('num_parameter_servers: ' + str(num_parameter_servers))
  partitioner = tf.fixed_size_partitioner(num_parameter_servers, 0)

  device_setter = tf.train.replica_device_setter(
      ps_tasks=num_parameter_servers)
  slim = tf.contrib.slim
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
    with tf.variable_scope('root', partitioner=partitioner):
      # Variables and their related init/assign ops are assigned to ps.
      # with slim.arg_scope(
      #     [slim.variables.variable, slim.variables.global_step],
      #     device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
      with tf.device(device_setter):
        # Create a variable to count the number of train() calls. This equals
        # the number of updates applied to the variables.
        # global_step = slim.variables.global_step()
        global_step = tf.Variable(0, trainable=False)

        # Calculate the learning rate schedule.
        batch_size = tf.placeholder(dtype=tf.int32, shape=(),
                                    name='batch_size')
        num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                 FLAGS.batch_size)
        # Decay steps need to be divided by the number of replicas to
        # aggregate.
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                          num_replicas_to_aggregate)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(
            FLAGS.initial_learning_rate * num_workers,
            global_step,
            decay_steps,
            FLAGS.learning_rate_decay_factor,
            staircase=True)
        # Add a summary to track the learning rate.
        # tf.summary.scalar('learning_rate', lr)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.RMSPropOptimizer(lr,
                                        RMSPROP_DECAY,
                                        momentum=RMSPROP_MOMENTUM,
                                        epsilon=RMSPROP_EPSILON)

        images, labels = image_processing.distorted_inputs(
            dataset,
            batch_size,
            num_preprocess_threads=FLAGS.num_preprocess_threads)
        print(images.get_shape())
        print(labels.get_shape())

        # Number of classes in the Dataset label set plus 1.
        # Label 0 is reserved for an (unused) background class.
        # num_classes = dataset.num_classes() + 1
        num_classes = dataset.num_classes()
        print(num_classes)
        # logits = inception.inference(images, num_classes, for_training=True)
        network_fn = nets_factory.get_network_fn('inception_v3',
                                                 num_classes=num_classes)
        (logits, _) = network_fn(images)
        print(logits.get_shape())
        # Add classification loss.
        # inception.loss(logits, labels, batch_size)

        # Gather all of the losses including regularization losses.
        labels = tf.one_hot(labels, 1000, 1, 0)
        cross_entropy = tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=labels)
        # losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
        # losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
            [tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        # total_loss = tf.add_n(losses, name='total_loss')

        if is_chief:
          # Compute the moving average of all individual losses and the
          # total loss.
          loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
          loss_averages_op = loss_averages.apply(losses + [total_loss])

          # Attach a scalar summary to all individual losses and the total
          # loss; do the same for the averaged version of the losses.
          # for l in losses + [total_loss]:
          #   loss_name = l.op.name
          #   # Name each loss as '(raw)' and name the moving average version
          #   # of the loss as the original loss name.
          #   tf.summary.scalar(loss_name + ' (raw)', l)
          #   tf.summary.scalar(loss_name, loss_averages.average(l))

          # Add dependency to compute loss_averages.
          with tf.control_dependencies([loss_averages_op]):
            total_loss = tf.identity(total_loss)

        # Track the moving averages of all trainable variables.
        # Note that we maintain a 'double-average' of the BatchNormalization
        # global statistics.
        # This is not needed when the number of replicas is small, but it is
        # important for synchronous distributed training with tens of
        # workers/replicas.
        exp_moving_averager = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)

        variables_to_average = (
            tf.trainable_variables() + tf.moving_average_variables())

        # Add histograms for model variables.
        # for var in variables_to_average:
        #   tf.summary.histogram(var.op.name, var)

        # Create synchronous replica optimizer.
        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=num_replicas_to_aggregate,
            total_num_replicas=num_workers,
            variable_averages=exp_moving_averager,
            variables_to_average=variables_to_average)

        # batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
        # assert batchnorm_updates, 'Batchnorm updates are missing'
        # batchnorm_updates_op = tf.group(*batchnorm_updates)
        # # Add dependency to compute batchnorm_updates.
        # with tf.control_dependencies([batchnorm_updates_op]):
        #   total_loss = tf.identity(total_loss)

        # Compute gradients with respect to the loss.
        # grads = opt.compute_gradients(total_loss)
        grads0 = opt.compute_gradients(total_loss)
        grads = [(tf.scalar_mul(
            tf.cast(batch_size / FLAGS.batch_size, tf.float32), grad), var)
                 for grad, var in grads0]

        # Add histograms for gradients.
        # for grad, var in grads:
        #   if grad is not None:
        #     tf.summary.histogram(var.op.name + '/gradients', grad)

        apply_gradients_op = opt.apply_gradients(grads,
                                                 global_step=global_step)

        with tf.control_dependencies([apply_gradients_op]):
          train_op = tf.identity(total_loss, name='train_op')

        # Get chief queue_runners and init_tokens, which are used to
        # synchronize replicas. More details can be found in
        # SyncReplicasOptimizer.
        chief_queue_runners = [opt.get_chief_queue_runner()]
        init_tokens_op = opt.get_init_tokens_op()

        # Create a saver.
        saver = tf.train.Saver()

        # Build the summary operation based on the TF collection of
        # Summaries.
        # summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init_op = tf.global_variables_initializer()

        # We run the summaries in the same thread as the training operations
        # by passing in None for summary_op to avoid a summary_thread being
        # started.
        # Running summaries and training operations in parallel could run out
        # of GPU memory.
        sv = tf.train.Supervisor(is_chief=is_chief,
                                 logdir=FLAGS.train_dir,
                                 init_op=init_op,
                                 summary_op=None,
                                 global_step=global_step,
                                 recovery_wait_secs=1,
                                 saver=None,
                                 save_model_secs=FLAGS.save_interval_secs)
        tf.logging.info('%s Supervisor' % datetime.now())

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)

        # Get a session.
        sess = sv.prepare_or_wait_for_session(target, config=sess_config)

        # Start the queue runners.
        queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        sv.start_queue_runners(sess, queue_runners)
        tf.logging.info('Started %d queues for processing input data.',
                        len(queue_runners))

        if is_chief:
          sv.start_queue_runners(sess, chief_queue_runners)
          sess.run(init_tokens_op)

        # Train, checking for NaNs. Concurrently run the summary operation at
        # a specified interval. Note that the summary_op and train_op never
        # run simultaneously in order to prevent running out of GPU memory.
        # next_summary_time = time.time() + FLAGS.save_summaries_secs
        step = 0
        time0 = time.time()
        batch_size_num = 1
        while not sv.should_stop():
          try:
            start_time = time.time()

            batch_size_num = 32
            batch_size_num = 2 * int(step / 5) + 16
            # batch_size_num = int((int(step)/3*10)) % 100000 + 1
            # if step < 5:
            #   batch_size_num = 32
            #   batch_size_num = (batch_size_num) % 64 + 1
            # else:
            #   batch_size_num = 80

            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()

            my_images, loss_value, step = sess.run(
                [images, train_op, global_step],
                feed_dict={batch_size: batch_size_num},
                options=run_options,
                run_metadata=run_metadata)
            b = time.time()
            # assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step > FLAGS.max_steps:
              break
            duration = time.time() - start_time

            thread = threading2.Thread(
                target=get_computation_time,
                name="get_computation_time",
                args=(run_metadata.step_stats, step,))
            thread.start()

            # tl = timeline.Timeline(run_metadata.step_stats)
            # last_batch_time = tl.get_local_step_duration('sync_token_q_Dequeue')
            c0 = time.time()
            # batch_size_num = batchSizeManager.dictate_new_batch_size(FLAGS.task_id, last_batch_time)
            # batch_size_num = rpcClient.update_batch_size(FLAGS.task_id, last_batch_time, available_cpu, available_memory, step, batch_size_num)
            # ctf = tl.generate_chrome_trace_format()
            # with open("timeline.json", 'a') as f:
            #   f.write(ctf)

            if step % 1 == 0:
              examples_per_sec = FLAGS.batch_size / float(duration)
              c = time.time()
              tf.logging.info("time statistics" +
                              " - train_time: " + str(b - start_time) +
                              " - get_batch_time: " + str(c0 - b) +
                              " - get_bs_time: " + str(c - c0) +
                              " - accum_time: " + str(c - time0) +
                              " - batch_size: " + str(batch_size_num))
              format_str = ('Worker %d: %s: step %d, loss = %.2f '
                            '(%.1f examples/sec; %.3f sec/batch)')
              tf.logging.info(format_str %
                              (FLAGS.task_id, datetime.now(), step,
                               loss_value, examples_per_sec, duration))

            # Determine if the summary_op should be run on the chief worker.
            # if is_chief and next_summary_time < time.time():
            #   tf.logging.info('Running Summary operation on the chief.')
            #   summary_str = sess.run(summary_op)
            #   sv.summary_computed(sess, summary_str)
            #   tf.logging.info('Finished running Summary operation.')
            #   # Determine the next time for running the summary.
            #   next_summary_time += FLAGS.save_summaries_secs
          except Exception:
            if is_chief:
              tf.logging.info('Chief got exception while running!')
            raise

        # Stop the supervisor. This also waits for service threads to finish.
        sv.stop()
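# A hedged sketch of the ps-sharding idea the training loop above relies on.
# Under tf.train.replica_device_setter, each shard of a variable created with
# tf.fixed_size_partitioner is assigned to a parameter-server task round-robin,
# so one large variable's traffic is spread across all ps tasks instead of
# hot-spotting a single one. num_ps and the shape are illustrative.
import tensorflow as tf

num_ps = 2
setter = tf.train.replica_device_setter(ps_tasks=num_ps)
with tf.device(setter):
    with tf.variable_scope('sharded',
                           partitioner=tf.fixed_size_partitioner(num_ps)):
        big = tf.get_variable('table', shape=[1000, 64])
for v in list(big):
    # e.g. sharded/table/part_0 on /job:ps/task:0, part_1 on /job:ps/task:1
    print(v.op.name, v.device)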
def _initialize_parameters(self, hparams, ppm):
    K = np.float32(self.K)
    su, tu, a, b, self.size_u = (hparams['su'], hparams['tu'], hparams['a'],
                                 hparams['b'], hparams['size_u'])
    si, ti, c, d, self.size_i = (hparams['si'], hparams['ti'], hparams['c'],
                                 hparams['d'], hparams['size_i'])

    with tf.name_scope("hparams"), tf.device(self.device):
        ## Hyperparameters
        self.lsu = tf.Variable(softplus_inverse(-hparams['su'] + 1.),
                               dtype=tf.float32, name="lsu")
        self.su = -tf.nn.softplus(self.lsu) + 1.
        self.tu = tf.Variable(hparams['tu'], dtype=tf.float32, name="tu")
        self.a = tf.Variable(hparams['a'], dtype=tf.float32, name="a")
        self.b = tf.Variable(hparams['b'], dtype=tf.float32, name="b")

        self.lsi = tf.Variable(softplus_inverse(-hparams['si'] + 1.),
                               dtype=tf.float32, name="lsi")
        self.si = -tf.nn.softplus(self.lsi) + 1.
        self.ti = tf.Variable(hparams['ti'], dtype=tf.float32, name="ti")
        self.c = tf.Variable(hparams['c'], dtype=tf.float32, name="c")
        self.d = tf.Variable(hparams['d'], dtype=tf.float32, name="d")

    e = np.sum(self.edge_vals_d, dtype=np.float32)
    # Initial values for total user and total item masses of type K, set such
    # that \sum_k tim_k * tum_k = e (which is in fact a bit higher than it
    # ought to be), and using item_mass / user_mass ~ item_size / user_size
    # (which is only kind of true).
    tum_init = np.sqrt(self.size_u / self.size_i * e / K)
    tim_init = np.sqrt(self.size_i / self.size_u * e / K)

    with tf.name_scope("user_params"), tf.device(self.device):
        # Shape params are read off immediately from the update equations;
        # rate params are set to be consistent with \gam_i ~ 1 and
        # \sum_j beta_jk beta_k ~ \sqrt(e/k) (which is self-consistent).
        if ppm:
            # If creating the principled predictive model (ppm), we don't
            # have the user_degree. Just create some random initialization
            # for now; we'll update it with a default value later.
            self.gam_shp = tf.Variable(
                tf.random_gamma([self.U, 1], 5., 5., seed=self.seed),
                dtype=tf.float32, name="gam_shp")
            self.gam_rte = tf.Variable(
                tf.random_gamma([self.U, 1], 5., 5., seed=self.seed),
                dtype=tf.float32, name="gam_rte")
            self.theta_shp = tf.Variable(
                tf.random_gamma([self.U, self.K], 10., 10., seed=self.seed),
                name="theta_shp")
            self.theta_rte = tf.Variable(
                tf.random_gamma([self.U, self.K], 5., 5., seed=self.seed),
                name="theta_rte")
            self.g = tf.Variable(
                tf.random_gamma([self.K, 1], 0.001, 1, seed=self.seed) + TINY,
                name="g")
        else:
            user_degs = np.expand_dims(self.user_degree, axis=1)
            self.gam_shp = tf.Variable((user_degs - su), name="gam_shp")  # s^U
            self.gam_rte = tf.Variable(
                np.sqrt(e) * (0.9 + 0.1 * tf.random_gamma(
                    [self.U, 1], 5., 5., seed=self.seed)),
                dtype=tf.float32, name="gam_rte")  # r^U
            init_gam_mean = (self.gam_shp.initial_value /
                             self.gam_rte.initial_value)
            self.theta_shp = tf.Variable(
                (a + user_degs / K) * tf.random_gamma(
                    [self.U, self.K], 10., 10., seed=self.seed),
                name="theta_shp")  # kap^U
            self.theta_rte = tf.Variable(
                (b + init_gam_mean * tim_init) * (0.9 + 0.1 * tf.random_gamma(
                    [self.U, self.K], 5., 5., seed=self.seed)),
                name="theta_rte")  # lam^U
            self.g = tf.Variable(
                tf.random_gamma([self.K, 1], 0.001, 1, seed=self.seed) + TINY,
                name="g")  # g

    with tf.name_scope("item_params"), tf.device(self.device):
        ## Items
        if ppm:
            self.omega_shp = tf.Variable(
                tf.random_gamma([self.I, 1], 5., 5., seed=self.seed),
                name="omega_shp")  # s^I
            self.omega_rte = tf.Variable(
                tf.random_gamma([self.I, 1], 5., 5., seed=self.seed),
                dtype=tf.float32, name="omega_rte")  # r^I
            self.beta_shp = tf.Variable(
                tf.random_gamma([self.I, self.K], 10., 10., seed=self.seed),
                name="beta_shp")  # kap^I
            self.beta_rte = tf.Variable(
                tf.random_gamma([self.I, self.K], 5., 5., seed=self.seed),
                name="beta_rte")  # lam^I
5., 5., seed=self.seed), name="beta_rte") # lam^I self.w = tf.Variable(tf.random_gamma([self.K, 1], 0.001, 1, seed=self.seed) + TINY, name="w") # w else: item_degs = np.expand_dims(self.item_degree, axis=1) self.omega_shp = tf.Variable((item_degs - si), name="omega_shp") # s^I self.omega_rte = tf.Variable(np.sqrt(e) * (0.9 + 0.1*tf.random_gamma([self.I, 1], 5., 5., seed=self.seed)), dtype=tf.float32, name="omega_rte") # r^I init_omega_mean = self.omega_shp.initial_value / self.omega_rte.initial_value self.beta_shp = tf.Variable((c + item_degs/K) * tf.random_gamma([self.I, self.K], 10., 10., seed=self.seed), name="beta_shp") # kap^I self.beta_rte = tf.Variable((d + init_omega_mean*tum_init) * (0.9 + 0.1*tf.random_gamma([self.I, self.K], 5., 5., seed=self.seed)), name="beta_rte") # lam^I self.w = tf.Variable(tf.random_gamma([self.K, 1], 0.001, 1, seed=self.seed) + TINY, name="w") # w with tf.device('/cpu:0'): with tf.variable_scope("edge_params", reuse=None): ## Edges if self.simple_graph: # set init value so there's approximately 1 expected edge between each pair... WARNING: this may be profoundly stupid self.sg_edge_param = tf.get_variable(name="sg_edge_param", shape=[self.occupied_pairs, self.K], dtype=tf.float32, initializer=tf.random_normal_initializer(mean=-np.log(K), stddev=1. / K, seed=self.seed), partitioner=tf.fixed_size_partitioner(self.edge_param_splits, 0)) else: self.lphi = tf.get_variable(name="lphi", shape=[self.occupied_pairs, self.K], dtype=tf.float32, initializer=tf.random_normal_initializer(mean=0, stddev=1. / K, seed=self.seed), partitioner=tf.fixed_size_partitioner(self.edge_param_splits, 0)) with tf.name_scope("variational_post"), tf.device(self.device): # Variational posterior distributions self.q_gam = Gamma(concentration=self.gam_shp, rate=self.gam_rte, name="q_gam") self.q_theta = Gamma(concentration=self.theta_shp, rate=self.theta_rte, name="q_theta") self.q_g = PointMass(self.g, name="q_g") self.q_omega = Gamma(concentration=self.omega_shp, rate=self.omega_rte, name="q_omega") self.q_beta = Gamma(concentration=self.beta_shp, rate=self.beta_rte, name="q_beta") self.q_w = PointMass(self.w, name="q_w") if self.simple_graph: self.q_e_aux_vals = tPoissonMulti(log_lams=self.sg_edge_param, name="q_e_aux_vals") # q_edges_aux_flat else: self.q_e_aux_vals = Multinomial(total_count=self.edge_vals, logits=self.lphi, name="q_e_aux_vals") # q_edges_aux_flat self.q_e_aux_vals_mean = self.q_e_aux_vals.mean() with tf.name_scope("degree_vars"): # create some structures to make it easy to work with the expected value (wrt q) of the edges # qm_du[u,k] is the expected weighted degree of user u counting only edges of type k # qm_du[u,k] = E_q[e^k_i.] in the language of the paper # initialized arbitrarily, will override at end of init to set to # we use a tf.Variable here to cache the q_e_aux_vals.mean() value self.qm_du = tf.Variable(tf.ones([self.U, self.K], dtype=tf.float32), name="qm_du") self.qm_di = tf.Variable(tf.ones([self.I, self.K], dtype=tf.float32), name="qm_di") # Total Item Mass: self.i_tot_mass_m = self.q_w.mean() + tf.matmul(self.q_beta.mean(), self.q_omega.mean(), transpose_a=True) # Total User Mass: self.u_tot_mass_m = self.q_g.mean() + tf.matmul(self.q_theta.mean(), self.q_gam.mean(), transpose_a=True)
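# A minimal sketch of the "edge_params" pattern above, isolated: one very
# large parameter matrix is split into several shards via
# fixed_size_partitioner so that no single shard becomes unwieldy to store or
# to place. occupied_pairs, K, and splits are placeholders for the model's
# real sizes, not values from the original.
import tensorflow as tf

occupied_pairs, K, splits = 10**6, 32, 8
lphi = tf.get_variable(
    "lphi_demo", shape=[occupied_pairs, K], dtype=tf.float32,
    initializer=tf.random_normal_initializer(mean=0., stddev=1. / K),
    partitioner=tf.fixed_size_partitioner(splits, axis=0))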
def create_emb_for_encoder_and_decoder(share_vocab, src_vocab_size, tgt_vocab_size, src_embed_size, tgt_embed_size, dtype=tf.float32, num_partitions=0, src_vocab_file=None, tgt_vocab_file=None, src_embed_file=None, tgt_embed_file=None, scope=None): """Create embedding matrix for both encoder and decoder. Args: share_vocab: A boolean. Whether to share embedding matrix for both encoder and decoder. src_vocab_size: An integer. The source vocab size. tgt_vocab_size: An integer. The target vocab size. src_embed_size: An integer. The embedding dimension for the encoder's embedding. tgt_embed_size: An integer. The embedding dimension for the decoder's embedding. dtype: dtype of the embedding matrix. Default to float32. num_partitions: number of partitions used for the embedding vars. scope: VariableScope for the created subgraph. Default to "embedding". Returns: embedding_encoder: Encoder's embedding matrix. embedding_decoder: Decoder's embedding matrix. Raises: ValueError: if use share_vocab but source and target have different vocab size. """ if num_partitions <= 1: partitioner = None else: # Note: num_partitions > 1 is required for distributed training due to # embedding_lookup tries to colocate single partition-ed embedding variable # with lookup ops. This may cause embedding variables being placed on worker # jobs. partitioner = tf.fixed_size_partitioner(num_partitions) if (src_embed_file or tgt_embed_file) and partitioner: raise ValueError( "Can't set num_partitions > 1 when using pretrained embedding") with tf.variable_scope( scope or "embeddings", dtype=dtype, partitioner=partitioner) as scope: # Share embedding if share_vocab: if src_vocab_size != tgt_vocab_size: raise ValueError("Share embedding but different src/tgt vocab sizes" " %d vs. %d" % (src_vocab_size, tgt_vocab_size)) assert src_embed_size == tgt_embed_size utils.print_out("# Use the same embedding for source and target") vocab_file = src_vocab_file or tgt_vocab_file embed_file = src_embed_file or tgt_embed_file embedding_encoder = _create_or_load_embed( "embedding_share", vocab_file, embed_file, src_vocab_size, src_embed_size, dtype) embedding_decoder = embedding_encoder else: with tf.variable_scope("encoder", partitioner=partitioner): embedding_encoder = _create_or_load_embed( "embedding_encoder", src_vocab_file, src_embed_file, src_vocab_size, src_embed_size, dtype) with tf.variable_scope("decoder", partitioner=partitioner): embedding_decoder = _create_or_load_embed( "embedding_decoder", tgt_vocab_file, tgt_embed_file, tgt_vocab_size, tgt_embed_size, dtype) return embedding_encoder, embedding_decoder
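# A small usage sketch (not from the original file) showing the effect of
# num_partitions on the variables the function above creates. It assumes
# _create_or_load_embed falls back to creating a fresh variable when no
# embed_file is given; vocab and embedding sizes are illustrative.
with tf.Graph().as_default():
    enc, dec = create_emb_for_encoder_and_decoder(
        share_vocab=True, src_vocab_size=100, tgt_vocab_size=100,
        src_embed_size=32, tgt_embed_size=32, num_partitions=4)
    # With num_partitions=4 the shared table is a PartitionedVariable whose
    # four shards can be placed on different parameter servers.
    print([v.shape for v in list(enc)])   # 4 shards of [25, 32]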
def __init__(self, num_units, mem_input, use_peepholes=False, cell_clip=None, initializer=None, num_proj=None, proj_clip=None, num_unit_shards=None, num_proj_shards=None, forget_bias=1.0, state_is_tuple=True, activation=None, reuse=None, name=None, dtype=None, use_beam=False, hps=None): """Initialize the HyperLSTM cell. Args: num_units: int, The number of units in the LSTM cell. mem_input: mem_input. use_peepholes: bool, use peephole connections or not. cell_clip: (optional) A float value, if provided the cell state is clipped by this value prior to the cell output activation. initializer: (optional) The initializer to use for the weight and projection matrices. num_proj: (optional) int, The output dimensionality for the projection matrices. If None, no projection is performed. proj_clip: (optional) A float value. If `num_proj > 0` and `proj_clip` is provided, then the projected values are clipped elementwise to within `[-proj_clip, proj_clip]`. num_unit_shards: Deprecated, will be removed by Jan. 2017. Use a variable_scope partitioner instead. num_proj_shards: Deprecated, will be removed by Jan. 2017. Use a variable_scope partitioner instead. forget_bias: float, The bias added to forget gates (see above). Must set to `0.0` manually when restoring from CudnnLSTM-trained checkpoints. state_is_tuple: If True, accepted and returned states are 2-tuples of the `c_state` and `m_state`. If False, they are concatenated along the column axis. The latter behavior will soon be deprecated. activation: Activation function of the inner states. Default: `tanh`. reuse: (optional) Python boolean describing whether to reuse variables in an existing scope. If not `True`, and the existing scope already has the given variables, an error is raised. name: String, the name of the layer. Layers with the same name will share weights, but to avoid mistakes we require reuse=True in such cases. dtype: Default dtype of the layer (default of `None` means use the type of the first input). Required when `build` is called before `call`. use_beam: Use beam search or not. hps: hyperparameters. """ super(HyperLSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype) if not state_is_tuple: tf.logging.warn("%s: Using a concatenated state is slower and will soon " "be deprecated. Use state_is_tuple=True.", self) if num_unit_shards is not None or num_proj_shards is not None: tf.logging.warn( "%s: The num_unit_shards and proj_unit_shards parameters are " "deprecated and will be removed in Jan 2017. " "Use a variable scope with a partitioner instead.", self) assert not use_peepholes, "currently not supporting peephole connections" assert hps is not None # Inputs must be 2-dimensional. 
self.input_spec = tf.layers.InputSpec(ndim=2) self._num_units = num_units self._rank = hps.rank assert self._rank == self._num_units or self._rank == 2 * self._num_units self._use_peepholes = use_peepholes self._cell_clip = cell_clip self._initializer = initializer self._num_proj = num_proj self._proj_clip = proj_clip self._num_unit_shards = num_unit_shards self._num_proj_shards = num_proj_shards self._forget_bias = forget_bias self._state_is_tuple = state_is_tuple self._activation = activation or tf.tanh self._sigma_norm = hps.sigma_norm self._beam_width = hps.beam_width self._mem_input = mem_input self._use_beam = use_beam if num_proj: self._state_size = ( tf.nn.rnn_cell.LSTMStateTuple(num_units, num_proj) if state_is_tuple else num_units + num_proj) self._output_size = num_proj else: self._state_size = ( tf.nn.rnn_cell.LSTMStateTuple(num_units, num_units) if state_is_tuple else 2 * num_units) self._output_size = num_units input_depth = hps.emb_dim + hps.decoder_dim # if hps.encode_neighbor: # input_depth += hps.decoder_dim h_depth = self._num_units if self._num_proj is None else self._num_proj maybe_partitioner = ( tf.fixed_size_partitioner(self._num_unit_shards) if self._num_unit_shards is not None else None) # `u`s are matrices of [input_shape, rank], `v`s being [rank, hidden_size] # they are the collection of rank-1 parameter matrices. # The full parameter matrix is constructed by taking `U\sigma V`, # with diagonal matrix `\sigma` computed in the `self.initialize` function. redundant_rank = (self._rank > self._num_units) # `u`, `v` used to construct matrix from input `x` to input_gate `i`. u_xi, v_xi = self._orthogonal_init( shape=[input_depth, self._num_units], initializer=initializer, redundant_rank=redundant_rank) self._u_xi = tf.get_variable( "u_xi/%s" % _WEIGHTS_VARIABLE_NAME, initializer=u_xi, partitioner=maybe_partitioner) self._v_xi = tf.get_variable( "v_xi/%s" % _WEIGHTS_VARIABLE_NAME, initializer=v_xi, partitioner=maybe_partitioner) # `u`, `v` used to construct matrix that maps input `x` to cell_state `j`. u_xj, v_xj = self._orthogonal_init( shape=[input_depth, self._num_units], initializer=initializer, redundant_rank=redundant_rank) self._u_xj = tf.get_variable( "u_xj/%s" % _WEIGHTS_VARIABLE_NAME, initializer=u_xj, partitioner=maybe_partitioner) self._v_xj = tf.get_variable( "v_xj/%s" % _WEIGHTS_VARIABLE_NAME, initializer=v_xj, partitioner=maybe_partitioner) # `u`, `v` used to construct matrix # that maps input `x` to forget_gate `f`. u_xf, v_xf = self._orthogonal_init( shape=[input_depth, self._num_units], initializer=initializer, redundant_rank=redundant_rank) self._u_xf = tf.get_variable( "u_xf/%s" % _WEIGHTS_VARIABLE_NAME, initializer=u_xf, partitioner=maybe_partitioner) self._v_xf = tf.get_variable( "v_xf/%s" % _WEIGHTS_VARIABLE_NAME, initializer=v_xf, partitioner=maybe_partitioner) # `u`, `v` used to construct matrix # that maps input `x` to output_gate `o`. u_xo, v_xo = self._orthogonal_init( shape=[input_depth, self._num_units], initializer=initializer, redundant_rank=redundant_rank) self._u_xo = tf.get_variable( "u_xo/%s" % _WEIGHTS_VARIABLE_NAME, initializer=u_xo, partitioner=maybe_partitioner) self._v_xo = tf.get_variable( "v_xo/%s" % _WEIGHTS_VARIABLE_NAME, initializer=v_xo, partitioner=maybe_partitioner) # `u`, `v` used to construct matrix # that maps hid_state `h` to input_gate `i`. 
u_hi, v_hi = self._orthogonal_init( shape=[h_depth, self._num_units], initializer=initializer, redundant_rank=redundant_rank) self._u_hi = tf.get_variable( "u_hi/%s" % _WEIGHTS_VARIABLE_NAME, initializer=u_hi, partitioner=maybe_partitioner) self._v_hi = tf.get_variable( "v_hi/%s" % _WEIGHTS_VARIABLE_NAME, initializer=v_hi, partitioner=maybe_partitioner) # `u`, `v` used to construct matrix # that maps hid_state `h` to cell_state `j`. u_hj, v_hj = self._orthogonal_init( shape=[h_depth, self._num_units], initializer=initializer, redundant_rank=redundant_rank) self._u_hj = tf.get_variable( "u_hj/%s" % _WEIGHTS_VARIABLE_NAME, initializer=u_hj, partitioner=maybe_partitioner) self._v_hj = tf.get_variable( "v_hj/%s" % _WEIGHTS_VARIABLE_NAME, initializer=v_hj, partitioner=maybe_partitioner) # `u`, `v` used to construct matrix # that maps hid_state `h` to forget_gate `f`. u_hf, v_hf = self._orthogonal_init( shape=[h_depth, self._num_units], initializer=initializer, redundant_rank=redundant_rank) self._u_hf = tf.get_variable( "u_hf/%s" % _WEIGHTS_VARIABLE_NAME, initializer=u_hf, partitioner=maybe_partitioner) self._v_hf = tf.get_variable( "v_hf/%s" % _WEIGHTS_VARIABLE_NAME, initializer=v_hf, partitioner=maybe_partitioner) # `u`, `v` used to construct matrix # that maps hid_state `h` to output_gate `o`. u_ho, v_ho = self._orthogonal_init( shape=[h_depth, self._num_units], initializer=initializer, redundant_rank=redundant_rank) self._u_ho = tf.get_variable( "u_ho/%s" % _WEIGHTS_VARIABLE_NAME, initializer=u_ho, partitioner=maybe_partitioner) self._v_ho = tf.get_variable( "v_ho/%s" % _WEIGHTS_VARIABLE_NAME, initializer=v_ho, partitioner=maybe_partitioner) self._c = tf.get_variable( "c/%s" % _WEIGHTS_VARIABLE_NAME, shape=[self._num_units, self._rank], initializer=tf.contrib.layers.xavier_initializer(), partitioner=maybe_partitioner) initializer = tf.zeros_initializer(dtype=tf.float32) self._b = tf.get_variable( "b/%s" % _BIAS_VARIABLE_NAME, shape=[4 * h_depth, self._rank], initializer=initializer) if self._num_proj is not None: if self._num_proj_shards is not None: maybe_proj_partitioner = ( tf.fixed_size_partitioner(self._num_proj_shards)) else: maybe_proj_partitioner = (None) self._proj_kernel = self.add_variable( "projection/%s" % _WEIGHTS_VARIABLE_NAME, shape=[self._num_units, self._num_proj], initializer=tf.uniform_unit_scaling_initializer(), partitioner=maybe_proj_partitioner) self.initialize() self.built = True
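# The conditional-partitioner idiom used throughout the cell above, isolated
# as a sketch. When the shard count is None, tf.get_variable behaves exactly
# as if unpartitioned; otherwise the same call yields a sharded variable.
# Shapes and the shard count are illustrative.
import tensorflow as tf

num_unit_shards = 4  # or None to disable sharding
maybe_partitioner = (tf.fixed_size_partitioner(num_unit_shards)
                     if num_unit_shards is not None else None)
kernel = tf.get_variable(
    "kernel_demo", shape=[512, 2048],
    initializer=tf.glorot_uniform_initializer(),
    partitioner=maybe_partitioner)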
def train(): global updated_batch_size_num global passed_info global shall_update ps_hosts = FLAGS.ps_hosts.split(',') worker_hosts = FLAGS.worker_hosts.split(',') print('PS hosts are: %s' % ps_hosts) print('Worker hosts are: %s' % worker_hosts) server = tf.train.Server({ 'ps': ps_hosts, 'worker': worker_hosts }, job_name=FLAGS.job_name, task_index=FLAGS.task_id) sspManager = SspManager(len(worker_hosts), 5) if FLAGS.job_name == 'ps': if FLAGS.task_id == 0: rpcServer = sspManager.create_rpc_server(ps_hosts[0].split(':')[0]) rpcServer.serve() server.join() time.sleep(5) is_chief = (FLAGS.task_id == 0) rpcClient = sspManager.create_rpc_client(ps_hosts[0].split(':')[0]) if is_chief: if tf.gfile.Exists(FLAGS.train_dir): tf.gfile.DeleteRecursively(FLAGS.train_dir) tf.gfile.MakeDirs(FLAGS.train_dir) device_setter = tf.train.replica_device_setter(ps_tasks=len(ps_hosts)) with tf.device('/job:worker/task:%d' % FLAGS.task_id): partitioner = tf.fixed_size_partitioner(len(ps_hosts), axis=0) with tf.variable_scope('root', partitioner=partitioner): with tf.device(device_setter): global_step = tf.Variable(0, trainable=False) decay_steps = 50000 * 350.0 / FLAGS.batch_size batch_size = tf.placeholder(dtype=tf.int32, shape=(), name='batch_size') images, labels = cifar10.distorted_inputs(batch_size) # print (str(tf.shape(images))+ str(tf.shape(labels))) re = tf.shape(images)[0] inputs = tf.reshape(images, [-1, _HEIGHT, _WIDTH, _DEPTH]) # labels = tf.reshape(labels, [-1, _NUM_CLASSES]) print(labels.get_shape()) labels = tf.one_hot(labels, 10, 1, 0) print(labels.get_shape()) network_fn = nets_factory.get_network_fn('alexnet_v2', num_classes=10) (logits, _) = network_fn(inputs) print(logits.get_shape()) cross_entropy = tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=labels) # logits = cifar10.inference(images, batch_size) # loss = cifar10.loss(logits, labels, batch_size) loss = cross_entropy + _WEIGHT_DECAY * tf.add_n( [tf.nn.l2_loss(v) for v in tf.trainable_variables()]) train_op = cifar10.train(loss, global_step) # Decay the learning rate exponentially based on the number of steps. sv = tf.train.Supervisor( is_chief=is_chief, logdir=FLAGS.train_dir, init_op=tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()), summary_op=None, global_step=global_step, # saver=saver, saver=None, recovery_wait_secs=1, save_model_secs=60) tf.logging.info('%s Supervisor' % datetime.now()) sess_config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement) sess_config.gpu_options.allow_growth = True # Get a session. sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) # sess.run(tf.global_variables_initializer()) # Start the queue runners. 
queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS) sv.start_queue_runners(sess, queue_runners) # sv.start_queue_runners(sess, chief_queue_runners) # sess.run(init_tokens_op) """Train CIFAR-10 for a number of steps.""" # available_cpu = psutil.cpu_percent(interval=None) # thread = threading2.Thread(target = local_update_batch_size, name = "update_batch_size_thread", args = (rpcClient, FLAGS.task_id,)) # thread.start() time0 = time.time() batch_size_num = FLAGS.batch_size for step in range(FLAGS.max_steps): start_time = time.time() run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() # batch_size_num = updated_batch_size_num if step <= 5: batch_size_num = FLAGS.batch_size if step >= 0: batch_size_num = int(step / 5) % 512 + 1 batch_size_num = 128 num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size_num decay_steps_num = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) # mgrads, images_, train_val, real, loss_value, gs = sess.run([grads, images, train_op, re, loss, global_step], feed_dict={batch_size: batch_size_num}, options=run_options, run_metadata=run_metadata) _, loss_value, gs = sess.run( [train_op, loss, global_step], feed_dict={batch_size: batch_size_num}, options=run_options, run_metadata=run_metadata) # _, loss_value, gs = sess.run([train_op, loss, global_step], feed_dict={batch_size: batch_size_num}) b = time.time() # tl = timeline.Timeline(run_metadata.step_stats) ## ctf = tl.generate_chrome_trace_format() # last_batch_time = tl.get_local_step_duration('sync_token_q_Dequeue') # thread = threading2.Thread(target=get_computation_time, name="get_computation_time",args=(run_metadata.step_stats,step,)) # thread.start() # available_cpu = 100-psutil.cpu_percent(interval=None) # available_memory = psutil.virtual_memory()[1]/1000000 c0 = time.time() # batch_size_num = rpcClient.update_batch_size(FLAGS.task_id, last_batch_time, available_cpu, available_memory, step, batch_size_num) # if gs < 10: # with open('timeline.json', 'w') as f: # f.write(ctf) # tf.logging.info('write json') # batch_size_num = rpcClient.update_batch_size(FLAGS.task_id, 0,0,0, step, batch_size_num) if step % 1 == 0: duration = time.time() - start_time num_examples_per_step = batch_size_num examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) c = time.time() ## tf.logging.info("time statistics - batch_process_time: " + str( last_batch_time) + " - train_time: " + str(b-start_time) + " - get_batch_time: " + str(c0-b) + " - get_bs_time: " + str(c-c0) + " - accum_time: " + str(c-time0)) format_str = ( "time: " + str(time.time()) + '; %s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f sec/batch)' ) tf.logging.info(format_str % (datetime.now(), step, gs, loss_value, examples_per_sec, sec_per_batch)) rpcClient.check_staleness(FLAGS.task_id, step)
def train_ncf(cluster, rank, nrank, args): def validate(): # validate phase hits, ndcgs = [], [] for idx in range(testData.shape[0]): start_index = idx * 100 my_feed_dict = { user_input: testUserInput[start_index:start_index+100], item_input: testItemInput[start_index:start_index+100], } predictions = sess.run([y], feed_dict=my_feed_dict) map_item_score = {testItemInput[start_index+i]: predictions[0][i] for i in range(100)} # Evaluate top rank list ranklist = heapq.nlargest(topK, map_item_score, key=map_item_score.get) hr = getHitRatio(ranklist, testItemInput[start_index]) ndcg = getNDCG(ranklist, testItemInput[start_index]) hits.append(hr) ndcgs.append(ndcg) hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() return hr, ndcg def get_current_shard(data): part_size = data.shape[0] // nrank start = part_size * rank end = start + part_size if rank != nrank - 1 else data.shape[0] return data[start:end] from movielens import getdata if args.all: trainData, testData = getdata('ml-25m', 'datasets') trainUsers = get_current_shard(trainData['user_input']) trainItems = get_current_shard(trainData['item_input']) trainLabels = get_current_shard(trainData['labels']) testData = get_current_shard(testData) testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32), 100) testItemInput = testData.reshape((-1,)) else: trainData, testData = getdata('ml-25m', 'datasets') trainUsers = get_current_shard(trainData['user_input'][:1024000]) trainItems = get_current_shard(trainData['item_input'][:1024000]) trainLabels = get_current_shard(trainData['labels'][:1024000]) testData = get_current_shard(testData[:1470]) testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32), 100) testItemInput = testData.reshape((-1,)) num_users, num_items = { 'ml-1m': (6040, 3706), 'ml-20m': (138493, 26744), 'ml-25m': (162541, 59047), }['ml-25m'] batch_size = 1024 num_negatives = 4 topK = 10 worker_device = "/job:worker/task:%d/gpu:0" % (rank) with tf.device(worker_device): user_input = tf.compat.v1.placeholder(tf.int32, [None, ]) item_input = tf.compat.v1.placeholder(tf.int32, [None, ]) y_ = tf.compat.v1.placeholder(tf.float32, [None, ]) with tf.device(tf.compat.v1.train.replica_device_setter(cluster=cluster)): server_num = len(cluster.as_dict()['ps']) embed_partitioner = tf.fixed_size_partitioner(server_num, 0) if server_num > 1 else None loss, y, opt = neural_mf(user_input, item_input, y_, num_users, num_items, embed_partitioner) train_op = opt.minimize(loss) server = tf.train.Server( cluster, job_name="worker", task_index=rank) init = tf.compat.v1.global_variables_initializer() sv = tf.train.Supervisor( is_chief=(rank == 0), init_op=init, recovery_wait_secs=1) sess_config = tf.compat.v1.ConfigProto( allow_soft_placement=True, log_device_placement=False, device_filters=["/job:ps", "/job:worker/task:%d" % rank]) sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) # sess.run(init) log = Logging(path='logs/tflog%d.txt' % rank) epoch = 7 iterations = trainUsers.shape[0] // batch_size start = time.time() for ep in range(epoch): ep_st = time.time() log.write('epoch %d' % ep) train_loss = [] for idx in tqdm(range(iterations)): start_index = idx * batch_size my_feed_dict = { user_input: trainUsers[start_index:start_index+batch_size], item_input: trainItems[start_index:start_index+batch_size], y_: trainLabels[start_index:start_index+batch_size], } loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict) train_loss.append(loss_val[0]) # if idx % 10000 == 0: # hr, ndcg = validate() # 
printstr = "HR: %.4f, NDCF: %.4f" % (hr, ndcg) # log.write(printstr) tra_loss = np.mean(train_loss) ep_en = time.time() # validate phase if args.val: hr, ndcg = validate() printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % (tra_loss, hr, ndcg, ep_en - ep_st) else: printstr = "train_loss: %.4f, train_time: %.4f" % (tra_loss, ep_en - ep_st) log.write(printstr) log.write('all time: %f' % (time.time() - start))
def train():
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print('PS hosts are: %s' % ps_hosts)
    print('Worker hosts are: %s' % worker_hosts)
    configP = tf.ConfigProto()
    server = tf.train.Server({'ps': ps_hosts, 'worker': worker_hosts},
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_id,
                             config=configP)
    if FLAGS.job_name == 'ps':
        server.join()

    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)

    device_setter = tf.train.replica_device_setter(ps_tasks=len(ps_hosts))
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with tf.device(device_setter):
            """Prepare Input"""
            global_step = tf.Variable(0, trainable=False)
            decay_steps = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN *
                           NUM_EPOCHS_PER_DECAY / FLAGS.batch_size)
            batch_size = tf.placeholder(dtype=tf.int32, shape=(),
                                        name='batch_size')
            with tf.device('/cpu:0'):
                images, labels = cifar10.distorted_inputs(batch_size)
            inputs = tf.reshape(images, [-1, _HEIGHT, _WIDTH, _DEPTH])

            """Inference"""
            with tf.variable_scope('root',
                                   partitioner=tf.fixed_size_partitioner(
                                       len(ps_hosts), axis=0)):
                network = resnet_model.cifar10_resnet_v2_generator(
                    FLAGS.resnet_size, _NUM_CLASSES)
                logits = network(inputs, True)

            labels = tf.cast(labels, tf.int64)
            correct_prediction = tf.equal(tf.argmax(logits, 1), labels)
            correct_prediction = tf.cast(correct_prediction, tf.float32)
            accuracy_op = tf.reduce_mean(correct_prediction)

            """Loss"""
            labels = tf.one_hot(labels, 10, 1, 0)
            cross_entropy = tf.losses.softmax_cross_entropy(
                logits=logits, onehot_labels=labels)
            loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
                [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

            """Define Optimization"""
            # Decay the learning rate exponentially based on the number of
            # steps.
            lr = tf.train.exponential_decay(
                INITIAL_LEARNING_RATE * len(worker_hosts),
                global_step,
                decay_steps,
                LEARNING_RATE_DECAY_FACTOR,
                staircase=True)
            opt = tf.train.GradientDescentOptimizer(lr)

            # Track the moving averages of all trainable variables.
            exp_moving_averager = tf.train.ExponentialMovingAverage(
                MOVING_AVERAGE_DECAY, global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=len(worker_hosts),
                total_num_replicas=len(worker_hosts),
                variable_averages=exp_moving_averager,
                variables_to_average=variables_to_average)

            # Compute gradients with respect to the loss.
            grads = opt.compute_gradients(loss)
            apply_gradients_op = opt.apply_gradients(
                grads, global_step=global_step)
            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(loss, name='train_op')

            """Synchronization Management"""
            if is_chief:
                chief_queue_runners = [opt.get_chief_queue_runner()]
                init_tokens_op = opt.get_init_tokens_op()

            saver = tf.train.Saver(max_to_keep=1)
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=FLAGS.train_dir,
                init_op=tf.group(tf.global_variables_initializer(),
                                 tf.local_variables_initializer()),
                summary_op=None,
                global_step=global_step,
                saver=saver,
                recovery_wait_secs=1,
                save_model_secs=60)
            tf.logging.info('%s Supervisor' % datetime.now())

            """Train CIFAR-10 for a number of steps."""
            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            if is_chief:
                sv.start_queue_runners(sess, chief_queue_runners)
                sess.run(init_tokens_op)

            batch_size_num = FLAGS.batch_size
            for step in range(init_global_step, FLAGS.max_steps):
                step_start_time = time.time()
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()

                num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                         batch_size_num)
                decay_steps_num = int(num_batches_per_epoch *
                                      NUM_EPOCHS_PER_DECAY)

                _, loss_value, gs = sess.run(
                    [train_op, loss, global_step],
                    feed_dict={batch_size: batch_size_num},
                    options=run_options,
                    run_metadata=run_metadata)

                duration = time.time() - step_start_time
                num_examples_per_step = batch_size_num
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = ("time: " + str(time.time()) +
                              '; %s: step %d (gs %d), loss = %.2f '
                              '(%.1f samples/s; %.3f s/batch)')
                tf.logging.info(format_str % (datetime.now(), step, gs,
                                              loss_value, examples_per_sec,
                                              sec_per_batch))

                # Evaluate accuracy periodically (this is not a test-set
                # evaluation).
                if step % 200 == 0:
                    accuracy = sess.run(accuracy_op,
                                        feed_dict={batch_size: 10000})
                    tf.logging.info('evaluation: step - ' + str(step) +
                                    '; accuracy: ' + str(accuracy))
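# One practical note for the checkpointing above, sketched under the
# assumption of stock TF 1.x behavior: tf.train.Saver understands the slice
# info attached to PartitionedVariable shards, so a checkpoint written from a
# partitioned model stores each shard against the full logical tensor and can
# generally be restored even if the shard count (e.g. the number of ps tasks)
# changes between runs.
import tensorflow as tf

with tf.variable_scope('root', partitioner=tf.fixed_size_partitioner(2)):
    v = tf.get_variable('w', shape=[10, 4])
saver = tf.train.Saver()  # no special handling needed for the shards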