def cifar10_model(learning_rate, objectiveFunc, hparam, act_func):
    """Build and train a 1-hidden-layer (100 unit) CIFAR-10 classifier.

    Args:
        learning_rate: Adam learning rate.
        objectiveFunc: loss selector — "mean_sq_err", "L2_norm", or anything
            else for plain softmax cross-entropy.
        hparam: string appended to LOGDIR to name the TensorBoard run.
        act_func: hidden-layer activation, forwarded to fc_layer.

    Side effects: resets the default graph, opens an InteractiveSession,
    and writes train/test summaries under LOGDIR.  Stops early when the
    test accuracy starts to degrade.
    """
    tf.reset_default_graph()
    # Cap GPU memory so several runs can share one device.
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.4)))
    # Define input placeholders
    # images_placeholder - x (flattened images, IMAGE_PIXELS per example)
    x = tf.placeholder(tf.float32, shape=[None, IMAGE_PIXELS], name='images')
    # NOTE(review): reshaped to a single channel (32x32x1) for the summary
    # image — assumes IMAGE_PIXELS == 1024 (grayscale); confirm upstream.
    x_image = tf.reshape(x, [-1, 32, 32, 1])
    tf.summary.image('input', x_image, 3)
    # labels_placeholder - y_ (sparse int class ids)
    y_ = tf.placeholder(tf.int64, shape=[None], name='image-labels')
    # keep_prob is declared but never fed or used in this variant.
    keep_prob = tf.placeholder(tf.float32)
    # One-hot targets for the dense-label losses below.
    y = tf.one_hot(y_, 10, 1.0, 0.0, -1)
    # Single hidden layer of 100 units, then a 10-way logits layer.
    h1, W1, B1 = fc_layer(x, IMAGE_PIXELS, 100, act_func, "h1")
    logit, W2, B2 = logits(h1, 100, 10)
    Y = tf.nn.softmax(logit)
    ## changing loss function depending on objectiveFunc
    if objectiveFunc == "mean_sq_err":
        with tf.name_scope("mean_sq_err"):
            # MSE between softmax probabilities and one-hot targets.
            mean_sq_err = tf.reduce_mean(
                tf.contrib.keras.losses.mean_squared_error(Y, y))
            tf.summary.scalar("mean_sq_err", mean_sq_err)
            loss = mean_sq_err
    elif objectiveFunc == "L2_norm":
        with tf.name_scope("L2_norm"):
            # Cross-entropy plus L2 weight decay on all weights and biases.
            xent = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                logits=logit, labels=y), name="xent")
            L2_lambda = 0.05
            L2_norm = xent + \
                L2_lambda * (tf.nn.l2_loss(W1) + tf.nn.l2_loss(B1) +
                             tf.nn.l2_loss(W2) + tf.nn.l2_loss(B2))
            tf.summary.scalar("L2_norm", L2_norm)
            loss = L2_norm
    else:
        with tf.name_scope("xent"):
            # Default: plain softmax cross-entropy.
            xent = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                logits=logit, labels=y), name="xent")
            tf.summary.scalar("xent", xent)
            loss = xent
    with tf.name_scope("train"):
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    with tf.name_scope("accuracy"):
        correct_prediction = tf.equal(tf.argmax(Y, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        tf.summary.scalar("accuracy", accuracy)
    summ = tf.summary.merge_all()
    sess.run(tf.global_variables_initializer())
    # Separate writers so train/test curves overlay in TensorBoard.
    writer_train = tf.summary.FileWriter(LOGDIR + hparam + "_train")
    writer_train.add_graph(sess.graph)
    writer_test = tf.summary.FileWriter(LOGDIR + hparam + "_test")
    writer_test.add_graph(sess.graph)
    num_epochs = 200
    # Rolling record of per-epoch test accuracy, used for early stopping.
    list_test_acc = list()
    # Generate input data batches
    zipped_data = zip(data_sets['images_train'], data_sets['labels_train'])
    # batch size 100; max steps is over-provisioned relative to the
    # 500 * num_epochs steps actually consumed by the loops below.
    batches = data_helpers.gen_batch(list(zipped_data), 100,
                                     500 * 100 * num_epochs)
    for k in range(num_epochs):
        print(str(k) + "th epoch")
        # 500 steps of batch size 100 per epoch.
        for i in range(500):
            batch = next(batches)
            batch_xs, batch_ys = zip(*batch)
            feed_dict = {x: batch_xs, y_: batch_ys}
            # Every 100 steps, log train and test summaries.
            if i % 100 == 0:
                [train_accuracy, s_train] = sess.run([accuracy, summ],
                                                     feed_dict=feed_dict)
                writer_train.add_summary(s_train, k * 500 + i)
                [test_accuracy, s_test] = sess.run([accuracy, summ],
                                                   feed_dict={
                    x: data_sets['images_test'],
                    y_: data_sets['labels_test']
                })
                writer_test.add_summary(s_test, k * 500 + i)
                print("train accuracy: " + str(train_accuracy))
                print("test accuracy: " + str(test_accuracy))
            sess.run(train_step, feed_dict=feed_dict)
        # End-of-epoch test accuracy for the early-stopping window.
        test_acc = accuracy.eval(feed_dict={
            x: data_sets['images_test'],
            y_: data_sets['labels_test']
        })
        list_test_acc.append(test_acc)
        # Early stopping: abort when the mean of the last 5 epochs is worse
        # than the mean of the 5 epochs before that.
        if k > 10 and np.mean(list_test_acc[-10:-5]) > np.mean(
                list_test_acc[-5:]):
            print("Seems like it starts to overfit, aborting the training")
            break
# ----------------------------------------------------------------------------- with tf.Session() as sess: # Initialize variables and create summary-writer sess.run(tf.global_variables_initializer()) # 创建一个汇总编辑器,使其定期将日志信息保存到磁盘。 summary_writer = tf.summary.FileWriter(logdir, sess.graph) # Generate input data batches # 负责生成批输入数据。让我们假设我们有100个训练图像,批次大小为10。 # 在softmax示例中,我们只为每次迭代选择了10个随机图像,特别注意是随机。 zipped_data = zip(data_sets['images_train'], data_sets['labels_train']) # 对训练数据集的100个图像随机混洗。混洗之后的数据的前10个图像作为我们的第一个批次, # 接下来的10个图像是我们的第二批,后面的批次以此类推。10批后,在数据集的末尾,再重复混洗过程 batches = data_helpers.gen_batch(list(zipped_data), FLAGS.batch_size, FLAGS.max_steps) for i in range(FLAGS.max_steps): # Get next input data batch batch = next(batches) images_batch, labels_batch = zip(*batch) feed_dict = { images_placeholder: images_batch, labels_placeholder: labels_batch } # Periodically print out the model's current accuracy if i % 100 == 0: train_accuracy = sess.run(accuracy, feed_dict=feed_dict) print('Step {:d}, training accuracy {:g}'.format(i, train_accuracy))
# NOTE(review): this chunk begins mid-statement — the opening of the labels
# placeholder call (and the rest of the setup) is outside the visible source.
    name='image-labels')

# Build a two-layer sigmoid network; helpers come from an unseen module.
logits = sigmoid_two_layer_inference(
    images_placeholder, pixel_count, FLAGS.hidden1, CLASSES,
    reg_constant=FLAGS.reg_constant)
# build model
# NOTE(review): `loss = loss(...)` and `report = report(...)` rebind the
# helper names to their tensor results, so those helpers cannot be called
# again in this scope — it works once, but is fragile; distinct names
# (e.g. loss_op, report_op) would be safer.
loss = loss(logits, labels_placeholder)
train_step = training(loss, FLAGS.learning_rate)
accuracy = evaluation(logits, labels_placeholder)
report = report(logits, labels_placeholder)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Shuffled batch generator over the training pairs.
    zipped_data = zip(data['images_train'], data['labels_train'])
    batches = gen_batch(list(zipped_data), FLAGS.batch_size, FLAGS.max_steps)
    for i in range(FLAGS.max_steps):
        batch = next(batches)
        images_batch, labels_batch = zip(*batch)
        feed_dict = {
            images_placeholder: images_batch,
            labels_placeholder: labels_batch
        }
        # Periodic progress report on the current training batch.
        if i % 100 == 0:
            train_accuracy = sess.run(accuracy, feed_dict=feed_dict)
            print('Step {:d}, training accuracy {:g}'.format(
                i, train_accuracy))
        sess.run([train_step, loss], feed_dict=feed_dict)
def main(_): # cluster specification parameter_servers = ["spaceml1:2222"] workers = ["spaceml1:2223", "spaceml1:2224", "spaceml1:2225", "spaceml1:2226"] num_workers = len(workers) cluster = tf.train.ClusterSpec({"ps":parameter_servers, "worker":workers}) #local server, either ps or worker server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index) data_sets = data_helpers.load_data() if FLAGS.job_name == "ps": server.join() elif FLAGS.job_name == "worker": with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:%d" % FLAGS.task_index, cluster=cluster)): # Create the model x = tf.placeholder(tf.float32, shape=[None, 224, 224, 3]) y_ = tf.placeholder(tf.int64, shape=[None]) keep_prob = tf.placeholder(tf.float32) x_reshaped = tf.reshape(x, [-1, 224, 224, 3]) #First convolutional layer, (224, 224, 3) to (56, 56, 96) W_conv1 = weight_variable([11, 11, 3, 96]) #W_conv1 = tf.Variable(tf.) b_conv1 = bias_variable([96]) # convert it to (56,56,96) now h_conv1 = tf.nn.relu(conv2d(x_reshaped, W_conv1, [1, 4, 4, 1]) + b_conv1) # print h_conv1.get_shape() #max_pool1 = tf.nn.max_pool(h_conv1, ksize = [1,3,3,1], strides = [1,2,2,1], padding='SAME' # (56,56,96)->(28,28,96) norm1 = tf.nn.lrn(h_conv1, 5, bias = 1.0, alpha = 0.001 / 9.0, beta = 0.75) max_pool1 = tf.nn.max_pool(norm1, ksize = [1,3,3,1], strides = [1,2,2,1], padding='SAME') # print max_pool1.get_shape() #h_conv1 = tf.nn.relu(conv2d(x_reshaped, W_conv1, [1, 1, 1, 1]) + b_conv1 # # Second convolutional layer, (28,28,96) to (28, 28, 256) to (14,14,256) W_conv2 = weight_variable([5, 5, 96, 256]) b_conv2 = bias_variable([256]) h_conv2 = tf.nn.relu(conv2d(max_pool1, W_conv2, [1, 1, 1, 1]) + b_conv2) #print h_conv2.get_shape() #h_pool2 = tf.nn.max_pool(h_conv2, ksize = [1,3,3,1], strides = [1,2,2,1], padding='SAME' norm2 = tf.nn.lrn(h_conv2, 5, bias = 1.0, alpha = 0.001 / 9.0, beta = 0.75) h_pool2 = tf.nn.max_pool(norm2, ksize = [1,3,3,1], strides = [1,2,2,1], padding='SAME') 
# #print h_pool2.get_shape() # Third convolutional layer, (14,14,256) to (14, 14, 384) W_conv3 = weight_variable([3, 3, 256, 384]) b_conv3 = bias_variable([384]) h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3, [1, 1, 1, 1]) + b_conv3) #print h_conv3.get_shape() # # Fourth convolutional layer, (14, 14, 384) to (14, 14, 384) W_conv4 = weight_variable([3, 3, 384, 384]) b_conv4 = bias_variable([384]) h_conv4 = tf.nn.relu(conv2d(h_conv3, W_conv4, [1, 1, 1, 1]) + b_conv4) # #print h_conv4.get_shape() # Fifth convolutional layer, (14, 14, 384) to (7, 7, 256) W_conv5 = weight_variable([3, 3, 384, 256]) b_conv5 = bias_variable([256]) h_conv5 = tf.nn.relu(conv2d(h_conv4, W_conv5, [1, 1, 1, 1]) + b_conv5) max_pooling5 = tf.nn.max_pool(h_conv5, ksize = [1,3,3,1], strides = [1,2,2,1], padding='SAME') # #print max_pooling5.get_shape() # First fully-connected laye W_fc1 = relu_weight_variable([7*7*256, 4096]) b_fc1 = bias_variable([4096]) h_conv5_flat = tf.reshape(max_pooling5, [-1, 7*7*256]) #print h_conv5_flat.get_shape() h_fc1 = tf.nn.relu(fc_batch_normalization(tf.matmul(h_conv5_flat, W_fc1) + b_fc1)) h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) # # Second fully-connected laye W_fc2 = relu_weight_variable([4096,4096]) b_fc2 = bias_variable([4096]) h_fc2 = tf.nn.relu(fc_batch_normalization(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)) h_fc2_drop = tf.nn.dropout(h_fc2, keep_prob) # # Third fully-connected laye W_fc3 = relu_weight_variable([4096, num_classes]) b_fc3 = bias_variable([num_classes]) y_score = fc_batch_normalization(tf.matmul(h_fc2_drop, W_fc3) + b_fc3) y_logit = tf.nn.softmax(y_score) cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y_score, labels=y_)) train_step = tf.train.GradientDescentOptimizer(0.001).minimize(cross_entropy) correct_prediction = tf.equal(tf.argmax(y_logit, 1), y_) #y_max = tf.reduce_min(tf.reduce_max(y_logit,1)) #y_label_max = tf.reduce_max(y_) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 
#init_token_op = opt.get_init_tokens_op() #chief_queue_runner = opt.get_chief_queue_runner() saver = tf.train.Saver() init_op = tf.global_variables_initializer() sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0), logdir="/mnt/ds3lab/litian/logs", init_op=init_op, saver=saver) zipped_data = zip(data_sets['images_train'], data_sets['labels_train']) batches = data_helpers.gen_batch(list(zipped_data), 64, 50000) with sv.managed_session(server.target) as sess: begin = time.time() test_time = 0 for i in range(50000): batch = next(batches) image_batch, label_batch = zip(*batch) image_batch = np.reshape(test_batch, [-1,32,32,3]) image_batch = tf.image.resize_images(image_batch,[224,224]) image_batch = sess.run(image_batch) if i % 500 == 0 and (i / 500) % num_workers == FLAGS.task_index: test_batch = data_sets['images_test'] test_batch = np.reshape(test_batch, [-1,32,32,3]) test_batch = tf.image.resize_images(test_batch,[224,224]) test_batch=sess.run(test_batch) val_accuracy=[] for i in range (0,100): val_accuracy.append(sess.run(accuracy, feed_dict={x: test_batch[i*100:(i+1)*100], y_: data_sets['labels_test'][i*100:(i+1)*100], keep_prob:1.0})) sum_ = 0 for i in range(0, len(val_accuracy)): sum_ += val_accuracy[i] avg_accuracy = sum_ / (100*1.0) print("validation set accuracy %g" % avg_accuracy) sess.run(train_step, feed_dict={x: image_batch, y_: label_batch, keep_prob: 0.5}) if i % 50 == 0: train_accuracy = sess.run(accuracy,feed_dict={x: image_batch, y_: label_batch, keep_prob: 1.0}) train_loss = sess.run(cross_entropy, feed_dict={x: image_batch, y_: label_batch, keep_prob: 1.0}) localtime = time.asctime(time.localtime(time.time())) print (localtime) tmp = time.time() print ((tmp - begin)/60.0) print("step %d, training accuracy %g, training loss %g" % (i, train_accuracy, train_loss)) #print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels})) sv.stop()
import numpy as np
import time
from input_data_cifar import create_train_datasets
from input_data_cifar import create_test_datasets
import data_helpers
import tensorflow as tf

FLAGS = None
NUM_IMAGES = 5000
num_classes = 10

# Module-level data pipeline shared by the training entry points below.
data_sets = data_helpers.load_data()
zipped_data = zip(data_sets['images_train'], data_sets['labels_train'])
batches = data_helpers.gen_batch(list(zipped_data), 64, 50000)


# one ps and four workers
def weight_variable(shape):
    """Return a tf.Variable initialised from a truncated normal (stddev 0.1).

    Args:
        shape: list/tuple of ints giving the variable's shape.
    """
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def relu_weight_variable(shape):
    """He-style init for a ReLU dense layer: stddev = sqrt(2 / fan_in).

    Args:
        shape: 2-element [fan_in, fan_out] shape.

    Raises:
        ValueError: if shape is not 2-dimensional.
    """
    # FIX: the original used `assert len(shape) is 2` — identity comparison
    # on an int (implementation-defined) and stripped entirely under -O.
    if len(shape) != 2:
        raise ValueError(
            "relu_weight_variable expects a 2-D shape, got %r" % (shape,))
    input_size = shape[0]
    initial = tf.truncated_normal(shape, stddev=np.sqrt(2.0 / input_size))
    return tf.Variable(initial)
step, summaries, loss, accuracy, error, wrong_predictions = sess.run( [global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.error, cnn.wrong_predictions], feed_dict) # np.set_printoptions(threshold=np.inf) # wrong_pred_data=y_batch[wrong_predictions,:] # _, wrong_cls=np.where(wrong_pred_data==1) # print("wrong predictions: {}".format(Counter(wrong_cls))) # time_str = datetime.datetime.now().isoformat() # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) # if writer: # writer.add_summary(summaries, step) # else: return step, summaries, loss, accuracy, error for epoch in range(FLAGS.num_epochs): batches=data_helpers.gen_batch( list(zip(x_train, y_train)), FLAGS.batch_size) for batch in batches: x_batch, y_batch = zip(*batch) tr_step, tr_summaries, tr_loss, tr_accuracy, tr_error =train_step(x_batch, y_batch) current_step = tf.train.global_step(sess, global_step) if epoch % FLAGS.evaluate_every == 0: _, tr_summaries, tr_loss, tr_accuracy, tr_error =dev_step(x_train, y_train, writer=None) _, te_summaries, te_loss, te_accuracy, te_error =dev_step(x_dev, y_dev, writer=None) # write to summary train_summary_writer.add_summary(tr_summaries, epoch) dev_summary_writer.add_summary(te_summaries, epoch) train_summary_writer.flush() dev_summary_writer.flush() time_str = datetime.datetime.now().isoformat()
def cifar10_model(learning_rate, regularization, hparam, dropout_rate,
                  n_hidden_layer, n_hidden_unit, act_func):
    """Build and train a configurable MLP CIFAR-10 classifier.

    Args:
        learning_rate: Adam learning rate.
        regularization: "drop_out", "batch_normalization", or anything else
            for a plain fully-connected stack.
        hparam: string appended to LOGDIR to name the TensorBoard run.
        dropout_rate: keep probability fed to `keep_prob` during training
            (only effective in the drop_out configuration).
        n_hidden_layer: number of hidden layers.
        n_hidden_unit: units per hidden layer.
        act_func: activation forwarded to the layer helpers.

    Side effects: resets the default graph, opens an InteractiveSession,
    writes train/test summaries under LOGDIR, and stops early when the
    test accuracy starts to degrade.
    """
    tf.reset_default_graph()
    # Cap GPU memory so several runs can share one device.
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.7)))
    # input layer (flattened images)
    x = tf.placeholder(tf.float32, shape=[None, IMAGE_PIXELS], name='images')
    # NOTE(review): summary image assumes single-channel 32x32 input —
    # confirm IMAGE_PIXELS upstream.
    x_image = tf.reshape(x, [-1, 32, 32, 1])
    tf.summary.image('input', x_image, 3)
    # label to compare (sparse int class ids)
    y_ = tf.placeholder(tf.int64, shape=[None], name='image-labels')
    keep_prob = tf.placeholder(tf.float32)
    y = tf.one_hot(y_, 10, 1.0, 0.0, -1)
    layers = []
    if regularization == "drop_out":
        # Stack of fc layers, each followed by dropout on keep_prob.
        for i in range(n_hidden_layer):
            if i == 0:
                layers.insert(
                    i,
                    tf.nn.dropout(
                        fc_layer(x, IMAGE_PIXELS, n_hidden_unit, act_func,
                                 "h" + str(i + 1)), keep_prob))
            else:
                layers.insert(
                    i,
                    tf.nn.dropout(
                        fc_layer(layers[i - 1], n_hidden_unit, n_hidden_unit,
                                 act_func, "h" + str(i + 1)), keep_prob))
        logit, W, B = logits(layers[n_hidden_layer - 1], n_hidden_unit, 10)
    elif regularization == 'batch_normalization':
        # Stack of batch-normalised layers with a batch-normalised head.
        for i in range(n_hidden_layer):
            if i == 0:
                layers.insert(
                    i,
                    batch_layer(x, IMAGE_PIXELS, n_hidden_unit, act_func,
                                "h" + str(i + 1)))
            else:
                layers.insert(
                    i,
                    batch_layer(layers[i - 1], n_hidden_unit, n_hidden_unit,
                                act_func, "h" + str(i + 1)))
        logit = batch_logits(layers[n_hidden_layer - 1], n_hidden_unit, 10,
                             act_func)
    else:
        # Plain fully-connected stack with no regularisation.
        for i in range(n_hidden_layer):
            if i == 0:
                layers.insert(
                    i,
                    fc_layer(x, IMAGE_PIXELS, n_hidden_unit, act_func,
                             "h" + str(i + 1)))
            else:
                layers.insert(
                    i,
                    fc_layer(layers[i - 1], n_hidden_unit, n_hidden_unit,
                             act_func, "h" + str(i + 1)))
        logit, W, B = logits(layers[n_hidden_layer - 1], n_hidden_unit, 10)
    ## softmax layer - last layer for classification
    Y = tf.nn.softmax(logit)
    # loss function: softmax cross-entropy against one-hot targets
    with tf.name_scope("xent"):
        xent = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
            logits=logit, labels=y), name="xent")
        tf.summary.scalar("xent", xent)
    with tf.name_scope("train"):
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(xent)
    with tf.name_scope("accuracy"):
        correct_prediction = tf.equal(tf.argmax(Y, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        tf.summary.scalar("accuracy", accuracy)
    summ = tf.summary.merge_all()
    sess.run(tf.global_variables_initializer())
    # Separate writers so train/test curves overlay in TensorBoard.
    writer_train = tf.summary.FileWriter(LOGDIR + hparam + "_train")
    writer_train.add_graph(sess.graph)
    writer_test = tf.summary.FileWriter(LOGDIR + hparam + "_test")
    writer_test.add_graph(sess.graph)
    num_epochs = 200
    # Per-epoch test accuracy, used for early stopping.
    list_test_acc = list()
    # Generate input data batches
    zipped_data = zip(data_sets['images_train'], data_sets['labels_train'])
    # batch size : 100, max steps : (steps in a single epoch) * num of epochs
    batches = data_helpers.gen_batch(list(zipped_data), 100,
                                     500 * 100 * num_epochs)
    for k in range(num_epochs):
        print(str(k) + "th epoch")
        for i in range(500):
            batch = next(batches)
            batch_xs, batch_ys = zip(*batch)
            # Every 100 steps, log summaries with dropout disabled.
            if i % 100 == 0:
                [train_accuracy, s_train] = sess.run([accuracy, summ],
                                                     feed_dict={
                                                         x: batch_xs,
                                                         y_: batch_ys,
                                                         keep_prob: 1
                                                     })
                writer_train.add_summary(s_train, k * 500 + i)
                [test_accuracy, s_test] = sess.run(
                    [accuracy, summ],
                    feed_dict={
                        x: data_sets['images_test'],
                        y_: data_sets['labels_test'],
                        keep_prob: 1
                    })
                writer_test.add_summary(s_test, k * 500 + i)
                print('Step {:d}, training accuracy {:g}'.format(
                    k * 500 + i, train_accuracy))
                print('Step {:d}, test accuracy {:g}'.format(
                    k * 500 + i, test_accuracy))
            # dropout_rate will only be used when dropout is enabled
            sess.run(train_step,
                     feed_dict={
                         x: batch_xs,
                         y_: batch_ys,
                         keep_prob: dropout_rate
                     })
        test_acc = accuracy.eval(
            feed_dict={
                x: data_sets['images_test'],
                y_: data_sets['labels_test'],
                keep_prob: 1
            })
        list_test_acc.append(test_acc)
        # use early stopping: abort once the mean of the last 5 epochs is
        # worse than the mean of the 5 epochs before that.
        if k > 10 and np.mean(list_test_acc[-10:-5]) > np.mean(
                list_test_acc[-5:]):
            print("Seems like it starts to overfit, aborting the training")
            break
def main(_):
    """Run one task of a 4-ps / 4-worker CIFAR-10 trainer in which each
    worker owns a replica of the model on "its" parameter server and the
    gradient of each variable is corrected toward the average of the two
    neighbouring replicas (a decentralised/consensus-SGD style update).
    """
    # cluster specification
    # parameter_servers = ["sgs-gpu-02:2222", "sgs-gpu-02:2223", "sgs-gpu-03:2222", "sgs-gpu-03:2223"]
    # workers = ["sgs-gpu-02:2224", "sgs-gpu-02:2225", "sgs-gpu-03:2224", "sgs-gpu-03:2225"]
    parameter_servers = [
        "spaceml1:2222", "spaceml1:2223", "spaceml1:2224", "spaceml1:2225"
    ]
    workers = [
        "spaceml1:2226", "spaceml1:2227", "spaceml1:2228", "spaceml1:2229"
    ]
    num_ps = len(parameter_servers)
    num_worker = num_ps
    cluster = tf.train.ClusterSpec({
        "ps": parameter_servers,
        "worker": workers
    })
    # local server, either ps or worker
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    data_sets = data_helpers.load_data()
    # One weight/bias replica per parameter server; slots are filled below.
    W1 = [0, 0, 0, 0]
    b1 = [0, 0, 0, 0]
    W2 = [0, 0, 0, 0]
    b2 = [0, 0, 0, 0]
    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        # Replica 0 pinned to ps task 0.
        with tf.device("/job:ps/task:0"):
            W1[0] = tf.get_variable(
                name='w10',
                shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            # W1[0] = tf.Variable(tf.random_normal([3072,240]))
            b1[0] = tf.Variable(tf.zeros([240]))
            # NOTE(review): stddev uses sqrt(120) although the fan-in of this
            # [240, 10] layer is 240 — other chunks use sqrt(240); confirm.
            W2[0] = tf.get_variable(
                name='w20',
                shape=[240, 10],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(120))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            #W2[0] = tf.Variable(tf.random_normal([240,10]))
            b2[0] = tf.Variable(tf.zeros([10]))
        # Replica 1 pinned to ps task 1.
        with tf.device("/job:ps/task:1"):
            W1[1] = tf.get_variable(
                name='w11',
                shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            #W1[1] = tf.Variable(tf.random_normal([3072,240]))
            b1[1] = tf.Variable(tf.zeros([240]))
            W2[1] = tf.get_variable(
                name='w21',
                shape=[240, 10],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(120))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            # W2[1] = tf.Variable(tf.random_normal([240,10]))
            b2[1] = tf.Variable(tf.zeros([10]))
        # Replica 2 pinned to ps task 2.
        with tf.device("/job:ps/task:2"):
            W1[2] = tf.get_variable(
                name='w12',
                shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            #W1[2] = tf.Variable(tf.random_normal([3072,240]))
            b1[2] = tf.Variable(tf.zeros([240]))
            W2[2] = tf.get_variable(
                name='w22',
                shape=[240, 10],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(120))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            #W2[2] = tf.Variable(tf.random_normal([240,10]))
            b2[2] = tf.Variable(tf.zeros([10]))
        # Replica 3 pinned to ps task 3.
        with tf.device("/job:ps/task:3"):
            W1[3] = tf.get_variable(
                name='w13',
                shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            # W1[3] = tf.Variable(tf.random_normal([3072,240]))
            b1[3] = tf.Variable(tf.zeros([240]))
            W2[3] = tf.get_variable(
                name='w23',
                shape=[240, 10],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(120))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            #W2[3] = tf.Variable(tf.random_normal([240,10]))
            b2[3] = tf.Variable(tf.zeros([10]))
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % FLAGS.task_index,
                    cluster=cluster)):
            # Create the model: each worker trains ITS replica only.
            x = tf.placeholder(tf.float32, shape=[None, 3072])
            y_ = tf.placeholder(tf.int64, shape=[None])
            h1 = tf.nn.relu(
                tf.matmul(x, W1[FLAGS.task_index]) + b1[FLAGS.task_index])
            y = tf.matmul(h1, W2[FLAGS.task_index]) + b2[FLAGS.task_index]
            cross_entropy = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_,
                                                               logits=y))
            opt = tf.train.GradientDescentOptimizer(FLAGS.lr)
            # Gradients w.r.t. this worker's replica, in a fixed order:
            # [W1, b1, W2, b2].
            grads_and_vars = opt.compute_gradients(cross_entropy, [
                W1[FLAGS.task_index], b1[FLAGS.task_index],
                W2[FLAGS.task_index], b2[FLAGS.task_index]
            ])
            # w = W2[FLAGS.task_index]
            # b = b2[FLAGS.task_index]
            # Each gradient is shifted by the discrete Laplacian of the ring
            # of replicas, (left + right - 2*self) / (3*lr), pulling the
            # replicas toward consensus with their neighbours.
            new_gv0 = (grads_and_vars[0][0] -
                       (W1[(FLAGS.task_index - 1) % num_ps] +
                        W1[(FLAGS.task_index + 1) % num_ps] -
                        2 * W1[FLAGS.task_index]) / (3 * FLAGS.lr * 1.0),
                       grads_and_vars[0][1])
            new_gv1 = (grads_and_vars[1][0] -
                       (b1[(FLAGS.task_index - 1) % num_ps] +
                        b1[(FLAGS.task_index + 1) % num_ps] -
                        2 * b1[FLAGS.task_index]) / (3 * FLAGS.lr * 1.0),
                       grads_and_vars[1][1])
            new_gv2 = (grads_and_vars[2][0] -
                       (W2[(FLAGS.task_index - 1) % num_ps] +
                        W2[(FLAGS.task_index + 1) % num_ps] -
                        2 * W2[FLAGS.task_index]) / (3 * FLAGS.lr * 1.0),
                       grads_and_vars[2][1])
            new_gv3 = (grads_and_vars[3][0] -
                       (b2[(FLAGS.task_index - 1) % num_ps] +
                        b2[(FLAGS.task_index + 1) % num_ps] -
                        2 * b2[FLAGS.task_index]) / (3 * FLAGS.lr * 1.0),
                       grads_and_vars[3][1])
            #print b1[FLAGS.task_index]
            # NOTE(review): `g` is assigned but never used afterwards.
            g = grads_and_vars[1][0]
            new_gv = list()
            new_gv.append(new_gv0)
            new_gv.append(new_gv1)
            new_gv.append(new_gv2)
            new_gv.append(new_gv3)
            train_step = opt.apply_gradients(new_gv)
            correct_prediction = tf.equal(tf.argmax(y, 1), y_)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction,
                                              tf.float32))
        saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()
        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 logdir="/mnt/ds3lab/litian/logs",
                                 init_op=init_op,
                                 saver=saver)
        zipped_data = zip(data_sets['images_train'],
                          data_sets['labels_train'])
        batches = data_helpers.gen_batch(list(zipped_data), 128, 50000)
        with sv.managed_session(server.target) as sess:
            begin = time.time()
            for i in range(50000):
                batch = next(batches)
                image_batch, label_batch = zip(*batch)
                sess.run(train_step,
                         feed_dict={
                             x: image_batch,
                             y_: label_batch
                         })
                # Every 50 steps, log wall-clock time and batch metrics.
                if i % 50 == 0:
                    train_accuracy = sess.run(accuracy,
                                              feed_dict={
                                                  x: image_batch,
                                                  y_: label_batch
                                              })
                    train_loss = sess.run(cross_entropy,
                                          feed_dict={
                                              x: image_batch,
                                              y_: label_batch
                                          })
                    localtime = time.asctime(time.localtime(time.time()))
                    print(localtime)
                    tmp = time.time()
                    print((tmp - begin) / 60.0)
                    print(
                        "step %d, training accuracy %g, training loss %g" %
                        (i, train_accuracy, train_loss))
        sv.stop()
def main(_):
    """Train a single-process 1-hidden-layer CIFAR-10 classifier.

    Builds a 3072 -> 240 (ReLU) -> 10 network with L2-regularised weights,
    trains it with SGD (lr 0.0005) for 50000 steps of batch size 128, and
    periodically prints training and validation metrics.
    """
    data_sets = data_helpers.load_data()
    # Create the model
    x = tf.placeholder(tf.float32, shape=[None, 3072])
    y_ = tf.placeholder(tf.int64, shape=[None])
    w1 = tf.get_variable(name='w1',
                         shape=[3072, 240],
                         initializer=tf.truncated_normal_initializer(
                             stddev=1.0 / np.sqrt(float(3072))),
                         regularizer=tf.contrib.layers.l2_regularizer(0.1))
    b1 = tf.Variable(tf.zeros([240]))
    h1 = tf.nn.relu(tf.matmul(x, w1) + b1)
    w2 = tf.get_variable(name='w2',
                         shape=[240, 10],
                         initializer=tf.truncated_normal_initializer(
                             stddev=1.0 / np.sqrt(float(240))),
                         regularizer=tf.contrib.layers.l2_regularizer(0.1))
    b2 = tf.Variable(tf.zeros([10]))
    y = tf.matmul(h1, w2) + b2

    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_, logits=y))
    train_step = tf.train.GradientDescentOptimizer(0.0005).minimize(
        cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y, 1), y_)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # FIX: use a context manager so the session is always closed, and
    # replace the deprecated tf.initialize_all_variables() with
    # tf.global_variables_initializer() (already used elsewhere in this file).
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        zipped_data = zip(data_sets['images_train'],
                          data_sets['labels_train'])
        batches = data_helpers.gen_batch(list(zipped_data), 128, 50000)
        for i in range(50000):
            batch = next(batches)
            image_batch, label_batch = zip(*batch)
            sess.run(train_step, feed_dict={x: image_batch, y_: label_batch})
            # Every 50 steps: report metrics on the current training batch.
            if i % 50 == 0:
                train_accuracy = sess.run(accuracy,
                                          feed_dict={
                                              x: image_batch,
                                              y_: label_batch
                                          })
                train_loss = sess.run(cross_entropy,
                                      feed_dict={
                                          x: image_batch,
                                          y_: label_batch
                                      })
                localtime = time.asctime(time.localtime(time.time()))
                print(localtime)
                print("step %d, training accuracy %g, training loss %g" %
                      (i, train_accuracy, train_loss))
            # Every 500 steps: evaluate on the held-out test set.
            if i % 500 == 0:
                val_accuracy = sess.run(accuracy,
                                        feed_dict={
                                            x: data_sets['images_test'],
                                            y_: data_sets['labels_test']
                                        })
                print("validation set accuracy %g" % val_accuracy)
def main(_):
    """Run one process of a distributed 1-hidden-layer CIFAR-10 trainer.

    The cluster is one parameter server plus four workers on spaceml1.
    A "ps" task blocks serving variables; a "worker" task builds the
    3072 -> 240 (ReLU) -> 10 model and runs 40000 asynchronous SGD steps,
    logging batch accuracy/loss and elapsed minutes every 50 steps.
    """
    # Static cluster layout.
    ps_hosts = ["spaceml1:2222"]
    worker_hosts = [
        "spaceml1:2223", "spaceml1:2224", "spaceml1:2225", "spaceml1:2226"
    ]
    num_workers = len(worker_hosts)
    cluster = tf.train.ClusterSpec({
        "ps": ps_hosts,
        "worker": worker_hosts
    })
    # The in-process server for this task (either ps or worker).
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    data_sets = data_helpers.load_data()
    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        device_fn = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)
        with tf.device(device_fn):
            # Model: one hidden layer of 240 ReLU units over 3072 inputs,
            # both weight matrices L2-regularised.
            x = tf.placeholder(tf.float32, shape=[None, 3072])
            y_ = tf.placeholder(tf.int64, shape=[None])
            hidden_w = tf.get_variable(
                name='w1',
                shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            hidden_b = tf.Variable(tf.zeros([240]))
            hidden = tf.nn.relu(tf.matmul(x, hidden_w) + hidden_b)
            out_w = tf.get_variable(
                name='w2',
                shape=[240, 10],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(240))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            out_b = tf.Variable(tf.zeros([10]))
            y = tf.matmul(hidden, out_w) + out_b

            cross_entropy = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_,
                                                               logits=y))
            train_step = tf.train.GradientDescentOptimizer(0.0005).minimize(
                cross_entropy)
            is_correct = tf.equal(tf.argmax(y, 1), y_)
            accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

        #init_token_op = opt.get_init_tokens_op()
        #chief_queue_runner = opt.get_chief_queue_runner()
        saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()
        # Worker 0 acts as chief: it initialises variables and checkpoints.
        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 logdir="/mnt/ds3lab/litian/logs",
                                 init_op=init_op,
                                 saver=saver)
        training_pairs = zip(data_sets['images_train'],
                             data_sets['labels_train'])
        batch_iter = data_helpers.gen_batch(list(training_pairs), 128, 40000)
        with sv.managed_session(server.target) as sess:
            start = time.time()
            for step in range(40000):
                xs, ys = zip(*next(batch_iter))
                batch_feed = {x: xs, y_: ys}
                sess.run(train_step, feed_dict=batch_feed)
                if step % 50 == 0:
                    acc_val = sess.run(accuracy, feed_dict=batch_feed)
                    loss_val = sess.run(cross_entropy, feed_dict=batch_feed)
                    # Timestamp, minutes elapsed, then batch metrics.
                    print(time.asctime(time.localtime(time.time())))
                    print((time.time() - start) / 60.0)
                    print("step %d, training accuracy %g, training loss %g"
                          % (step, acc_val, loss_val))
        #print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
        sv.stop()
def main(_):
    """Run one task of a synchronous-SGD CIFAR-10 trainer.

    One parameter server and four workers; gradients from all four workers
    are aggregated per step via tf.train.SyncReplicasOptimizer.  Worker 0
    is chief and also starts the sync queue runners and init tokens.
    """
    # cluster specification
    # in order to prevent ps from occupying GPUs, first start workers, then start parameter servers
    parameter_servers = ["sgs-gpu-02:2222"]
    workers = ["sgs-gpu-02:2223", "sgs-gpu-02:2224", "sgs-gpu-03:2222",
               "sgs-gpu-03:2223"]
    num_workers = len(workers)
    cluster = tf.train.ClusterSpec({"ps": parameter_servers,
                                    "worker": workers})
    # local server, either ps or worker
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    data_sets = data_helpers.load_data()
    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % FLAGS.task_index,
                cluster=cluster)):
            # Create the model
            # global_step is shared so SyncReplicasOptimizer can coordinate.
            global_step = tf.get_variable(
                'global_step', [],
                initializer=tf.constant_initializer(0),
                trainable=False)
            x = tf.placeholder(tf.float32, shape=[None, 3072])
            y_ = tf.placeholder(tf.int64, shape=[None])
            w1 = tf.get_variable(
                name='w1', shape=[3072, 240],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(3072))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            b1 = tf.Variable(tf.zeros([240]))
            h1 = tf.nn.relu(tf.matmul(x, w1) + b1)
            # NOTE(review): stddev uses sqrt(120) although the fan-in of this
            # [240, 10] layer is 240 — other chunks use sqrt(240); confirm.
            w2 = tf.get_variable(
                name='w2', shape=[240, 10],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(float(120))),
                regularizer=tf.contrib.layers.l2_regularizer(0.1))
            b2 = tf.Variable(tf.zeros([10]))
            y = tf.matmul(h1, w2) + b2
            cross_entropy = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_,
                                                               logits=y))
            # Wrap plain SGD so each step waits for gradients from all
            # num_workers replicas before applying the averaged update.
            opt = tf.train.GradientDescentOptimizer(0.0005)
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=num_workers,
                total_num_replicas=num_workers)
            train_step = opt.minimize(cross_entropy, global_step=global_step)
            correct_prediction = tf.equal(tf.argmax(y, 1), y_)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction,
                                              tf.float32))
        #init_token_op = opt.get_init_tokens_op()
        #chief_queue_runner = opt.get_chief_queue_runner()
        saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()
        # Sync-replica bookkeeping the chief must run (see below).
        init_token_op = opt.get_init_tokens_op()
        chief_queue_runner = opt.get_chief_queue_runner()
        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 logdir="/mnt/ds3lab/litian/logs",
                                 init_op=init_op,
                                 saver=saver,
                                 global_step=global_step)
        zipped_data = zip(data_sets['images_train'],
                          data_sets['labels_train'])
        batches = data_helpers.gen_batch(list(zipped_data), 128, 50000)
        # start a session
        sess = sv.prepare_or_wait_for_session(server.target)
        # Only the chief starts the aggregation queue runner and seeds the
        # sync tokens; other workers would deadlock without this.
        if FLAGS.task_index == 0:
            sv.start_queue_runners(sess, [chief_queue_runner])
            sess.run(init_token_op)
        for i in range(50000):
            batch = next(batches)
            image_batch, label_batch = zip(*batch)
            sess.run(train_step, feed_dict={x: image_batch, y_: label_batch})
            # Every 50 steps: metrics on the current training batch.
            if i % 50 == 0:
                train_accuracy = sess.run(accuracy,
                                          feed_dict={x: image_batch,
                                                     y_: label_batch})
                train_loss = sess.run(cross_entropy,
                                      feed_dict={x: image_batch,
                                                 y_: label_batch})
                localtime = time.asctime(time.localtime(time.time()))
                print(localtime)
                print("step %d, training accuracy %g, training loss %g" %
                      (i, train_accuracy, train_loss))
            # Every 500 steps: evaluate on the held-out test set.
            if i % 500 == 0:
                val_accuracy = sess.run(accuracy,
                                        feed_dict={
                                            x: data_sets['images_test'],
                                            y_: data_sets['labels_test']
                                        })
                print("validation set accuracy %g" % val_accuracy)
        #print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
        sv.stop()