def map_fun(context):
    tf.compat.v1.disable_v2_behavior()
    print(tf.__version__)
    sys.stdout.flush()
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    jobName = context.jobName
    index = context.index
    clusterStr = context.properties["cluster"]
    delim = context.properties["SYS:delim"]
    epochs = int(context.properties["epochs"])
    data_file = context.properties["data.file"]
    print(index, clusterStr)
    sys.stdout.flush()
    clusterJson = json.loads(clusterStr)
    cluster = tf.compat.v1.train.ClusterSpec(cluster=clusterJson)
    server = tf.compat.v1.train.Server(cluster, job_name=jobName, task_index=index)
    sess_config = tf.compat.v1.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    with tf.compat.v1.device(
            tf.compat.v1.train.replica_device_setter(
                worker_device='/job:worker/task:' + str(index),
                cluster=cluster)):
        filename_queue = tf.compat.v1.train.string_input_producer([data_file], num_epochs=epochs)
        reader = tf.compat.v1.TextLineReader()
        key, value = reader.read(filename_queue)
        global_step = tf.compat.v1.train.get_or_create_global_step()
        global_step_inc = tf.compat.v1.assign_add(global_step, 1)
        is_chief = (index == 0)

        print(datetime.now().isoformat() + " started ------------------------------------")
        t = time.time()
        total_step = 0
        try:
            with tf.compat.v1.train.MonitoredTrainingSession(
                    master=server.target,
                    is_chief=is_chief,
                    config=sess_config,
                    checkpoint_dir="./target/tmp/input_output/" + str(t)) as mon_sess:
                # while not mon_sess.should_stop():
                while True:
                    total_step, _, _ = mon_sess.run([global_step_inc, key, value])
                    if total_step % 10000 == 0:
                        log_speed(total_step, t)
        except Exception as e:
            print('traceback.print_exc():')
            traceback.print_exc()
            sys.stdout.flush()
        finally:
            print(datetime.now().isoformat() + " ended --------------------------------------")
            log_speed(total_step, t)
            SummaryWriterCache.clear()

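
# NOTE (assumption): log_speed() is called in several of these snippets but is not
# defined in this excerpt. A minimal sketch of what it presumably does -- report
# throughput since the start timestamp -- with an illustrative name and print format:
import sys
import time


def log_speed(total_step, start_time):
    # Hypothetical helper: print steps processed per second since start_time.
    elapsed = time.time() - start_time
    if elapsed > 0:
        print("steps: %d, elapsed: %.1f s, speed: %.1f steps/s"
              % (total_step, elapsed, total_step / elapsed))
    sys.stdout.flush()
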
def map_func(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print("cluster:" + str(cluster_json))
    print("job name:" + job_name)
    print("current index:" + str(index))
    sys.stdout.flush()
    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    t = time.time()
    if 'ps' == job_name:
        from time import sleep
        while True:
            sleep(1)
    else:
        with tf.device(tf.train.replica_device_setter(
                worker_device='/job:worker/task:' + str(index),
                cluster=cluster)):
            train_ops = build_graph()
            print("python worker index:" + str(index))
            sys.stdout.flush()
            try:
                hooks = [tf.train.StopAtStepHook(last_step=2)]
                with tf.train.MonitoredTrainingSession(
                        master=server.target,
                        config=sess_config,
                        checkpoint_dir="./target/tmp/s1/" + str(t),
                        hooks=hooks) as mon_sess:
                    while not mon_sess.should_stop():
                        print(mon_sess.run(train_ops, feed_dict={a: [1.0, 2.0, 3.0]}))
                        sys.stdout.flush()
                        time.sleep(1)
            finally:
                SummaryWriterCache.clear()

def map_fun(context):
    print(tf.__version__)
    sys.stdout.flush()
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()
    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    if 'ps' == job_name:
        from time import sleep
        while True:
            sleep(1)
    else:
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):
            record_defaults = [[9],
                               [tf.constant(value=9, dtype=tf.int64)],
                               [9.0],
                               [tf.constant(value=9.0, dtype=tf.float64)],
                               ["9.0"]]
            dataset = context.flinkStreamDataSet(buffer_size=0)
            dataset = dataset.map(lambda record: tf.decode_csv(
                record, record_defaults=record_defaults))
            dataset = dataset.batch(3)
            iterator = dataset.make_one_shot_iterator()
            input_records = iterator.get_next()
            global_step = tf.train.get_or_create_global_step()
            global_step_inc = tf.assign_add(global_step, 1)
            out_list = [input_records[0], input_records[2], input_records[4]]
            out = tff_ops.encode_csv(input_list=out_list)
            is_chief = (index == 0)
            t = time.time()
            try:
                with tf.train.MonitoredTrainingSession(
                        master=server.target,
                        is_chief=is_chief,
                        config=sess_config,
                        checkpoint_dir="./target/tmp/input_output/" + str(t)) as mon_sess:
                    # while not mon_sess.should_stop():
                    while True:
                        print(index, mon_sess.run([global_step_inc, out]))
                        sys.stdout.flush()
                        # time.sleep(1)
            except Exception as e:
                print('traceback.print_exc():')
                traceback.print_exc()
                sys.stdout.flush()
            finally:
                SummaryWriterCache.clear()

def map_fun(context):
    print(tf.__version__)
    sys.stdout.flush()
    tf.logging.set_verbosity(tf.logging.ERROR)
    jobName = context.jobName
    index = context.index
    clusterStr = context.properties["cluster"]
    delim = context.properties["SYS:delim"]
    print(index, clusterStr)
    sys.stdout.flush()
    clusterJson = json.loads(clusterStr)
    cluster = tf.train.ClusterSpec(cluster=clusterJson)
    server = tf.train.Server(cluster, job_name=jobName, task_index=index)
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    with tf.device(
            tf.train.replica_device_setter(
                worker_device='/job:worker/task:' + str(index),
                cluster=cluster)):
        dataset = context.flinkStreamDataSet(buffer_size=0)
        iterator = dataset.make_one_shot_iterator()
        input_records = iterator.get_next()
        global_step = tf.contrib.framework.get_or_create_global_step()
        global_step_inc = tf.assign_add(global_step, 1)
        is_chief = (index == 0)
        print(datetime.now().isoformat() + " started ------------------------------------")
        t = time.time()
        total_step = 0
        try:
            with tf.train.MonitoredTrainingSession(
                    master=server.target,
                    is_chief=is_chief,
                    config=sess_config,
                    checkpoint_dir="./target/tmp/input_output/" + str(t)) as mon_sess:
                # while not mon_sess.should_stop():
                while True:
                    total_step, _ = mon_sess.run([global_step_inc, input_records])
                    if total_step % 10000 == 0:
                        log_speed(total_step, t)
        except Exception as e:
            print('traceback.print_exc():')
            traceback.print_exc()
            sys.stdout.flush()
        finally:
            print(datetime.now().isoformat() + " ended --------------------------------------")
            log_speed(total_step, t)
            SummaryWriterCache.clear()

def map_fun(context):
    tf.compat.v1.disable_v2_behavior()
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()
    cluster = tf.compat.v1.train.ClusterSpec(cluster=cluster_json)
    server = tf.compat.v1.train.Server(cluster, job_name=job_name, task_index=index)
    sess_config = tf.compat.v1.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    if 'ps' == job_name:
        from time import sleep
        while True:
            sleep(1)
    else:
        with tf.compat.v1.device(
                tf.compat.v1.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):
            global_step = tf.compat.v1.train.get_or_create_global_step()
            global_step_inc = tf.compat.v1.assign_add(global_step, 1)
            input_records = [
                tf.constant([1, 2, 3]),
                tf.constant([1.0, 2.0, 3.0]),
                tf.constant(['1.0', '2.0', '3.0'])
            ]
            out = tff_ops.encode_csv(input_list=input_records, field_delim='|')
            fw = tff_ops.FlinkTFRecordWriter(address=context.toFlink())
            w = fw.write([out])
            is_chief = (index == 0)
            t = time.time()
            try:
                hooks = [tf.compat.v1.train.StopAtStepHook(last_step=50)]
                with tf.compat.v1.train.MonitoredTrainingSession(
                        master=server.target,
                        config=sess_config,
                        is_chief=is_chief,
                        checkpoint_dir="./target/tmp/with_output/" + str(t),
                        hooks=hooks) as mon_sess:
                    while not mon_sess.should_stop():
                        print(index, mon_sess.run([global_step_inc, w]))
                        sys.stdout.flush()
                        time.sleep(1)
            finally:
                SummaryWriterCache.clear()

def begin(self):
    self._counters = tf.get_collection("counters")
    if not self._counters:
        return
    if self._summary_writer is None and self._output_dir:
        self._summary_writer = SummaryWriterCache.get(self._output_dir)
    self._last_count = [None for _ in self._counters]
    self._global_step = tf.train.get_global_step()
    if self._global_step is None:
        raise RuntimeError("Global step should be created to use WordCounterHook.")

def map_fun(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    task_index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()

    props = context.properties
    batch_size = int(props.get("batch_size"))
    checkpoint_dir = props.get("checkpoint_dir")
    export_dir = props.get("export_dir")

    # Parameters
    IMAGE_PIXELS = 28
    hidden_units = 128

    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)

    def feed_dict(images, labels):
        xs = numpy.array(images)
        xs = xs.astype(numpy.float32)
        xs = xs / 255.0
        ys = numpy.array(labels)
        ys = ys.astype(numpy.uint8)
        return (xs, ys)

    if job_name == "ps":
        from time import sleep
        while True:
            sleep(1)
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:" + str(task_index),
                    cluster=cluster)):
            # Placeholders or QueueRunner/Readers for input data
            x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
            y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

            # Variables of the hidden layer
            hid_w = tf.Variable(
                tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                                    stddev=1.0 / IMAGE_PIXELS), name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            # Variables of the softmax layer
            sm_w = tf.Variable(
                tf.truncated_normal([hidden_units, 10],
                                    stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.train.get_or_create_global_step()
            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            train_op = tf.train.AdagradOptimizer(0.01).minimize(loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

            iter = input_iter(tf_context, batch_size)
            next_batch = iter.get_next()

            is_chief = (task_index == 0)
            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=False,
                device_filters=["/job:ps", "/job:worker/task:%d" % task_index])

            # The MonitoredTrainingSession takes care of session initialization, restoring from
            # a checkpoint, and closing when done or an error occurs
            mon_sess = tf.train.MonitoredTrainingSession(
                master=server.target,
                is_chief=is_chief,
                checkpoint_dir=checkpoint_dir,
                stop_grace_period_secs=10,
                max_wait_secs=300,
                config=sess_config,
                chief_only_hooks=[ExportHook(export_dir, x, prediction)])

            processed = 0
            while not mon_sess.should_stop():
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                try:
                    images, labels = mon_sess.run(next_batch)
                    processed += images.shape[0]
                    # print mon_sess.run(next_batch)
                except tf.errors.OutOfRangeError:
                    break

                batch_xs, batch_ys = feed_dict(images, labels)
                feed = {x: batch_xs, y_: batch_ys}

                if len(batch_xs) > 0 and not mon_sess.should_stop():
                    _, step = mon_sess.run([train_op, global_step], feed_dict=feed)
                    if step % 100 == 0:
                        print("{0}, Task {1} step: {2} accuracy: {3}".format(
                            datetime.now().isoformat(), task_index, step,
                            mon_sess.run(accuracy, {x: batch_xs, y_: batch_ys})))
                        sys.stdout.flush()

            print(str(processed) + " records processed.")
            print("{0} stopping MonitoredTrainingSession".format(datetime.now().isoformat()))
            mon_sess.close()
            SummaryWriterCache.clear()

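
# NOTE (assumption): input_iter() is not defined in this excerpt. The sketch below
# assumes it exposes the Flink input stream as a tf.data.Dataset of serialized
# tf.train.Example records with an 'image' float vector and a one-hot 'label';
# the dataset accessor and the feature layout are assumptions, not the project's
# actual reader.
def input_iter(tf_context, batch_size, image_pixels=28):
    # Assumed accessor: a tf.data.Dataset of serialized Example protos from Flink.
    dataset = tf_context.flink_stream_dataset()

    def parse(serialized):
        features = tf.parse_single_example(
            serialized,
            features={
                'image': tf.FixedLenFeature([image_pixels * image_pixels], tf.float32),
                'label': tf.FixedLenFeature([10], tf.float32),
            })
        return features['image'], features['label']

    dataset = dataset.map(parse).batch(batch_size)
    return dataset.make_one_shot_iterator()
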
def __init__(self, labels_file=None, output_dir=None):
    self._labels_file = labels_file
    self._summary_writer = None
    if output_dir is not None:
        self._summary_writer = SummaryWriterCache.get(output_dir)

def map_fun(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    task_index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()

    props = context.properties
    batch_size = int(props.get("batch_size"))
    epochs = int(props.get("epochs"))
    checkpoint_dir = props.get("checkpoint_dir")
    export_dir = props.get("export_dir")

    # Parameters
    IMAGE_PIXELS = 28
    hidden_units = 128

    session = tf.Session()
    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    input_key = 'image'
    output_key = 'prediction'
    # saved_model_dir = '/home/chen/code/TensorFlowOnFlink/target/export/1539071075170/'
    test_log("before load")
    meta_graph_def = tf.saved_model.loader.load(
        session, [tf.saved_model.tag_constants.SERVING], export_dir=export_dir)
    test_log("after load")
    signature = meta_graph_def.signature_def
    x_tensor_name = signature[signature_key].inputs[input_key].name
    test_log(x_tensor_name)
    y_tensor_name = signature[signature_key].outputs[output_key].name
    test_log(y_tensor_name)
    x = session.graph.get_tensor_by_name(x_tensor_name)
    y = session.graph.get_tensor_by_name(y_tensor_name)

    # write_feed, write_op, close_op = context.getOutputWriterOp()
    write_feed = tf.placeholder(dtype=tf.string)
    write_op, close_op = tf_context.output_writer_op([write_feed])

    iter = input_iter(tf_context, batch_size)
    next_batch = iter.get_next()
    prediction = tf.argmax(next_batch[1], 1, name="prediction")

    while True:
        try:
            images, labels, labels_ = session.run(
                [next_batch[0], next_batch[1], prediction])
        except tf.errors.OutOfRangeError:
            break
        batch_xs, batch_ys = feed_dict(images, labels)
        feed = {x: batch_xs}
        # test_log(feed_data)
        # input_res = session.run(x_, feed_dict={input_data: feed_data})
        # print "input_data", input_res
        y_res = session.run(y, feed_dict=feed)
        # print "y_res", y_res, "y_org", labels_
        # sys.stdout.flush()
        for i in range(len(y_res)):
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'predict_label': tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[y_res[i]])),
                    'label_org': tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[labels_[i]])),
                }))
            # print "write:", i
            sys.stdout.flush()
            session.run(write_op, feed_dict={write_feed: example.SerializeToString()})
    session.run(close_op)
    SummaryWriterCache.clear()

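
# NOTE (assumption): test_log() is not defined in this excerpt; it appears to be a
# simple timestamped logging helper. A minimal sketch of the assumed behavior:
import sys
from datetime import datetime


def test_log(message):
    # Print a timestamped message and flush so it shows up promptly in the task logs.
    print("%s %s" % (datetime.now().isoformat(), message))
    sys.stdout.flush()
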
dataset = context.flinkStreamDataSet(buffer_size=0)
iterator = dataset.make_one_shot_iterator()
input_records = iterator.get_next()
global_step = tf.contrib.framework.get_or_create_global_step()
global_step_inc = tf.assign_add(global_step, 1)
is_chief = (index == 0)
print(datetime.now().isoformat() + " started ------------------------------------")
t = time.time()
total_step = 0
try:
    with tf.train.MonitoredTrainingSession(
            master=server.target,
            is_chief=is_chief,
            config=sess_config,
            checkpoint_dir="./target/tmp/input_output/" + str(t)) as mon_sess:
        # while not mon_sess.should_stop():
        while True:
            total_step, _ = mon_sess.run([global_step_inc, input_records])
            if total_step % 10000 == 0:
                log_speed(total_step, t)
except Exception as e:
    print('traceback.print_exc():')
    traceback.print_exc()
    sys.stdout.flush()
finally:
    print(datetime.now().isoformat() + " ended --------------------------------------")
    log_speed(total_step, t)
    SummaryWriterCache.clear()


if __name__ == "__main__":
    map_fun(context)