def train_and_eval():
  """Trains and evaluates MeshTensorflow model without TPUEstimator.

  Initializes the TPU system once, builds a MeshContext, then runs
  FLAGS.num_training_loops alternating train/eval phases before shutting
  the TPU system down (when FLAGS.use_tpu is set).

  TODO(lehou): Pack everything nicely as a set of APIs.
  """
  mesh_context = None
  tf.logging.info('FLAGS.master: {}'.format(FLAGS.master))
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(FLAGS.master)
  config = tf.ConfigProto()
  config.allow_soft_placement = True
  cluster_spec = resolver.cluster_spec()
  if cluster_spec:
    config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
  # Short-lived session used only to initialize the TPU system and build the
  # mesh context; the train/eval phases manage their own sessions.
  with tf.Session(target=resolver.master(), config=config) as sess:
    tf.tpu.experimental.initialize_tpu_system(resolver)
    mesh_context = MeshContext(
        sess, FLAGS.use_tpu, FLAGS.mesh_shape, unet.get_layout())
  for _ in range(FLAGS.num_training_loops):
    _train_phase(mesh_context, config, resolver.get_master())
    _eval_phase(mesh_context, config, resolver.get_master())
  if FLAGS.use_tpu:
    # NOTE(review): initialization above uses resolver.master() while this
    # shutdown path uses resolver.get_master() — presumably equivalent
    # endpoints; confirm before unifying.
    with tf.Session(target=resolver.get_master(), config=config) as sess:
      sess.run(tpu.shutdown_system())
  tf.logging.info('finished.')
def execute_tpu(self, graph_fn, inputs):
  """Builds `graph_fn` as a TPU computation, runs it and returns the result.

  Args:
    graph_fn: a callable that constructs the tensorflow graph to test. The
      arguments of this function should correspond to `inputs`.
    inputs: a list of numpy arrays to feed input to the computation graph.

  Returns:
    A list of numpy arrays or a scalar returned from executing the
    tensorflow graph.
  """
  with self.test_session(graph=tf.Graph()) as sess:
    feeds = [tf.placeholder_with_default(arr, arr.shape) for arr in inputs]
    computation = tpu.rewrite(graph_fn, feeds)
    sess.run(tpu.initialize_system())
    # Run all initializers before executing the rewritten computation.
    initializers = [
        tf.global_variables_initializer(),
        tf.tables_initializer(),
        tf.local_variables_initializer(),
    ]
    sess.run(initializers)
    outputs = sess.run(computation, feed_dict=dict(zip(feeds, inputs)))
    sess.run(tpu.shutdown_system())
    # Unwrap a single-element list/tuple so callers get the bare value.
    if isinstance(outputs, (list, tuple)) and len(outputs) == 1:
      outputs = outputs[0]
    return outputs
def __init__(self, iterations):
  """Constructs the runner and eagerly initializes the TPU system.

  Args:
    iterations: number of training iterations to run per infeed loop.
  """
  tf.logging.info("TrainLowLevelRunner: constructor")
  # Infeed bookkeeping, populated when the input pipeline is built.
  self.feature_structure = {}
  self.loss = None
  self.infeed_queue = []
  self.enqueue_ops = []
  self.dataset_initializer = []
  self.iterations = iterations
  self.num_hosts = FLAGS.num_shards // FLAGS.num_shards_per_host
  self.scaffold_fn = None
  # Having two separate sessions and graphs to make the initialization faster.
  self.input_sess = None
  self.train_sess = None
  self.input_graph = tf.Graph()
  self.train_graph = None
  self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  # Disable grappler for better performance.
  self.session_config = tf.ConfigProto(
      allow_soft_placement=True,
      graph_options=tf.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_meta_optimizer=True)),
      isolate_session_state=True)
  cluster_spec = self.tpu_cluster_resolver.cluster_spec()
  if cluster_spec:
    self.session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
  self.tpu_init = [tpu.initialize_system()]
  self.tpu_shutdown = tpu.shutdown_system()
  # Initialize the TPU system up front in a dedicated session so later
  # sessions see an already-initialized system.
  self.init_sess = tf.Session(self.tpu_cluster_resolver.get_master(),
                              config=self.session_config)
  self.init_sess.run(self.tpu_init)
  self.queue = Queue.Queue()
def __init__(self, iterations, hparams, per_host_v1=False):
  """Constructs the runner, builds the TPU init/shutdown ops and a session.

  Args:
    iterations: number of iterations to run per infeed loop.
    hparams: hyperparameter object; `master` and `tpu_name` select the
      TPU endpoint to connect to.
    per_host_v1: whether to use the per-host-v1 infeed layout.
  """
  tf.logging.info("TrainLowLevelRunner: constructor")
  self.feature_structure = {}
  self.loss = None
  self.infeed_queue = []
  self.enqueue_ops = []
  self.dataset_initializer = []
  # Local mode: no master address and no named TPU.
  self.is_local = ((hparams.master == "") and (hparams.tpu_name is None))
  self.per_host_v1 = per_host_v1
  self.iterations = iterations
  self.sess = None
  self.graph = tf.Graph()
  self.hparams = hparams
  # TPU init/shutdown ops must live in the same graph as the session below.
  with self.graph.as_default():
    self.tpu_init = [tpu.initialize_system()]
    self.tpu_shutdown = tpu.shutdown_system()
  self.resolver = get_resolver(hparams)
  session_config = tf.ConfigProto(allow_soft_placement=True,
                                  isolate_session_state=True)
  if self.hparams.tpu_name is None:
    master = self.hparams.master
  else:
    cluster_spec = self.resolver.cluster_spec()
    if cluster_spec:
      session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
    master = self.resolver.get_master()
  self.sess = tf.Session(master, graph=self.graph, config=session_config)
  self.sess.run(self.tpu_init)
def q1():
  """Runs the Q1 computation on TPU over the lineitem columns.

  Converts the module-level lineitem columns to float32 tensors, rewrites
  `q1_computation` for TPU, executes it five times and returns the last
  result. Also publishes the group sizes via module-level globals.
  """
  global l_returnflag_group_size
  global l_linestatus_group_size
  returnflag_groups = np.unique(l_returnflag)
  linestatus_groups = np.unique(l_linestatus)
  l_returnflag_group_size = len(returnflag_groups)
  l_linestatus_group_size = len(linestatus_groups)
  # All computation inputs, in the order q1_computation expects them.
  columns = (l_shipdate, l_returnflag, l_linestatus, l_quantity,
             l_extendedprice, l_discount, l_tax,
             returnflag_groups, linestatus_groups)
  inputs = [tf.convert_to_tensor(column, np.float32) for column in columns]
  tpu_computation = tpu.rewrite(q1_computation, inputs)
  tpu_grpc_url = TPUClusterResolver(tpu=[os.environ['TPU_NAME']]).get_master()
  with tf.Session(tpu_grpc_url) as sess:
    sess.run(tpu.initialize_system())
    sess.run(tf.global_variables_initializer())
    for _ in range(5):
      res = sess.run(tpu_computation)
    sess.run(tpu.shutdown_system())
  print(res)
  return res
def __init__(self, iterations, train_steps):
  """Constructs the runner and eagerly initializes the TPU system.

  Args:
    iterations: number of iterations per infeed loop.
    train_steps: total training steps; rounded up to a whole multiple of
      `iterations`.
  """
  tf.logging.info("TrainRunner: constructor")
  self.feature_structure = {}
  self.loss = None
  self.infeed_queue = []
  self.enqueue_ops = []
  self.dataset_initializer = []
  self.iterations = iterations
  self.sess = None
  self.input_sess = None
  self.infeed_thread = None
  # Round train_steps up to a whole number of infeed iterations.
  # NOTE(review): `train_steps / iterations` relies on Python 3 true
  # division; under Python 2 the ceil would be a no-op — confirm runtime.
  if train_steps % iterations != 0:
    train_steps = iterations * int(math.ceil(train_steps / iterations))
  self.train_steps = train_steps
  self.input_graph = tf.Graph()
  tpu_init = [tpu.initialize_system()]
  self.tpu_shutdown = tpu.shutdown_system()
  self.cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu or FLAGS.master, zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)
  # 10-hour op timeout; grappler disabled for better performance.
  self.config = tf.ConfigProto(
      operation_timeout_in_ms=600 * 60 * 1000,
      graph_options=tf.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_meta_optimizer=True)),
      isolate_session_state=True)
  cluster_spec = self.cluster_resolver.cluster_spec()
  if cluster_spec:
    self.config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
  self.init_sess = tf.Session(self.cluster_resolver.get_master(),
                              config=self.config)
  self.init_sess.run(tpu_init)
def execute_tpu(self, graph_fn, inputs):
  """Constructs the graph, executes it on TPU and returns the result.

  Args:
    graph_fn: a callable that constructs the tensorflow graph to test. The
      arguments of this function should correspond to `inputs`.
    inputs: a list of numpy arrays to feed input to the computation graph.

  Returns:
    A list of numpy arrays or a scalar returned from executing the
    tensorflow graph.
  """
  with self.test_session(graph=tf.Graph()) as sess:
    placeholders = []
    for value in inputs:
      placeholders.append(tf.placeholder_with_default(value, value.shape))
    computation = tpu.rewrite(graph_fn, placeholders)
    sess.run(tpu.initialize_system())
    sess.run([tf.global_variables_initializer(),
              tf.tables_initializer(),
              tf.local_variables_initializer()])
    feed = {ph: value for ph, value in zip(placeholders, inputs)}
    results = sess.run(computation, feed_dict=feed)
    sess.run(tpu.shutdown_system())
    # A one-element list/tuple is unwrapped to the bare value.
    is_single = isinstance(results, (list, tuple)) and len(results) == 1
    if is_single:
      results = results[0]
    return results
def run(size):
  """Builds three arithmetic input vectors of `size`, runs the rewritten
  `expression` computation on TPU once and prints the elapsed time."""
  a_vals = [(i * 1.0 + 4.0) * 2.5 for i in range(size)]
  b_vals = [(i * 1.0 + 5.0) * 2.5 for i in range(size)]
  c_vals = [(i * 1.0 + 6.0) * 0.1 for i in range(size)]
  inputs = [tf.constant(a_vals), tf.constant(b_vals), tf.constant(c_vals)]
  tpu_computation = tpu.rewrite(expression, inputs)
  tpu_grpc_url = TPUClusterResolver(tpu=[os.environ['TPU_NAME']]).get_master()
  with tf.Session(tpu_grpc_url) as sess:
    sess.run(tpu.initialize_system())
    start = time()
    sess.run(tf.global_variables_initializer())
    sess.run(tpu_computation)
    elapsed = time() - start
    print(str(size) + " : " + str(elapsed))
    sess.run(tpu.shutdown_system())
  print('Done !')
def main(unused_argv):
  """Builds a graph with TPU init/shutdown ops, freezes it and prints it.

  Resolves the TPU master from FLAGS.tpu_name (accepting either a raw
  grpc:// URL or a named TPU), adds the ConfigureDistributedTPU /
  ShutdownDistributedTPU ops to a fresh session's graph, converts that
  graph to constants and prints the resulting GraphDef.

  Args:
    unused_argv: unused; required by the app.run() entry-point signature.
  """
  assert FLAGS.tpu_name
  if FLAGS.tpu_name.startswith('grpc://'):
    tpu_grpc_url = FLAGS.tpu_name
  else:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=None, project=None)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
  # Bug fix: the session was previously never closed, leaking the
  # connection; a context manager guarantees cleanup even on error.
  with tf.Session(tpu_grpc_url) as sess:
    with sess.graph.as_default():
      contrib_tpu.initialize_system()
      contrib_tpu.shutdown_system()
    output_names = ['ConfigureDistributedTPU', 'ShutdownDistributedTPU']
    model_def = tf.graph_util.convert_variables_to_constants(
        sess, sess.graph.as_graph_def(), output_names)
    print(model_def)
def __init__(self, iterations, train_steps=-1):
  """Constructs the runner, initializes the TPU and discovers its cores.

  Args:
    iterations: number of iterations per infeed loop.
    train_steps: total training steps; negative means unbounded (None),
      otherwise rounded up to a whole multiple of `iterations`.
  """
  tf.logging.info("TrainRunner: constructor")
  self.feature_structure = None
  self.loss = None
  self.infeed_queue = []
  self.enqueue_ops = []
  self.dataset_initializer = []
  self.iterations = iterations
  self.sess = None
  self.input_sess = None
  self.infeed_thread = None
  # A negative step count means "train indefinitely".
  if train_steps < 0:
    train_steps = None
  if train_steps is not None:
    # Round up to a whole number of infeed iterations.
    if train_steps % iterations != 0:
      train_steps = iterations * int(math.ceil(train_steps / iterations))
  self.train_steps = train_steps
  self.input_graph = tf.Graph()
  # TPU init/shutdown ops live in their own graph/session.
  with tf.Graph().as_default() as self.init_graph:
    self.tpu_init = tpu.initialize_system()
    self.tpu_shutdown = tpu.shutdown_system()
  #self.cluster_resolver = tflex.TPUClusterResolver(
  self.cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu or FLAGS.master, zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)
  # 10-hour op timeout; grappler disabled.
  self.config = tf.ConfigProto(
      operation_timeout_in_ms=600 * 60 * 1000,
      graph_options=tf.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_meta_optimizer=True)),
      isolate_session_state=True)
  cluster_spec = self.cluster_resolver.cluster_spec()
  if cluster_spec:
    self.config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
  self.master = self.cluster_resolver.get_master()
  self.init_sess = tf.Session(self.master, graph=self.init_graph,
                              config=self.config)
  tf.logging.info("TrainRunner: initializing TPU session...")
  # TPU_NO_INIT=1 skips re-initialization (e.g. an already-running TPU).
  if not bool(int(os.environ.get('TPU_NO_INIT', '0'))):
    tflex.run(self.init_sess, self.tpu_init)
  tf.logging.info("TrainRunner: initializing TPU session (done)")
  # Enumerate TPU cores from the session's device list.
  self.devices = self.init_sess.list_devices()
  self.cores = sorted([x.name for x in self.devices if ':TPU:' in x.name])
  self.num_cores = len(self.cores)
  self.tpu_cores_per_host = 8
  assert self.num_cores % self.tpu_cores_per_host == 0
  self.num_hosts = self.num_cores // self.tpu_cores_per_host
  print(self.config.cluster_def)
  print('cores: %d hosts: %d ip: %s' % (self.num_cores, self.num_hosts,
                                        self.master))
def __init__(self, iterations, train_steps, eval_steps):
  """Constructs the train+eval runner and initializes the TPU system.

  Args:
    iterations: number of iterations per infeed loop.
    train_steps: total training steps; rounded up to a whole multiple of
      `iterations`.
    eval_steps: number of evaluation steps (coerced to int).
  """
  tf.logging.info("TrainAndEvalRunner: constructor")
  # Train-side and eval-side infeed bookkeeping.
  self.feature_structure = {}
  self.eval_feature_structure = {}
  self.loss = None
  self.eval_loss = None
  self.infeed_queue = []
  self.eval_infeed_queue = []
  self.enqueue_ops = []
  self.num_hosts = FLAGS.num_cores // FLAGS.tpu_cores_per_host
  self.dequeue_ops = []
  self.queue = Queue.Queue()
  self.eval_enqueue_ops = []
  self.dataset_initializer = []
  self.eval_dataset_initializer = []
  self.iterations = iterations
  self.steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
  self.iterator = None
  # Sessions/threads are created later; separate graphs per pipeline.
  self.sess = None
  self.input_sess = None
  self.eval_input_sess = None
  self.eval_output_sess = None
  self.infeed_thread = None
  self.train_eval_thread = None
  self.graph = tf.Graph()
  self.input_graph = tf.Graph()
  self.eval_input_graph = tf.Graph()
  self.eval_output_graph = tf.Graph()
  # Round train_steps up to a whole number of infeed iterations.
  if train_steps % iterations != 0:
    train_steps = iterations * int(math.ceil(train_steps / iterations))
  self.train_steps = train_steps
  self.max_train_iterations = self.train_steps // iterations
  self.eval_steps = int(eval_steps)
  self.eval_batch_size = FLAGS.eval_batch_size
  tpu_init = [tpu.initialize_system()]
  self.tpu_shutdown = tpu.shutdown_system()
  self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu or FLAGS.master, zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)
  # 10-hour op timeout; grappler disabled for better performance.
  self.config = tf.ConfigProto(
      operation_timeout_in_ms=600 * 60 * 1000,
      allow_soft_placement=True,
      graph_options=tf.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_meta_optimizer=True)),
      isolate_session_state=True)
  cluster_spec = self.tpu_cluster_resolver.cluster_spec()
  if cluster_spec:
    self.config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
  self.master = self.tpu_cluster_resolver.get_master()
  # Initialize the TPU system up front in a dedicated session.
  self.init_sess = tf.Session(self.master, config=self.config)
  self.init_sess.run(tpu_init)
def filter_sum():
  """Runs `filter_sum_computation` on TPU five times over `l_quantity`.

  Returns the result of the final run after shutting the TPU down.
  """
  inputs = [tf.convert_to_tensor(l_quantity, np.float32)]
  computation = tpu.rewrite(filter_sum_computation, inputs)
  master = TPUClusterResolver(tpu=[os.environ['TPU_NAME']]).get_master()
  with tf.Session(master) as sess:
    sess.run(tpu.initialize_system())
    sess.run(tf.global_variables_initializer())
    for _ in range(5):
      res = sess.run(computation)
    sess.run(tpu.shutdown_system())
  print(res)
  return res
def apply_comp(inputs):
  """Rewrites `apply` for TPU, runs it once and prints the elapsed time.

  Args:
    inputs: tensors to feed to the `apply` computation.
  """
  computation = tpu.rewrite(apply, inputs)
  master = TPUClusterResolver(tpu=[os.environ['TPU_NAME']]).get_master()
  with tf.Session(master) as sess:
    sess.run(tpu.initialize_system())
    sess.run(tf.global_variables_initializer())
    start = time()
    sess.run(computation)
    elapsed = time() - start
    sess.run(tpu.shutdown_system())
  print(elapsed)
def run(tpu_computation, tpu_grpc_url):
  """Executes `tpu_computation` `reps` times, printing per-rep and mean times.

  Args:
    tpu_computation: an already-rewritten TPU computation to run.
    tpu_grpc_url: gRPC address of the TPU master.
  """
  reps = 1
  durations = []
  for rep in range(reps):
    with tf.Session(tpu_grpc_url) as sess:
      sess.run(tpu.initialize_system())
      start = time()
      sess.run(tf.global_variables_initializer())
      sess.run(tpu_computation)
      elapsed = time() - start
      print(str(rep) + "_ : " + str(elapsed))
      durations.append(elapsed)
      sess.run(tpu.shutdown_system())
  print(sum(durations) / reps)
def group_by():
  """Runs `group_by_computation` on TPU, grouped by `l_returnflag`.

  Converts the quantity column, the group column and its unique values to
  float32 tensors, executes the rewritten computation five times, prints
  and returns the final result.

  Returns:
    The result of the last `sess.run` of the TPU computation.
  """
  unique_groups = np.unique(l_returnflag)
  inputs = [
      tf.convert_to_tensor(l_quantity, np.float32),
      tf.convert_to_tensor(l_returnflag, np.float32),
      tf.convert_to_tensor(unique_groups, np.float32),
  ]
  tpu_computation = tpu.rewrite(group_by_computation, inputs)
  tpu_grpc_url = TPUClusterResolver(tpu=[os.environ['TPU_NAME']]).get_master()
  with tf.Session(tpu_grpc_url) as sess:
    sess.run(tpu.initialize_system())
    sess.run(tf.global_variables_initializer())
    for i in range(0, 5):
      res = sess.run(tpu_computation)
    sess.run(tpu.shutdown_system())
  print(res)
  # Consistency fix: sibling helpers q1() and filter_sum() return the
  # result; callers ignoring the previous implicit None are unaffected.
  return res
def test_large_input(self):
  """Checks multilevel_roi_align output shape on TPU for a large input.

  Only runs when the tpu_test flag is set. Builds a feature pyramid for
  levels [min_level, max_level], a tiled set of identical boxes with
  random level assignments, runs the op through a TPU rewrite and asserts
  the pooled feature shape.
  """
  if test_case.FLAGS.tpu_test:
    input_size = 1408
    min_level = 2
    max_level = 6
    batch_size = 2
    num_boxes = 512
    num_filters = 256
    output_size = [7, 7]
    with self.test_session() as sess:
      # One bfloat16 feature map per pyramid level, each halving spatial
      # size; values are just an arange so content is deterministic.
      features = []
      for level in range(min_level, max_level + 1):
        feat_size = int(input_size / 2**level)
        features.append(
            tf.constant(
                np.reshape(
                    np.arange(
                        batch_size * feat_size * feat_size * num_filters,
                        dtype=np.float32),
                    [batch_size, feat_size, feat_size, num_filters]),
                dtype=tf.bfloat16))
      # Boxes are normalized to [0, 1] by dividing by input_size, then
      # tiled across the batch.
      boxes = np.array([
          [[0, 0, 256, 256]] * num_boxes,
      ], dtype=np.float32) / input_size
      boxes = np.tile(boxes, [batch_size, 1, 1])
      tf_boxes = tf.constant(boxes)
      tf_levels = tf.random_uniform([batch_size, num_boxes], maxval=5,
                                    dtype=tf.int32)

      def crop_and_resize_fn():
        # Closure over the graph tensors so tpu.rewrite can trace it.
        return spatial_ops.multilevel_roi_align(features, tf_boxes,
                                                tf_levels, output_size)

      tpu_crop_and_resize_fn = contrib_tpu.rewrite(crop_and_resize_fn)
      sess.run(contrib_tpu.initialize_system())
      sess.run(tf.global_variables_initializer())
      roi_features = sess.run(tpu_crop_and_resize_fn)
      self.assertEqual(roi_features[0].shape,
                       (batch_size, num_boxes, output_size[0],
                        output_size[1], num_filters))
      sess.run(contrib_tpu.shutdown_system())
def __init__(self, eval_steps):
  """Sets up eval-runner state and the TPU init/shutdown ops.

  Args:
    eval_steps: number of evaluation steps to run.
  """
  tf.logging.info("EvalLowLevelRunner: constructor")
  tf.logging.info("eval_steps: %s", eval_steps)
  self.eval_steps = eval_steps
  # Infeed bookkeeping.
  self.feature_structure = {}
  self.infeed_queue = []
  self.enqueue_ops = []
  self.dataset_initializer = []
  # Outfeed bookkeeping.
  self.outfeed_tensors = []
  self.outfeed_names = []
  self.dequeue_ops = {}
  # Filled in later by the build/initialize paths.
  self.sess = None
  self.eval_op = None
  self.saver = None
  self.tpu_cluster_resolver = None
  self.graph = tf.Graph()
  # TPU init/shutdown ops must belong to this runner's graph.
  with self.graph.as_default():
    self.tpu_init = [tpu.initialize_system()]
    self.tpu_shutdown = tpu.shutdown_system()
def timer(inputs):
  """Times end-to-end TPU execution of `blackscholes` over `reps` runs.

  Each repetition includes graph rewrite, session setup, initialization,
  execution and shutdown; prints per-rep and mean durations.

  Args:
    inputs: tensors to feed to the `blackscholes` computation.
  """
  reps = 2
  durations = []
  for rep in range(reps):
    start = time()
    # Rewrite inside the loop on purpose: graph construction cost is part
    # of the measured time.
    computation = tpu.rewrite(blackscholes, inputs)
    master = TPUClusterResolver(tpu=[os.environ['TPU_NAME']]).get_master()
    with tf.Session(master) as sess:
      sess.run(tpu.initialize_system())
      sess.run(tf.global_variables_initializer())
      sess.run(computation)
      sess.run(tpu.shutdown_system())
    elapsed = time() - start
    print(str(rep) + "_ : " + str(elapsed))
    durations.append(elapsed)
  print(sum(durations) / reps)
def __init__(self, input_fn, model_fn, params, num_steps):
  """Builds the model graph, connects to the TPU and runs initialization.

  Args:
    input_fn: input pipeline builder passed to build_model.
    model_fn: model builder passed to build_model.
    params: dict of parameters; must contain "batch_size".
    num_steps: number of steps, forwarded to build_model.
  """
  self.feature_structure = {}
  self.loss = None
  self.enqueue_ops = None
  self.metric_initializer = None
  # NOTE(review): self.iterator is assumed to be set by build_model below
  # (its .initializer is run at the end) — confirm.
  self.iterator = None
  self.batch_size = params["batch_size"]
  with tf.Graph().as_default() as self.graph:
    self.build_model(params, input_fn, model_fn, num_steps)
    self.tpu_init = tpu.initialize_system()
    initializer = tf.global_variables_initializer()
    self.tpu_shutdown = tpu.shutdown_system()
    self.local_initializer = tf.local_variables_initializer()
    self.saver = tf.train.Saver()
  cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu or FLAGS.master)
  self.sess = tf.Session(cluster_resolver.get_master(), graph=self.graph)
  # TPU system first, then variables/tables, then the input iterator.
  self.sess.run(self.tpu_init)
  self.sess.run(initializer)
  self.sess.run(self.local_initializer)
  self.sess.run(self.iterator.initializer)
def execute_tpu_tf1(self, compute_fn, inputs, graph=None):
  """Executes compute_fn on TPU with Tensorflow 1.X.

  Args:
    compute_fn: a function containing Tensorflow computation that takes a
      list of input numpy tensors, performs computation and returns output
      numpy tensors.
    inputs: a list of numpy arrays to feed input to the `compute_fn`.
    graph: (optional) If not None, provided `graph` is used for computation
      instead of a brand new tf.Graph().

  Returns:
    A list of numpy arrays or a single numpy array.
  """
  with self.session(graph=(graph or tf.Graph())) as sess:
    placeholders = [tf.placeholder_with_default(v, v.shape) for v in inputs]

    def wrap_graph_fn(*args, **kwargs):
      # tpu.rewrite expects a flat list of outputs; materialize any
      # generic iterable (but leave dicts and single Tensors untouched).
      results = compute_fn(*args, **kwargs)
      if (not (isinstance(results, dict) or isinstance(results, tf.Tensor))
          and hasattr(results, '__iter__')):
        results = list(results)
      return results

    tpu_computation = contrib_tpu.rewrite(wrap_graph_fn, placeholders)
    sess.run(contrib_tpu.initialize_system())
    sess.run([
        tf.global_variables_initializer(),
        tf.tables_initializer(),
        tf.local_variables_initializer()
    ])
    materialized_results = sess.run(
        tpu_computation, feed_dict=dict(zip(placeholders, inputs)))
    sess.run(contrib_tpu.shutdown_system())
    return self.maybe_extract_single_output(materialized_results)
def __init__(self, eval_steps, hparams):
  """Constructs the eval runner and initializes the TPU system.

  Args:
    eval_steps: number of evaluation steps to run.
    hparams: hyperparameter object; `master` and `tpu_name` select the
      TPU endpoint to connect to.
  """
  tf.logging.info("EvalLowLevelRunner: constructor")
  tf.logging.info("eval_steps: %s", eval_steps)
  self.feature_structure = {}
  self.infeed_queue = []
  self.enqueue_ops = []
  self.dataset_initializer = []
  # Local mode: no master address and no named TPU.
  self.is_local = ((hparams.master == "") and (hparams.tpu_name is None))
  self.eval_steps = eval_steps
  self.sess = None
  self.eval_op = None
  self.graph = tf.Graph()
  self.hparams = hparams
  self.outfeed_tensors = []
  self.outfeed_names = []
  self.dequeue_ops = {}
  self.saver = None
  # TPU init/shutdown ops must live in this runner's graph.
  with self.graph.as_default():
    self.tpu_init = [tpu.initialize_system()]
    self.tpu_shutdown = tpu.shutdown_system()
  self.resolver = get_resolver(hparams)
  session_config = tf.ConfigProto(
      allow_soft_placement=True,
      operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
  if self.hparams.tpu_name is None:
    master = self.hparams.master
  else:
    cluster_spec = self.resolver.cluster_spec()
    if cluster_spec:
      session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
    master = self.resolver.get_master()
  self.sess = tf.Session(master, graph=self.graph, config=session_config)
  self.sess.run(self.tpu_init)
def __init__(self, iterations, num_cores_per_shard=1,
             input_partition_dims=None):
  """Constructs the runner, initializes the TPU and sets up partitioning.

  Args:
    iterations: number of iterations per infeed loop.
    num_cores_per_shard: TPU cores used by each model shard (spatial
      partitioning); 1 means no spatial partitioning.
    input_partition_dims: partition dims for the input; None disables
      spatial partitioning.
  """
  tf.logging.info("TrainLowLevelRunner: constructor")
  self.feature_structure = {}
  self.loss = None
  self.infeed_queue = []
  self.enqueue_ops = []
  self.dataset_initializer = []
  self.iterations = iterations
  # TODO(wangtao): change FLAGS.num_shards_per_host to
  # FLAGS.num_cores_per_host after other low level API
  # support spatial partition. FLAGS.num_shards_per_host means number of TPU
  # cores for each host.
  self.replicas_per_worker = FLAGS.num_shards_per_host // num_cores_per_shard
  self.num_hosts = (
      FLAGS.num_shards * num_cores_per_shard // FLAGS.num_shards_per_host)
  self.num_shards = FLAGS.num_shards
  self.scaffold_fn = None
  # Having two separate sessions and graphs to make the initialization faster.
  self.input_sess = None
  self.train_sess = None
  self.input_graph = tf.Graph()
  self.train_graph = None
  self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  # Disable grappler for better performance.
  self.session_config = tf.ConfigProto(
      allow_soft_placement=True,
      graph_options=tf.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_meta_optimizer=True)),
      isolate_session_state=True)
  cluster_spec = self.tpu_cluster_resolver.cluster_spec()
  if cluster_spec:
    self.session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
  self.tpu_init = tpu.initialize_system()
  self.tpu_shutdown = tpu.shutdown_system()
  self.init_sess = tf.Session(self.tpu_cluster_resolver.get_master(),
                              config=self.session_config)
  self.queue = Queue.Queue()
  # Init for spatial partitioning.
  # initialize_system() returns the device topology needed to build a
  # spatial-partitioning device assignment.
  self.device_topology = self.init_sess.run(self.tpu_init)
  self.input_partition_dims = input_partition_dims
  self.use_spatial_partition = (
      input_partition_dims is not None and
      int(np.prod(FLAGS.input_partition_dims)) > 1)
  self.num_cores_per_shard = num_cores_per_shard
  if self.use_spatial_partition:
    computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
        self.num_cores_per_shard]
    self.device_assignment = tpu_device_assignment.device_assignment(
        topology=self.device_topology,
        computation_shape=computation_shape,
        num_replicas=self.num_shards)
    tf.logging.info("num_cores_per_shard: %d", self.num_cores_per_shard)
    tf.logging.info("num_hosts: %d", self.num_hosts)
    tf.logging.info("replicas_per_worker: %d", self.replicas_per_worker)
    tf.logging.info("computation_shape: %s", str(computation_shape))
    tf.logging.info("num_shards: %d", self.num_shards)
    tf.logging.info(
        "device_assignment.topology.device_coordinates: %s",
        str(self.device_assignment.topology.device_coordinates))
    tf.logging.info("device_assignment.core_assignment: %s",
                    str(self.device_assignment.core_assignment))
  else:
    self.device_assignment = None
def tf_train_flow( train_once_fn, model_dir=None, log_dir=None, max_models_keep=1, save_interval_seconds=600, save_interval_steps=1000, num_epochs=None, num_steps=None, save_model=True, save_interval_epochs=None, freeze_graph=False, num_steps_per_epoch=0, restore_from_latest=True, metric_eval_fn=None, valid_interval_epochs=0, inference_fn=None, inference_interval_epochs=0, init_fn=None, restore_fn=None, restore_include=None, restore_exclude=None, save_all_scope=False, #TODO save load from restore scope only but svae all variables_to_restore=None, variables_to_save=None, #by default will be the same as variables_to_restore output_collection_names=None, output_node_names=None, learning_rate=None, #not use yet, just use in train_once learning_rate_patience=None, learning_rate_decay_factor=None, write_during_train=True, model=None, sess=None): """ similary flow as tf_flow, but add model try reload and save """ use_horovod = 'OMPI_COMM_WORLD_RANK' in os.environ model_dir_ = model_dir if use_horovod and hvd.rank() != 0: model_dir = None if sess is None: #TODO melt.get_session is global session but may cause non close at last sess = melt.get_session() if FLAGS.use_tpu: sess.run(tpu.initialize_system()) #logging.info('tf_train_flow start') #logging.info('max_models_keep:', max_models_keep) #logging.info('save_interval_seconds:', save_interval_seconds) if model_dir: if model: checkpoint = tf.train.Checkpoint(model=model) ckpt_dir = model_dir + '/ckpt' checkpoint_prefix = os.path.join(ckpt_dir, 'ckpt') #this is usefull for you use another model with another scope, and just load and restore/save initalize your scope vars! 
#this is not for finetune but mainly for like using another model as in predict like this introducing graph other model scope and ignore here # var_list = None if not restore_scope else tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=restore_scope) # #logging.info('-------------var_list', var_list) # if not variables_to_restore: # variables_to_restore = var_list if not variables_to_restore: variables_to_restore = slim.get_variables_to_restore( include=restore_include, exclude=restore_exclude) if not variables_to_save: variables_to_save = variables_to_restore if save_all_scope: variables_to_save = None #if variables_to_restore is None: logging.info('variables_to_restore from %s' % model_dir) #load all var in checkpoint try to save all var(might more then original checkpoint) if not specifiy variables_to_save varnames_in_checkpoint = melt.get_checkpoint_varnames(model_dir) #logging.info('varnames_in_checkpoint: {}'.format(varnames_in_checkpoint)) # TODO has someproblem say tf.Variable 'r_net/text_encoder/cudnn_rnn/cu_dnngru/recurrent_kernel/adam_v:0' even though in checkpoint I have renated it as ignore/rnet variables_to_restore_from_model = slim.get_variables_to_restore( include=varnames_in_checkpoint) #logging.info('variables_to_restore_from_model: {}'.format(variables_to_restore_from_model)) if not variables_to_restore: variables_to_restore = variables_to_restore_from_model else: variables_to_restore = [ v for v in variables_to_restore if v in variables_to_restore_from_model ] if restore_exclude: for excl in restore_exclude: variables_to_restore = [ v for v in variables_to_restore if not excl in v.name ] #--tf 1.6 adadelta will have same vars... variables_to_restore = list(set(variables_to_restore)) #logging.info('variables_to_restore', variables_to_restore[:100]) logging.info('variables_to_restore', [ x for x in variables_to_restore if not 'OptimizeLoss' in x.name ][:100]) ##finally remove global_step since melt.apps.train will handle it! 
global_step = tf.train.get_or_create_global_step() #variables_to_restore = [v for v in variables_to_restore if not tf.GraphKeys.GLOBAL_STEP in v.name] #variables_to_restore = [v for v in variables_to_restore if not 'learning_rate' in v.name] # TODO fixme if step, step2.. and in checkpoint step then here will be step2... #print('------------', [v for v in variables_to_restore if 'step' in v.name]) loader = tf.train.Saver(var_list=variables_to_restore) logging.info('max models to keep {}, keep every {} hours'.format( max_models_keep, save_interval_seconds / 3600.0)) saver = tf.train.Saver( max_to_keep=max_models_keep, keep_checkpoint_every_n_hours=save_interval_seconds / 3600.0, var_list=variables_to_save) epoch_saver = tf.train.Saver(var_list=variables_to_save, max_to_keep=1000) best_epoch_saver = tf.train.Saver(var_list=variables_to_save) #logging.info('variables_to_save:{}'.format(variables_to_save)) # # #TODO for safe restore all init will be ok ? # # if variables_to_restore is None: init_op = tf.group( tf.global_variables_initializer( ), #variables_initializer(global_variables()) tf.local_variables_initializer() ) #variables_initializer(local_variables()) # # else: # # init_op = tf.group(tf.variables_initializer(variables_to_restore), # # tf.local_variables_initializer()) ##--mostly this will be fine except for using assistant predictor, initialize again! will make assistant predictor wrong ##so assume to all run init op! 
if using assistant predictor, make sure it use another session # https://stackoverflow.com/questions/35164529/in-tensorflow-is-there-any-way-to-just-initialize-uninitialised-variables # def guarantee_initialized_variables(session, list_of_variables = None): # if list_of_variables is None: # list_of_variables = tf.global_variables() # uninitialized_variables = list(tf.get_variable(name) for name in # session.run(tf.report_uninitialized_variables(list_of_variables))) # return unintialized_variables # unintialized_variables = guarantee_initialized_variables(sess) # init_op = tf.group(tf.initialize_variables(uninitialized_vars), tf.local_variables_initializer()) timer = gezi.Timer('sess run init_op in melt.tf_train_flow') #model.save('./weights') # notice sess.run(init_op) timer.print_elapsed() #melt.init_uninitialized_variables(sess) #pre_step means the step last saved, train without pretrained,then -1 pre_step = -1 fixed_pre_step = -1 #fixed pre step is for epoch num to be correct if you change batch size #print(model_dir) pre_epoch = None if model_dir: model_path = _get_model_path(model_dir, save_model) # if not model_path: # model_path = _get_model_path(os.path.join(model_dir, 'epoch')) #print(model_path) model_dir = gezi.get_dir( model_dir) #incase you pass ./model/model-ckpt1000 -> ./model if model_path is not None: if not restore_from_latest: logging.info('using recent but not latest model') model_path = melt.recent_checkpoint(model_dir) model_name = os.path.basename(model_path) timer = gezi.Timer( 'Loading and training from existing model [%s]' % model_path) if restore_fn is not None: restore_fn(sess) loader.restore(sess, model_path) ## not supported #model.save() #model.save_weights('./weights') timer.print() #pre_step = melt.get_model_step(model_path) - 1 if FLAGS.global_step is None else FLAGS.global_step -1 # TODO check .. 
pre_step = sess.run(tf.train.get_global_step()) - 1 pre_epoch = melt.get_model_epoch( model_path ) if FLAGS.global_epoch is None else FLAGS.global_epoch fixed_pre_step = pre_step # if pre_epoch is not None: # #like using batch size 32, then reload train using batch size 64 # if abs(pre_step / num_steps_per_epoch - pre_epoch) > 0.1: # fixed_pre_step = int(pre_epoch * num_steps_per_epoch) # logging.info('Warning, epoch is diff with pre_step / num_steps_per_epoch:{}, pre_epoch:{},maybe you change batch size and we will adjust to set pre_step as {}'\ # .format(pre_step / num_steps_per_epoch, pre_epoch, fixed_pre_step)) else: latest_checkpoint = None if not use_horovod: #now will hang try: latest_checkpoint = tf.train.latest_checkpoint(ckpt_dir) if latest_checkpoint: logging.info( 'Try start from eager trained mode, latest checkpoint:', latest_checkpoint) checkpoint.restore(latest_checkpoint).run_restore_ops( session=sess) pre_epoch = int(latest_checkpoint.split('-')[-1]) #pre_step = pre_epoch * num_steps_per_epoch - 1 # TODO check pre_step = sess.run(tf.train.get_global_step()) - 1 fixed_pre_step = pre_step logging.info('Start step is:', pre_step) except Exception: logging.info( 'Something wrong with restore from eager trained model' ) if latest_checkpoint is None: logging.info('Train all start step 0') #https://stackoverflow.com/questions/40220201/tensorflow-tf-initialize-all-variables-vs-tf-initialize-local-variables #tf.initialize_all_variables() is a shortcut to tf.initialize_variables(tf.all_variables()), #tf.initialize_local_variables() is a shortcut to tf.initialize_variables(tf.local_variables()), #which initializes variables in GraphKeys.VARIABLES and GraphKeys.LOCAL_VARIABLE collections, respectively. 
#init_op = tf.group(tf.global_variables_initializer(), # tf.local_variables_initializer()) #[var for var in tf.all_variables() if var.op.name.startswith(restore_scope)] will be the same as tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=restore_scope) #sess.run(init_op) #like use image model, build image graph, reload first train, and then will go to same checkpoint all varaible just restore will ok #for finetune from loading other model init if init_fn is not None: init_fn(sess) if gezi.env_has('METRIC'): l = metric_eval_fn(model_path) print(list(zip(l[1], l[0]))) exit(0) #sess.run(tf.assign(global_step, tf.constant(global_step_val, dtype=tf.int64))) try: learning_rate = tf.get_collection('learning_rate')[-1] learning_rate_weight = tf.get_collection('learning_rate_weight')[-1] sess.run(tf.assign(learning_rate, learning_rate * learning_rate_weight)) except Exception: # if not using weight_decay but using optimizer decay then will go here as learning rate is a tensor can not assign pass try: logging.info('Actual start global step:', sess.run(global_step), 'learning rate:', sess.run(learning_rate), 'learning_rate_weight:', sess.run(learning_rate_weight)) except Exception: pass if model_dir_: #if save_interval_epochs and num_steps_per_epoch and num_steps >= 0: epoch_dir = os.path.join(model_dir_, 'epoch') gezi.try_mkdir(epoch_dir) checkpoint_path = os.path.join(model_dir_, 'model.ckpt') coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) if use_horovod: bcast = hvd.broadcast_global_variables(0) sess.run(bcast) #tf.train.write_graph(sess.graph_def, model_dir, 'train.pbtxt') only_one_step = False try: if use_horovod: ## TODO FIXME why bcast here not work ? 
simple test work see tests/bcast.py #comm.bcast(pre_step, root=0) temp = np.array([pre_step, fixed_pre_step]) comm.Bcast(temp, root=0) pre_step = temp[0] fixed_pre_step = temp[1] step = start = pre_step + 1 fixed_step = fixed_pre_step + 1 #first = True #hack just for save one model after load if num_steps < 0 or (num_steps and num_steps < step): logging.info('just load and resave then exit') model_path_ = _get_checkpoint_path(checkpoint_path, step, num_steps_per_epoch, epoch=pre_epoch) saver.save(sess, model_path_, global_step=step + 1) if freeze_graph: melt.freeze_graph(sess, model_path_, step + 1, output_collection_names, output_node_names) sess.close() exit(0) if num_epochs < 0: only_one_step = True logging.info('just run one step') if FLAGS.work_mode != 'train': assert not os.path.isdir(FLAGS.model_dir), FLAGS.model_dir if 'valid' in FLAGS.work_mode: vals, names = metric_eval_fn(FLAGS.model_dir) logging.info(list(zip(names, vals))) if 'test' in FLAGS.work_mode: inference_fn(FLAGS.model_dir) exit(0) #early_stop = True #TODO allow config num_bad_epochs = 0 pre_epoch_eval_loss = 1e20 best_epoch_eval_loss = 1e20 num_allowed_bad_epochs = 4 #allow 5 non decrease eval loss epochs before stop epoch_saved_step = 0 while not coord.should_stop(): model_step_path = None if model_dir_: model_path_ = os.path.join( epoch_dir, 'model.ckpt-%.2f' % (fixed_step / float(num_steps_per_epoch))) model_step_path_ = model_path_ + '-' + str(step) if (write_during_train and metric_eval_fn is not None and valid_interval_epochs and fixed_step % int(num_steps_per_epoch * valid_interval_epochs) == 0): model_step_path = model_step_path_ else: model_step_path = None if step == 0: model_step_path = None #print('--------------------step', step) stop = train_once_fn( sess, step, is_start=(step == start), fixed_step=fixed_step, num_epochs=num_epochs, model_path=model_step_path, use_horovod=use_horovod, ## TODO FIXME this line will cause tensorflow.python.framework.errors_impl.NotFoundError: 
Resource localhost/save_counter/N10tensorflow3VarE does not exist. ) #first = False if only_one_step: stop = True step += 1 fixed_step += 1 if save_model and step and model_dir: #step 0 is also saved! actually train one step and save if step % save_interval_steps == 0: timer = gezi.Timer( 'save model step %d to %s' % (step, checkpoint_path), False) model_path_ = _get_checkpoint_path(checkpoint_path, fixed_step, num_steps_per_epoch) saver.save(sess, model_path_, global_step=step) if freeze_graph: melt.freeze_graph(sess, model_path_, step, output_collection_names, output_node_names) #if log_dir != model_dir: # assert log_dir # command = 'rsync -l -r -t %s/* %s' % (log_dir, model_dir) # print(command, file=sys.stderr) # os.system(command) timer.print_elapsed() if save_interval_steps and num_steps_per_epoch and fixed_step % int( num_steps_per_epoch * save_interval_epochs) == 0: # TODO only epoch in name not sep ? epoch_saved_step = step model_path_ = os.path.join( epoch_dir, 'model.ckpt-%.2f' % (fixed_step / float(num_steps_per_epoch))) model_step_path = model_path_ + '-' + str(step) epoch_saver.save(sess, model_path_, global_step=step) #epoch_saver.save(sess, model_path_) ## TODO FIXME do not support tf.keras save currently with horovod # if model: # #model.save_weights(epoch_dir + '/ckpt-%.2f' % (fixed_step / float(num_steps_per_epoch))) # # TODO FIXME if restart will save from 1... again.. 
# checkpoint.save(checkpoint_prefix, session=sess) # #print(sess.run(checkpoint.save_counter)) if freeze_graph: melt.freeze_graph(sess, model_path_, step, output_collection_names, output_node_names) if write_during_train: if inference_fn is not None and inference_interval_epochs and fixed_step % int( num_steps_per_epoch * inference_interval_epochs) == 0: model_step_path = model_path_ + '-' + str(step) try: #print('--------------inference fn') inference_fn(model_path=model_step_path) except Exception: logging.info(traceback.format_exc()) # if metric_eval_fn is not None and valid_interval_epochs and fixed_step % int(num_steps_per_epoch * valid_interval_epochs) == 0: # model_step_path = model_path_ + '-' + str(step) # try: # metric_eval_fn(model_path=model_step_path) # except Exception: # logging.info(traceback.format_exc()) if stop is True: print('Early stop running %d stpes' % (step), file=sys.stderr) raise tf.errors.OutOfRangeError( None, None, 'Early stop running %d stpes' % (step)) if num_steps and (step + 1) == start + num_steps: raise tf.errors.OutOfRangeError(None, None, 'Reached max num steps') #max_num_epochs = 1000 max_num_epochs = num_epochs #if max_num_epochs and num_steps_per_epoch and fixed_step // num_steps_per_epoch >= max_num_epochs: if max_num_epochs and num_steps_per_epoch and fixed_step / num_steps_per_epoch > max_num_epochs: raise tf.errors.OutOfRangeError( None, None, 'Reached max num epochs of %d' % max_num_epochs) #except tf.errors.OutOfRangeError, e: except tf.errors.OutOfRangeError: # if run 2 epoch and we have just epoch saved, do not need to save only 1 step more model if (step - epoch_saved_step > 1) and not ( step == start ) and save_model and step % save_interval_steps != 0 and model_dir: model_path_ = _get_checkpoint_path(checkpoint_path, step, num_steps_per_epoch) saver.save(sess, model_path_, global_step=step) if freeze_graph: melt.freeze_graph(sess, model_path_, step, output_collection_names, output_node_names) if log_dir != 
model_dir: assert log_dir command = 'rsync -l -r -t %s/* %s' % (log_dir, model_dir) print(command, file=sys.stderr) os.system(command) if only_one_step: logging.info('Done one step') exit(0) # if (step - epoch_saved_step > 1) and metric_eval_fn is not None: # metric_eval_fn(model_path=model_step_path) if (num_epochs and fixed_step / num_steps_per_epoch >= num_epochs) or ( num_steps and step == start + num_steps): logging.info('Done training for %.3f epochs, %d steps.' % (fixed_step / num_steps_per_epoch, step)) #FIXME becase coord.join seems not work, RuntimeError: Coordinator stopped with threads still running: Thread-9 exit(0) else: logging.info('Should not stop, but stopped at epoch: %.3f' % (fixed_step / num_steps_per_epoch)) logging.info(traceback.format_exc()) #raise e finally: coord.request_stop() coord.join(threads, stop_grace_period_secs=5) #FIMXE due to use melt.get_session(global not handle del well) #Done training for 3090020 steps. #Exception TypeError: "'NoneType' object is not callable" in <bound method Session.__del__ of <tensorflow.python.client.session.Session object at 0x7f6cf33cd450>> ignored if FLAGS.use_tpu: sess.run(tpu.shutdown_system()) sess.close()
def _shutdown():
  """Connect to the configured master and release the TPU system.

  Opens a throwaway session against ``FLAGS.master`` (soft placement on, so
  the shutdown op can be placed wherever it must run) and executes the TPU
  shutdown op. The session is closed by the ``with`` block.
  """
  session_config = tf.ConfigProto(allow_soft_placement=True)
  with tf.Session(target=FLAGS.master, config=session_config) as shutdown_sess:
    shutdown_sess.run(tpu.shutdown_system())
def __init__(self, coordinator, index, tpu_name, num_cores, cfg, iterations,
             train_steps, eval_steps):
    """Set up per-TPU state: bookkeeping fields, graphs, and TPU init session.

    Args:
        coordinator: shared coordinator object for the swarm of runners.
        index: index of this runner within the swarm.
        tpu_name: name of the TPU this runner drives (passed to the resolver).
        num_cores: total TPU cores for this runner; also determines num_hosts.
        cfg: dict of run configuration; must contain 'iterations_per_loop',
            'train_steps', 'steps_per_eval', 'train_batch_size' and
            'eval_batch_size'.
        iterations, train_steps, eval_steps: NOTE(review): these three
            parameters are immediately overwritten from `cfg` below, so the
            values passed by the caller are ignored — confirm this is
            intentional before relying on them.
    """
    tf.logging.info("SwarmRunner: constructor")
    # The cfg values take precedence over the same-named constructor args.
    iterations = cfg['iterations_per_loop']
    train_steps = cfg['train_steps']
    eval_steps = cfg['steps_per_eval']
    self.coordinator = coordinator
    self.index = index
    self.cfg = cfg
    self.tpu_name = tpu_name
    # Infeed/outfeed bookkeeping, filled in later when the graphs are built.
    self.feature_structure = {}
    self.eval_feature_structure = {}
    self.loss = None
    self.eval_loss = None
    self.infeed_queue = []
    self.eval_infeed_queue = []
    self.enqueue_ops = []
    self.num_cores = num_cores
    self.num_hosts = num_cores // FLAGS.tpu_cores_per_host
    self.dequeue_ops = []
    self.queue = Queue.Queue()
    self.eval_enqueue_ops = []
    self.dataset_initializer = []
    self.eval_dataset_initializer = []
    self.iterations = iterations
    self.steps_per_epoch = FLAGS.num_train_images // self.cfg[
        'train_batch_size']
    # Sessions/threads are created lazily elsewhere; placeholders for now.
    self.iterator = None
    self.sess = None
    self.saver = None
    self.checkpoint_thread = None
    self.input_sess = None
    self.eval_input_sess = None
    self.eval_output_sess = None
    self.log_sess = None
    self.infeed_thread = None
    self.train_eval_thread = None
    self.flush_summaries_thread = None
    # Separate graphs keep train/eval/input/logging ops isolated from each
    # other; each gets its own session later.
    self.graph = tf.Graph()
    self.init_graph = tf.Graph()
    self.input_graph = tf.Graph()
    self.eval_input_graph = tf.Graph()
    self.eval_output_graph = tf.Graph()
    self.log_graph = tf.Graph()
    # Round train_steps up to a whole number of iterations_per_loop chunks.
    if train_steps % iterations != 0:
        train_steps = iterations * int(math.ceil(train_steps / iterations))
    self.train_steps = train_steps
    #self.max_train_iterations = self.train_steps // iterations
    # TKTK
    self.max_train_iterations = 1
    self.eval_steps = int(eval_steps)
    self.eval_batch_size = self.cfg['eval_batch_size']
    # TPU init/shutdown ops live in their own graph so they can be run by a
    # dedicated session without touching the training graph.
    with self.init_graph.as_default():
        tpu_init = [tpu.initialize_system()]
        self.tpu_shutdown = tpu.shutdown_system()
    self.tpu_cluster_resolver = TPUClusterResolver(
        tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    self.config = tf.ConfigProto(
        operation_timeout_in_ms=600 * 60 * 1000,
        allow_soft_placement=True,
        graph_options=tf.GraphOptions(
            rewrite_options=rewriter_config_pb2.RewriterConfig(
                disable_meta_optimizer=True)),
        isolate_session_state=True)
    # share resource variables across sessions
    self.config.experimental.share_session_state_in_clusterspec_propagation = True
    cluster_spec = self.tpu_cluster_resolver.cluster_spec()
    if cluster_spec:
        self.config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
    self.master = self.tpu_cluster_resolver.get_master()
    tf.logging.info("Initializing TPU...")
    # Blocking call: runs the initialize_system op on the TPU before any
    # other session is created.
    self.init_sess = tflex.Session(self.master, config=self.config,
                                   graph=self.init_graph)
    self.init_sess.run(tpu_init)
    tf.logging.info("Initializing TPU... (done)")
# NOTE(review): this fragment begins mid-call — the `model.fit(...)` (or
# fit_generator) opening is outside this chunk; the two lines below are its
# trailing keyword arguments.
                              validation_data=validation_generator,
                              validation_steps=50)

# Save the results
#model.save('cat_adn_dogs_small_1_tpu.h5')
# Persist the Keras History dict so curves can be re-plotted later.
with open("history_tpu.pickle", mode='wb') as fp:
    pickle.dump(history.history, fp)

# Pull the per-epoch training/validation curves out of the History object.
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']


def dummy():
    """Plot accuracy and loss curves and save them as PNGs (never called here)."""
    epochs = range(1, len(acc) + 1)
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and Validation accuracy')
    plt.legend()
    plt.savefig('plot1_tpu.png')
    # NOTE(review): label 'Training logg' (and the reused 'Validation acc'
    # label below) look like copy-paste typos, but they are runtime strings —
    # left unchanged here; fix in a behavior-changing edit if desired.
    plt.plot(epochs, loss, 'bo', label='Training logg')
    plt.plot(epochs, val_loss, 'b', label='Validation acc')
    # NOTE(review): the figure is not cleared between the two savefig calls,
    # so plot2 presumably also contains the accuracy curves — confirm.
    plt.title('Training and Validation loss')
    plt.legend()
    plt.savefig('plot2_tpu.png')


# NOTE(review): this only *builds* the shutdown op; it is never passed to a
# session.run, so the TPU is not actually shut down here — confirm intent.
tpu.shutdown_system()
def flops():
  """Build the benchmark computation: COUNT chained NxN matmuls, reduced to a scalar."""
  lhs = tf.random_uniform([N, N])
  rhs = tf.random_uniform([N, N])

  def _matmul(a, b):
    # tensordot over axes [[1],[0]] is an ordinary matrix product; the second
    # operand is passed through so tpu.repeat can feed the pair back in.
    return tf.tensordot(a, b, axes=[[1], [0]]), b

  return tf.reduce_sum(tpu.repeat(COUNT, _matmul, [lhs, rhs]))


# One copy of the computation per shard across 8 TPU cores.
tpu_ops = tpu.batch_parallel(flops, [], num_shards=8)
session = tf.Session(tpu_cluster)
try:
  print('Warming up...')
  session.run(tpu.initialize_system())
  session.run(tpu_ops)  # warm-up run, excluded from the timed measurement
  print('Profiling')
  start = time.time()
  session.run(tpu_ops)
  elapsed = time.time() - start
  # 8 shards * COUNT matmuls * 2*N^3 flops each, scaled to teraflops.
  print(elapsed, 'TFlops: {:.2f}'.format(1e-12 * 8 * COUNT * 2 * N * N * N / elapsed))
except Exception as e:
  print(e)
finally:
  # Always release the TPU, even if the benchmark itself failed.
  session.run(tpu.shutdown_system())
  session.close()
def get_finalize_ops(self): return [tpu.shutdown_system()]
def run_monitored_session(cross_entropy, log_dir, required_steps, class_range,
                          save_checkpoint_steps, validation_steps, train_step,
                          augmentation_info, device, training_nn_params,
                          training_tensor, testing_nn_params, testing_tensor,
                          validation_nn_params, validation_tensor):
    """Run the training loop on CPU/GPU (MonitoredTrainingSession) or TPU (raw Session).

    Args:
        cross_entropy: loss tensor, also monitored for NaNs and by the test hook.
        log_dir: directory for checkpoints and summaries.
        required_steps: total number of global steps to train.
        class_range: class-index range forwarded to the validation/test hooks.
        save_checkpoint_steps: checkpoint frequency for the monitored session.
        validation_steps: validation frequency forwarded to ValidationHook.
        train_step: the training op to run each iteration.
        augmentation_info: augmentation config; may carry a shadow_struct whose
            restorer must be created before the session starts.
        device: "gpu", "cpu", or anything else for the TPU path.
        training_nn_params/training_tensor, testing_nn_params/testing_tensor,
        validation_nn_params/validation_tensor: per-phase params and input tensors.

    Returns:
        TrainingResult with validation accuracy, test accuracy and final loss.
    """
    read_op_value = None
    augmentation_restorer = None
    if augmentation_info.perform_shadow_augmentation:
        # BUGFIX: was `is not None or`, which dereferences shadow_struct when it
        # is None (the first operand being False forces evaluation of the
        # second). `and` guards the attribute access correctly.
        if augmentation_info.shadow_struct is not None and \
                augmentation_info.shadow_struct.shadow_op_initializer is not None:
            augmentation_restorer = augmentation_info.shadow_struct.shadow_op_creater()
            # Ready ops are overriden, as default ready ops awaits all variables
            # to be initialized but actually some of the variables (such as
            # cycle-gan graphs) are not initialized but restored
            read_op_value = constant([])

    is_gpu_or_cpu = (device == "gpu" or device == "cpu")
    if is_gpu_or_cpu:
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        set_all_gpu_config()
        master = ''
    else:
        config = None
        tpu_worker = 'grpc://' + os.environ['COLAB_TPU_ADDR']
        # master = TPUClusterResolver(tpu=tpu_worker).get_master()
        master = tpu_worker
        print("TPU master")
        print(master)

    validation_hook = ValidationHook(validation_nn_params, validation_tensor,
                                     class_range, required_steps,
                                     validation_steps, log_dir)
    test_iteration_count = 100
    test_hook = TestHook(testing_nn_params, testing_tensor, cross_entropy,
                         test_iteration_count, class_range)
    initializer_hook = InitializerHook(training_nn_params, training_tensor,
                                       augmentation_info, augmentation_restorer)
    stop_on_step_hook = StopAtStepHook(last_step=required_steps - 1)
    nan_tensor_hook = NanTensorHook(loss_tensor=cross_entropy,
                                    fail_on_nan_loss=False)
    hooks = [initializer_hook, validation_hook, test_hook, stop_on_step_hook,
             nan_tensor_hook]

    if is_gpu_or_cpu:
        # Only restore nn core variables along with the optimizer and global
        # step variables
        nn_core_restorer = tf.train.Saver(
            max_to_keep=20,
            var_list=slim.get_variables_to_restore(include=["nn_core"]) +
            slim.get_variables_to_restore(include=["global_step"]) +
            slim.get_variables_to_restore(include=["training_optimizer"]),
            name="nn_core_restorer")
        training_scaffold = Scaffold(saver=nn_core_restorer,
                                     ready_for_local_init_op=read_op_value,
                                     ready_op=read_op_value)
        session = tf.train.MonitoredTrainingSession(
            master=master, checkpoint_dir=log_dir, summary_dir=log_dir,
            config=config, is_chief=True,
            save_summaries_steps=test_iteration_count,
            save_checkpoint_steps=save_checkpoint_steps,
            scaffold=training_scaffold, hooks=hooks)
        # session = LocalCLIDebugWrapperSession(session)
        with session as monitored_sess:
            while not monitored_sess.should_stop():
                monitored_sess.run([train_step])
    else:
        # TPU path: hooks are not wired into a monitored session, so the
        # initializer hook and step counting are driven manually.
        session = tf.Session(target=master, config=config)
        session.run(tpu.initialize_system())
        session.run(tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer()))
        initializer_hook.after_create_session(session, None)
        while session.run(test_hook._global_step_tensor) < required_steps:
            try:
                session.run(train_step)
                test_hook.after_run_with_session(session)
            except tf.errors.OutOfRangeError:
                # Input pipeline exhausted before reaching required_steps.
                break
        validation_hook.end(session)
        session.run(tpu.shutdown_system())
        session.close()

    result = TrainingResult(validation_accuracy=validation_hook.validation_accuracy,
                            test_accuracy=test_hook.testing_accuracy,
                            loss=test_hook.loss)
    return result
# NOTE(review): this fragment begins mid-expression — the opening of the call
# (presumably a raise/print guard on TPU availability) is outside this chunk.
        'TPU IS NOT ENABLED (pass a TPU name or grpc://ip:port as the TPU_NAME envvar)'
    )

# Wrap the host-side numpy computation as a graph op.
computation = tf.py_func(axy_computation, inputs, tf.float32)
# NOTE(review): tpu_grpc_url is set to None right before the Session is
# created, so the session targets the default (in-process) master — confirm
# whether a real TPU gRPC URL was meant to survive to this point.
tpu_grpc_url = None
with tf.Session(tpu_grpc_url) as sess:
    if use_tpu:
        print('Running TPU initializer')
        sess.run(tpu.initialize_system())
    sess.run(tf.global_variables_initializer())
    print('Running computation {}'.format(computation))
    output = sess.run(computation)
    print(output)
    if not use_tpu:
        # For whichever reason, we can't do this in the TPU environment...
        # Stash the result in a variable so the Saver writes it into the ckpt.
        output_var = tf.get_variable('output', output_shape)
        sess.run(tf.assign(output_var, output))
        save_path = tf.train.Saver().save(sess, output_dir + '/model.ckpt')
        print('Saved model to: {}'.format(save_path))
    # Always dump the raw result as text, TPU or not.
    with open(output_dir + '/output.txt', 'w') as outf:
        outf.write(repr(output))
        print('Saved output data to: {}'.format(outf.name))
    if use_tpu:
        print('Shutting down TPU')
        sess.run(tpu.shutdown_system())
print('Done!')
def handler(queue, kvm_fd, mm): global object_dict global object_id global callback_stack global initialized if not initialized: callback_stack = [] object_dict = dict() object_id = 1 # TODO: forward logging or disable it in test tf.logging.set_verbosity(tf.logging.INFO) initialized = True print("handler is initialized") while True: task = None task = queue.get(block=True) while task is None: try: task = queue.get(block=True, timeout=5) except Queue.Empty: task = None if callback_stack: if time.time() > callback_stack[-1]["deadline"]: print("callback failed deadline") return STATUS_CALLBACK_TIMEOUT vm_id = task.vm_id if vm_id == STOP_HANDLER: break param = TF_PY_PARAM.from_buffer(mm, task.data_ptr) callback_param = TF_PY_PARAM.from_buffer( mm, task.data_ptr + param.base.callback_param_offset) print( "retrieve [vm#%d] tensorflow task=%d cmd=%d, obj=%d, dstore=%lx, done=%d" % (task.vm_id, task.node_id, param.base.cmd_id, param.base.object_id, param.base.dstore_size, param.base.done)) print( "retrieve [vm#%d] callback node cmd=%d, obj=%d, dstore=%lx, done=%d" % (task.vm_id, callback_param.base.cmd_id, callback_param.base.object_id, callback_param.base.dstore_size, callback_param.base.done)) cmd_id = param.base.cmd_id try: if cmd_id == TF_PY_NW_CALLBACK_DONE: param.base.done = STATUS_TASK_DONE ret = fcntl.ioctl(kvm_fd, IOCTL_KVM_NOTIFY_TASK_FINISHED, task.node_id) if ret < 0: print("notify task completion failed: %d\n" % ret) if callback_stack and \ callback_stack[-1]["callback_id"] == param.base.object_id: print("callback is finished") return STATUS_CALLBACK_DONE else: print("callback is error") return STATUS_CALLBACK_ERROR if cmd_id == TF_PY_SESSION_INIT: print("SessionInit!!!") param1 = parse_param(vm_id, mm, param, param.param1) print(param1) sess = tf.Session(param1) # assign object_id object_dict[object_id] = sess param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_SESSION_ENTER: sess = object_dict[param.base.object_id] ctx_sess = 
sess.__enter__() if sess is ctx_sess: pass else: # unlikely print("unlikely to search for sess") param.base.object_id = next( obj_id for obj_id, obj in object_dict.items() if obj is ctx_sess) elif cmd_id == TF_PY_SESSION_EXIT: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) sess = object_dict[param.base.object_id] sess.__exit__(param1, param2, param3) elif cmd_id == TF_PY_SESSION_DEL: sess = object_dict[param.base.object_id] sess.__del__() # deprecated elif cmd_id == TF_PY_SESSION_RUN: sess = object_dict[param.base.object_id] param1 = parse_param(vm_id, mm, param, param.param1) if type(param1) == NwObject: print("get NwObject=%d" % param1.object_id()) param1 = object_dict[param1.object_id()] print(param1) ret_val = sess.run(param1) print(ret_val) writeback_result(vm_id, mm, param, param.ret_val1, ret_val) elif cmd_id == TF_PY_TPU_CLUSTER_RESOLVER_INIT: print("resloverInit!!!") param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) if param1 is None: param1 = None if param2 is None: param2 = None if param3 is None: param3 = None print("TPUClusterResolver", param1, param2, param3) tpu_grpc = tf.contrib.cluster_resolver.TPUClusterResolver( tpu=param1, zone=param2, project=param3) # assign object_id object_dict[object_id] = tpu_grpc param.base.object_id = object_id print("assign obj_id=%d" % object_id) object_id += 1 # deprecated elif cmd_id == TF_PY_TPU_CLUSTER_RESOLVER_MASTER: # FIXED: use __getattr__ print("master!!") tpu_grpc = object_dict[param.base.object_id] # FIXED: may have parameters tpu_grpc_url = tpu_grpc.master() # serialize return value writeback_result(vm_id, mm, param, param.ret_val1, tpu_grpc_url) elif cmd_id == TF_PY_TPU_INITIALIZE_SYSTEM: # TODO: may have parameters ts = tpu.initialize_system() object_dict[object_id] = ts 
param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_TPU_SHUTDOWN_SYSTEM: # TODO: may have parameters ts = tpu.shutdown_system() object_dict[object_id] = ts param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_GLOBAL_VARIABLES_INITIALIZER: # TODO: may have parameters ts = tf.global_variables_initializer() object_dict[object_id] = ts param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_ONES: print("param1 size=%ld,offset=%ld" % (param.param1.size, param.param1.offset)) param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) if param2 is None: param2 = dtypes.float32 print(param2) var = tf.ones(param1, param2) object_dict[object_id] = var param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_RANDOM_UNIFORM: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param4 = parse_param(vm_id, mm, param, param.param4) param5 = parse_param(vm_id, mm, param, param.param5) param6 = parse_param(vm_id, mm, param, param.param6) if param2 is None: param2 = 0 if param4 is None: param4 = dtypes.float32 print(param1, param2, param3, param4) var = tf.random_uniform(param1, param2, param3, param4, param5, param6) object_dict[object_id] = var param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_TRANSPOSE: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param4 = parse_param(vm_id, mm, param, param.param4) param1 = object_dict[param1.object_id()] if param3 is None: param3 = "transpose" if param4 is None: param4 = False print("transpose", param1, param2, param3, param4) var = tf.transpose(param1, param2, param3, param4) object_dict[object_id] = var param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_CAST: param1 = parse_param(vm_id, 
mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param1 = object_dict[param1.object_id()] print("cast", param1, param2, param3) var = tf.cast(param1, param2, param3) object_dict[object_id] = var param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_EXPAND_DIMS: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param4 = parse_param(vm_id, mm, param, param.param4) param1 = object_dict[param1.object_id()] print("expand_dims", param1, param2, param3, param4) var = tf.expand_dims(param1, param2, param3, param4) object_dict[object_id] = var param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_CONCAT: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param1 = object_dict[param1.object_id()] if param3 is None: param3 = "concat" print("concat", param1, param2, param3) var = tf.concat(param1, param2, param3) object_dict[object_id] = var param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_EQUAL: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param1 = object_dict[param1.object_id()] print("equal", param1, param2, param3) if isinstance(param2, NwObject): param2 = object_dict[param2.object_id()] result = tf.equal(param1, param2, param3) print(result) object_dict[object_id] = result param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_FIXED_LEN_FEATURE: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) feature = tf.FixedLenFeature(param1, param2, param3) print(feature) object_dict[object_id] = feature 
param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_VAR_LEN_FEATURE: param1 = parse_param(vm_id, mm, param, param.param1) feature = tf.VarLenFeature(param1) print(feature) object_dict[object_id] = feature param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_PARSE_SINGLE_EXAMPLE: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param4 = parse_param(vm_id, mm, param, param.param4) print(param1, param2) # expand embedded NwObject if isinstance(param1, NwObject): param1 = object_dict[param1.object_id()] dict_walker(param2) print("after translation", param1, param2) result = tf.parse_single_example(param1, param2, param3, param4) print(result) dict_mapper(result) print(result) writeback_result(vm_id, mm, param, param.ret_val1, result) elif cmd_id == TF_PY_CONTROL_FLOW_OPS_SWITCH: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param4 = parse_param(vm_id, mm, param, param.param4) param1 = object_dict[param1.object_id()] param2 = object_dict[param2.object_id()] print("switch", param1, param2, param3, param4) result = control_flow_ops.switch(param1, param2, param3, param4) print(result) mapped_tuple = tuple_mapper(result, [0, 1]) print(mapped_tuple) writeback_result(vm_id, mm, param, param.ret_val1, mapped_tuple) elif cmd_id == TF_PY_CONTROL_FLOW_OPS_MERGE: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param1 = object_dict[param1.object_id()] print("merge", param1, param2) list_walker(param1) print("merge-new", param1, param2) result = control_flow_ops.merge(param1, param2) print(result) mapped_tuple = tuple_mapper(result, [0]) print(mapped_tuple) writeback_result(vm_id, mm, param, param.ret_val1, mapped_tuple) elif cmd_id == TF_PY_TPU_REWRITE: # TODO: may have 
parameters param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) # default parameter if param2 is None: param2 = None # expand embedded NwObject list_walker(param2) func = tpu.rewrite(param1, param2) object_dict[object_id] = func param.base.object_id = object_id print("rewrite object_id=%d" % object_id) object_id += 1 elif cmd_id == TF_PY_TPU_RUN_CONFIG: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param4 = parse_param(vm_id, mm, param, param.param4) param5 = parse_param(vm_id, mm, param, param.param5) # default parameter if param1 is None: param1 = None if param2 is None: param2 = None if param3 is None: param3 = None if param4 is None: param4 = None # expand embedded NwObject param4 = object_dict[param4.object_id()] print(param4, param5) func = tpu.RunConfig(param1, param2, param3, param4, **param5) object_dict[object_id] = func param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_TPU_TPU_ESTIMATOR: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param4 = parse_param(vm_id, mm, param, param.param4) param5 = parse_param(vm_id, mm, param, param.param5) param6 = parse_param(vm_id, mm, param, param.param6) param7 = parse_param(vm_id, mm, param, param.param7) param8 = parse_param(vm_id, mm, param, param.param8) param9 = parse_param(vm_id, mm, param, param.param9) param10 = parse_param(vm_id, mm, param, param.param10) param11 = parse_param(vm_id, mm, param, param.param11) param12 = parse_param(vm_id, mm, param, param.param12) # default parameter if param1 is None: param1 = None if param2 is None: param2 = None if param3 is None: param3 = None if param4 is None: param4 = None if param5 is None: param5 = True if param6 is None: param6 = None if param7 is None: param7 = None if param8 
is None: param8 = None if param9 is None: param9 = None if param10 is None: param10 = True if param11 is None: param11 = True if param12 is None: param12 = None # expand embedded NwObject param3 = object_dict[param3.object_id()] print(param3) func = tpu.TPUEstimator(param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12) object_dict[object_id] = func param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_IMAGE_RESIZE_IMAGES: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param4 = parse_param(vm_id, mm, param, param.param4) param5 = parse_param(vm_id, mm, param, param.param5) # default parameter if param3 is None: param3 = ResizeMethod.BILINEAR if param4 is None: param4 = False if param5 is None: param5 = False # expand embedded NwObject param1 = object_dict[param1.object_id()] print(param1) img = tf.image.resize_images(param1, param2, param3, param4, param5) # TODO: it may return a float object_dict[object_id] = img param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_SLICE: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param4 = parse_param(vm_id, mm, param, param.param4) # expand embedded NwObject print(param1, param2, param3) param1 = object_dict[param1.object_id()] ret = tf.slice(param1, param2, param3, param4) object_dict[object_id] = ret param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_SHAPE: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) if param3 is None: param3 = dtypes.int32 # expand embedded NwObject print(param1, param2, param3) param1 = object_dict[param1.object_id()] ret = tf.shape(param1, param2, param3) object_dict[object_id] = 
ret param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_IMAGE_SAMPLE_DISTORTED_BOUNDING_BOX: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param4 = parse_param(vm_id, mm, param, param.param4) param5 = parse_param(vm_id, mm, param, param.param5) param6 = parse_param(vm_id, mm, param, param.param6) param7 = parse_param(vm_id, mm, param, param.param7) param8 = parse_param(vm_id, mm, param, param.param8) param9 = parse_param(vm_id, mm, param, param.param9) param10 = parse_param(vm_id, mm, param, param.param10) # default parameter if param5 is None: param5 = 0.1 print("sample_distorted_bounding_box", param1, param2) result = tf.image.sample_distorted_bounding_box( param1, param2, param3, param4, param5, param6, param7, param8, param9, param10) print(result) mapped_tuple = tuple_mapper(result, [0, 1, 2]) print(mapped_tuple) writeback_result(vm_id, mm, param, param.ret_val1, mapped_tuple) elif cmd_id == TF_PY_IMAGE_DRAW_BOUNDING_BOXES: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) # expand embedded NwObject print(param1, param2, param3) param1 = object_dict[param1.object_id()] param2 = object_dict[param2.object_id()] ret = tf.image.draw_bounding_boxes(param1, param2, param3) object_dict[object_id] = ret param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_IMAGE_DECODE_JPEG: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param4 = parse_param(vm_id, mm, param, param.param4) param5 = parse_param(vm_id, mm, param, param.param5) param6 = parse_param(vm_id, mm, param, param.param6) param7 = parse_param(vm_id, mm, param, param.param7) param8 = parse_param(vm_id, mm, param, param.param8) if param2 is None: param2 = 
0 if param3 is None: param3 = 1 if param4 is None: param4 = True if param5 is None: param5 = False if param6 is None: param6 = 1 if param7 is None: param7 = "" param1 = object_dict[param1.object_id()] img = tf.image.decode_jpeg(param1, param2, param3, param4, param5, param6, param7, param8) object_dict[object_id] = img param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_IMAGE_CONVERT_IMAGE_DTYPE: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) param4 = parse_param(vm_id, mm, param, param.param4) # expand embedded NwObject print(param1, param2, param3) param1 = object_dict[param1.object_id()] if param3 is None: param3 = False ret = tf.image.convert_image_dtype(param1, param2, param3, param4) object_dict[object_id] = ret param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_DATA_DATASET_LIST_FILES: param1 = parse_param(vm_id, mm, param, param.param1) param2 = parse_param(vm_id, mm, param, param.param2) param3 = parse_param(vm_id, mm, param, param.param3) print(param1, param2, param3) if isinstance(param1, NwObject): param1 = object_dict[oaram1.object_id()] ret = tf.data.Dataset.list_files(param1, param2, param3) object_dict[object_id] = ret param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_NW_OBJECT: print("nw_object!! 
id = %d" % param.base.object_id) obj = object_dict[param.base.object_id] name = parse_param(vm_id, mm, param, param.param1) args = parse_param(vm_id, mm, param, param.param2) kwargs = parse_param(vm_id, mm, param, param.param3) print("NwObject", obj, name, args, kwargs) # expand embedded NwObject args = list(args) list_walker(args) args = tuple(args) dict_walker(kwargs) print("after translation", obj, name, args, kwargs) # run result = getattr(obj, name)(*(args or []), **(kwargs or {})) param.base.object_id = -1 param.ret_val1.size = 0 print("analyze type", type(result), result) # TODO: go through tuple, dict or list if isinstance(result, tuple): result = tuple_mapper(result, range(len(result))) if isinstance(result, dict): dict_mapper(result) if isinstance(result, list): list_mapper(result) # serialize return value if is_unpickleable_type(result) or \ pickle.pickles(result) is False: object_dict[object_id] = result param.base.object_id = object_id object_id += 1 elif result is not None: writeback_result(vm_id, mm, param, param.ret_val1, result) elif cmd_id == TF_PY_NW_METHOD: # Reuse as callback #ins = parse_param(vm_id, mm, param, param.param1) #name = parse_param(vm_id, mm, param, param.param2) #print(ins, name) #method = getattr(ins, name) #print(method) #object_dict[object_id] = method cw = callback_constructor(object_id, callback_param, param, mm, vm_id, queue, kvm_fd) object_dict[object_id] = cw param.base.object_id = object_id object_id += 1 elif cmd_id == TF_PY_NW_CALLBACK_TEST: nw_func = parse_param(vm_id, mm, param, param.param1) print(nw_func, nw_func.object_id()) func = object_dict[nw_func.object_id()] print("callback func", func) x = parse_param(vm_id, mm, param, param.param2) y = parse_param(vm_id, mm, param, param.param3) result = func(x, y) print(result) writeback_result(vm_id, mm, param, param.ret_val1, result) else: print("unsupported Tensorflow API") except Exception, error: param.base.done = STATUS_TASK_ERROR #mm.flush(task.data_ptr, 
sizeof(PARAM_BASE)) print "fault: ", str(error) exc_type, exc_obj, exc_tb = sys.exc_info() fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] print(exc_type, fname, exc_tb.tb_lineno) traceback.print_stack() print("finished [vm#%d] TF task %d cmd %d" % (task.vm_id, task.node_id, param.base.cmd_id)) param.base.done = STATUS_TASK_DONE #mm.flush(task.data_ptr, sizeof(PARAM_BASE)) #mm.flush(INVOKER_FIFO_SIZE + VGPU_DSTORE_SIZE * (vm_id - 1) + # param.base.dstore_offset + param.ret_val1.offset, # param.ret_val1.size) # notify hypervisor ret = fcntl.ioctl(kvm_fd, IOCTL_KVM_NOTIFY_TASK_FINISHED, task.node_id) if ret < 0: print("notify task completion failed: %d\n" % ret)