def testBasicNcclReduce(self):
  inputs = [[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
            [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]]
  expected = [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2]
  group_size = len(inputs)
  group_key = 1
  instance_key = 1

  # Configure virtual GPU devices (1 GB per virtual GPU).
  device_type = 'GPU'
  virtual_devices = [
      config_pb2.GPUOptions.Experimental.VirtualDevices(
          memory_limit_mb=([1 << 10] * group_size))
  ]
  gpu_options = config_pb2.GPUOptions(
      visible_device_list='0',
      experimental=config_pb2.GPUOptions.Experimental(
          virtual_devices=virtual_devices))

  # Configure NCCL.
  experimental = config_pb2.ConfigProto.Experimental(collective_nccl=True)
  os.environ['NCCL_DEBUG'] = 'INFO'
  os.environ['NCCL_LAUNCH_MODE'] = 'PARALLEL'
  config = config_pb2.ConfigProto(
      gpu_options=gpu_options, experimental=experimental)

  devices = ['/{}:{}'.format(device_type, i) for i in range(group_size)]

  with self.session(config=config) as sess:
    if not test_util.is_gpu_available(cuda_only=True):
      self.skipTest('No GPU available')
    colred = []
    for i in range(group_size):
      with ops.device(devices[i]):
        tensor = constant_op.constant(inputs[i])
        colred.append(
            collective_ops.all_reduce(tensor, group_size, group_key,
                                      instance_key, 'Add', 'Div'))
    run_options = config_pb2.RunOptions()
    results = sess.run(colred, options=run_options)
  for i in range(group_size):
    self.assertAllClose(results[i], expected, rtol=1e-5, atol=1e-5)
def testV1Compatibility(self):
  # Ensure we set 1 CPU by default.
  context.context()._config = config_pb2.ConfigProto()
  new_config = context.context().config
  self.assertEqual(new_config.device_count['CPU'], 1)
  context.context()._physical_devices = None

  # Ensure the CPU is split.
  context.context()._config = config_pb2.ConfigProto(device_count={'CPU': 2})
  new_config = context.context().config
  self.assertEqual(new_config.device_count['CPU'], 2)
  context.context()._physical_devices = None

  # Ensure visible device list parsing is handled.
  context.context()._config = config_pb2.ConfigProto(
      gpu_options=config_pb2.GPUOptions(visible_device_list=''))
  gpus = config.list_physical_devices('GPU')
  new_config = context.context().config
  self.assertEqual(new_config.gpu_options.visible_device_list,
                   ','.join(str(i) for i in range(len(gpus))))
  context.context()._physical_devices = None
def testSetConfiguration(self):
  config = config_pb2.ConfigProto(
      gpu_options=config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.1))

  # Configure a server using the default local server options.
  server = server_lib.Server.create_local_server(config=config, start=False)
  self.assertEqual(0.1, server.server_def.default_session_config.gpu_options
                   .per_process_gpu_memory_fraction)

  # Configure a server using an explicit ServerDef with an
  # overridden config.
  cluster_def = server_lib.ClusterSpec({
      "localhost": ["localhost:0"]
  }).as_cluster_def()
  server_def = tensorflow_server_pb2.ServerDef(
      cluster=cluster_def,
      job_name="localhost",
      task_index=0,
      protocol="grpc")
  server = server_lib.Server(server_def, config=config, start=False)
  self.assertEqual(0.1, server.server_def.default_session_config.gpu_options
                   .per_process_gpu_memory_fraction)
def _GetConfigProto(self, run_params, graph_state):
  """Get config proto based on specific settings."""
  if graph_state != GraphState.ORIGINAL and run_params.use_optimizer:
    trt_params = self.GetConversionParams(run_params)
    rewriter_cfg = trt_convert.tensorrt_rewriter_config(
        trt_params.max_batch_size, trt_params.max_workspace_size_bytes,
        trt_params.precision_mode, trt_params.minimum_segment_size,
        trt_params.is_dynamic_op, trt_params.maximum_cached_engines,
        trt_params.cached_engine_batch_sizes)
    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
  else:
    graph_options = config_pb2.GraphOptions()

  gpu_options = config_pb2.GPUOptions()
  gpu_options.allow_growth = True
  if trt_convert.get_linked_tensorrt_version()[0] == 3:
    gpu_options.per_process_gpu_memory_fraction = 0.50

  config = config_pb2.ConfigProto(
      gpu_options=gpu_options, graph_options=graph_options)
  return config
def _GetConfigProto(self, run_params, graph_state):
  """Get config proto based on specific settings."""
  if graph_state != GraphState.ORIGINAL and run_params.use_optimizer:
    rewriter_cfg = rewriter_config_pb2.RewriterConfig()
    rewriter_cfg.optimizers.extend(["constfold", "layout"])
    custom_op = rewriter_cfg.custom_optimizers.add()
    custom_op.name = "TensorRTOptimizer"
    trt_params = self.GetConversionParams(run_params)
    custom_op.parameter_map["max_batch_size"].i = trt_params.max_batch_size
    custom_op.parameter_map["max_workspace_size_bytes"].i = (
        trt_params.max_workspace_size_bytes)
    custom_op.parameter_map["precision_mode"].s = trt_params.precision_mode
    custom_op.parameter_map["minimum_segment_size"].i = (
        trt_params.minimum_segment_size)
    custom_op.parameter_map["is_dynamic_op"].b = trt_params.is_dynamic_op
    custom_op.parameter_map["maximum_cached_engines"].i = (
        trt_params.maximum_cached_engines)
    if trt_params.cached_engine_batches:
      custom_op.parameter_map["cached_engine_batches"].list.i.extend(
          trt_params.cached_engine_batches)
    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
  else:
    graph_options = config_pb2.GraphOptions()

  gpu_options = config_pb2.GPUOptions()
  gpu_options.allow_growth = True
  if trt_convert.get_linked_tensorrt_version()[0] == 3:
    gpu_options.per_process_gpu_memory_fraction = 0.50

  config = config_pb2.ConfigProto(
      gpu_options=gpu_options, graph_options=graph_options)
  return config
def __init__(self, target='', graph=None, config=None):
  """Creates a new interactive TensorFlow session.

  If no `graph` argument is specified when constructing the session,
  the default graph will be launched in the session. If you are
  using more than one graph (created with `tf.Graph()`) in the same
  process, you will have to use different sessions for each graph,
  but each graph can be used in multiple sessions. In this case, it
  is often clearer to pass the graph to be launched explicitly to
  the session constructor.

  Args:
    target: (Optional.) The execution engine to connect to. Defaults
      to using an in-process engine.
    graph: (Optional.) The `Graph` to be launched (described above).
    config: (Optional.) `ConfigProto` proto used to configure the session.
  """
  if not config:
    # If config is not provided, choose some reasonable defaults for
    # interactive use:
    #
    # - Grow GPU memory as needed at the cost of fragmentation.
    gpu_options = config_pb2.GPUOptions(allow_growth=True)
    config = config_pb2.ConfigProto(gpu_options=gpu_options)
  # Interactive sessions always place pruned graphs.
  config.graph_options.place_pruned_graph = True

  super(InteractiveSession, self).__init__(target, graph, config)
  self._default_session = self.as_default()
  self._default_session.enforce_nesting = False
  self._default_session.__enter__()
  self._explicit_graph = graph
  if self._explicit_graph is not None:
    self._default_graph = graph.as_default()
    self._default_graph.enforce_nesting = False
    self._default_graph.__enter__()
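# A minimal usage sketch (assuming TF 1.x graph mode, where
# tf.InteractiveSession is the public entry point for this class).
# Because the constructor installs the session as the default,
# tensors can be evaluated without an explicit `with` block.
import tensorflow as tf

sess = tf.InteractiveSession()  # GPU memory grows on demand by default
a = tf.constant(2.0)
b = tf.constant(3.0)
print((a + b).eval())  # => 5.0, evaluated against the interactive session
sess.close()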
def auto(multi_engine):
  """Run the conversion as an optimization pass."""
  if multi_engine:
    inp_dims = (2, 3, 7, 5)
    orig_graph = get_multi_engine_graph_def()
  else:
    inp_dims = (100, 24, 24, 2)
    orig_graph = get_simple_graph_def()  # use a frozen graph for inference
  dummy_input = np.random.random_sample(inp_dims)

  opt_config = rwpb2.RewriterConfig()
  opt_config.meta_optimizer_iterations = opt_config.ONE
  opt_config.optimizers.extend(["constfold", "layout"])
  custom_op = opt_config.custom_optimizers.add()
  custom_op.name = "TensorRTOptimizer"
  custom_op.parameter_map["minimum_segment_size"].i = 3
  custom_op.parameter_map["precision_mode"].s = to_bytes("FP32")
  custom_op.parameter_map["max_batch_size"].i = inp_dims[0]
  custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25
  print(custom_op)

  gpu_options = None
  if trt.trt_convert.get_linked_tensorrt_version()[0] == 3:
    gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
  graph_options = cpb2.GraphOptions(rewrite_options=opt_config)
  sessconfig = cpb2.ConfigProto(
      gpu_options=gpu_options, graph_options=graph_options)
  print(sessconfig)

  g = ops.Graph()
  ops.reset_default_graph()
  with g.as_default():
    inp, out = importer.import_graph_def(
        graph_def=orig_graph, return_elements=["input", "output"], name="")
    inp = inp.outputs[0]
    out = out.outputs[0]
    with csess.Session(config=sessconfig, graph=g) as sess:
      val = sess.run(out, {inp: dummy_input})
  print(val.shape)
def setUpClass(cls):
  gpu_memory_fraction_opt = (
      "--gpu_memory_fraction=%f" % cls.PER_PROC_GPU_MEMORY_FRACTION)

  worker_port = portpicker.pick_unused_port()
  cluster_spec = "worker|localhost:%d" % worker_port
  tf_logging.info("cluster_spec: %s", cluster_spec)

  server_bin = test.test_src_dir_path(
      "python/debug/grpc_tensorflow_server.par")

  cls.server_target = "grpc://localhost:%d" % worker_port

  cls.server_procs = {}
  cls.server_procs["worker"] = subprocess.Popen(
      [
          server_bin,
          "--logtostderr",
          "--cluster_spec=%s" % cluster_spec,
          "--job_name=worker",
          "--task_id=0",
          gpu_memory_fraction_opt,
      ],
      stdout=sys.stdout,
      stderr=sys.stderr)

  # Start debug server in-process, on separate thread.
  (cls.debug_server_port, cls.debug_server_url, _, cls.debug_server_thread,
   cls.debug_server
  ) = grpc_debug_test_server.start_server_on_separate_thread(
      dump_to_filesystem=False)

  tf_logging.info("debug server url: %s", cls.debug_server_url)

  cls.session_config = config_pb2.ConfigProto(
      gpu_options=config_pb2.GPUOptions(
          per_process_gpu_memory_fraction=cls.PER_PROC_GPU_MEMORY_FRACTION))
def _GetConfigProto(self):
  """Get ConfigProto for session creation."""
  config = config_pb2.ConfigProto(
      gpu_options=config_pb2.GPUOptions(allow_growth=True))
  return config
from tensorflow_recommenders_addons import dynamic_embedding as de

from tensorflow.core.protobuf import config_pb2
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import test_util
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.training import adam
from tensorflow.python.training import monitored_session
from tensorflow.python.training import training_util

default_config = config_pb2.ConfigProto(
    allow_soft_placement=True,
    gpu_options=config_pb2.GPUOptions(allow_growth=True))


class HorovodTest(test.TestCase):

  @test_util.deprecated_graph_mode_only
  def test_adam_minimize_trainable(self):
    base_opt = adam.AdamOptimizer(1.0)
    test_opt = adam.AdamOptimizer(1.0)
    self.common_minimize_trainable(base_opt, test_opt, name="adam")

  def common_minimize_trainable(self, base_opt, test_opt, name):
    from tensorflow.python.framework.errors_impl import NotFoundError
    # TODO(rhdong): Recover the testing, if the horovod import error is
    # fixed on macOS + TF 2.7+.
    try:
      import horovod.tensorflow as hvd
def _GetGPUOptions(self):
  gpu_options = config_pb2.GPUOptions()
  gpu_options.allow_growth = True
  return gpu_options
def _poll_server_till_success(max_attempts,
                              sleep_per_poll_sec,
                              debug_server_url,
                              dump_dir,
                              server,
                              gpu_memory_fraction=1.0):
  """Poll server until success or exceeding max polling count.

  Args:
    max_attempts: (int) How many times to poll at maximum.
    sleep_per_poll_sec: (float) How many seconds to sleep for after each
      unsuccessful poll.
    debug_server_url: (str) gRPC URL to the debug server.
    dump_dir: (str) Dump directory to look for files in. If None, will
      directly check data from the server object.
    server: The server object.
    gpu_memory_fraction: (float) Fraction of GPU memory to be allocated for
      the Session used in server polling.

  Returns:
    (bool) Whether the polling succeeded within `max_attempts` attempts.
  """
  poll_count = 0
  config = config_pb2.ConfigProto(
      gpu_options=config_pb2.GPUOptions(
          per_process_gpu_memory_fraction=gpu_memory_fraction))
  with session.Session(config=config) as sess:
    for poll_count in range(max_attempts):
      server.clear_data()
      print("Polling: poll_count = %d" % poll_count)

      x_init_name = "x_init_%d" % poll_count
      x_init = constant_op.constant([42.0], shape=[1], name=x_init_name)
      x = variables.Variable(x_init, name=x_init_name)

      run_options = config_pb2.RunOptions()
      debug_utils.add_debug_tensor_watch(
          run_options, x_init_name, 0, debug_urls=[debug_server_url])
      try:
        sess.run(x.initializer, options=run_options)
      except errors.FailedPreconditionError:
        pass

      if dump_dir:
        if os.path.isdir(
            dump_dir) and debug_data.DebugDumpDir(dump_dir).size > 0:
          shutil.rmtree(dump_dir)
          print("Poll succeeded.")
          return True
        else:
          print("Poll failed. Sleeping for %f s" % sleep_per_poll_sec)
          time.sleep(sleep_per_poll_sec)
      else:
        if server.debug_tensor_values:
          print("Poll succeeded.")
          return True
        else:
          print("Poll failed. Sleeping for %f s" % sleep_per_poll_sec)
          time.sleep(sleep_per_poll_sec)
  return False
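# A hypothetical invocation sketch of the helper above; the address,
# attempt counts, and `debug_server` object are illustrative assumptions,
# not values taken from the source.
succeeded = _poll_server_till_success(
    max_attempts=10,
    sleep_per_poll_sec=2.0,
    debug_server_url="grpc://localhost:6064",  # hypothetical address
    dump_dir=None,        # None => inspect server.debug_tensor_values directly
    server=debug_server,  # assumed grpc_debug_test_server instance
    gpu_memory_fraction=0.1)
if not succeeded:
  raise RuntimeError("Debug server did not respond in time.")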
def main(unused_argv=None):
  tf.logging.set_verbosity(tf.logging.INFO)
  # if not tf.gfile.Exists(FLAGS.output_dir):
  #   tf.gfile.MkDir(FLAGS.output_dir)

  if FLAGS.tensorrt:
    gpu_options = None
    print(trt.trt_convert.get_linked_tensorrt_version())
    gpu_options = cpb2.GPUOptions(
        per_process_gpu_memory_fraction=_GPU_MEM_FRACTION)
    sessconfig = cpb2.ConfigProto(gpu_options=gpu_options)
  else:
    sessconfig = None

  # Instantiate video capture object.
  cap = cv2.VideoCapture(1)

  # Set resolution.
  # if resolution is not None:
  x_length, y_length = (1024, 1280)
  cap.set(3, x_length)  # 3 and 4 are OpenCV property IDs.
  cap.set(4, y_length)
  cap.read()
  x_new = int(cap.get(3))
  y_new = int(cap.get(4))
  print('Resolution is: {0} by {1}'.format(x_new, y_new))

  with tf.Graph().as_default(), tf.Session(config=sessconfig) as sess:
    # TODO: calculate these dimensions dynamically (they can't use None,
    # since TensorRT needs precalculated dimensions).

    # Defines the placeholder for the style image.
    style_img_ph = tf.placeholder(
        tf.float32, shape=[200, 1200, 3], name="style_img_ph")
    if FLAGS.style_square_crop:
      style_img_preprocessed = image_utils.center_crop_resize_image(
          style_img_ph, FLAGS.style_image_size)
    else:
      style_img_preprocessed = image_utils.resize_image(
          style_img_ph, FLAGS.style_image_size)

    # Defines the placeholder for the content image.
    content_img_ph = tf.placeholder(
        tf.float32, shape=[200, 1200, 3], name="content_img_ph")
    if FLAGS.content_square_crop:
      content_img_preprocessed = image_utils.center_crop_resize_image(
          content_img_ph, FLAGS.image_size)
    else:
      content_img_preprocessed = image_utils.resize_image(
          content_img_ph, FLAGS.image_size)

    # Defines the model.
    stylized_images, _, _, bottleneck_feat = build_model.build_model(
        content_img_preprocessed,
        style_img_preprocessed,
        trainable=False,
        is_training=False,
        inception_end_point='Mixed_6e',
        style_prediction_bottleneck=100,
        adds_losses=False)
    print(stylized_images)
    print(bottleneck_feat)

    if tf.gfile.IsDirectory(FLAGS.checkpoint):
      checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint)
    else:
      checkpoint = FLAGS.checkpoint
    tf.logging.info('loading latest checkpoint file: {}'.format(checkpoint))

    init_fn = slim.assign_from_checkpoint_fn(checkpoint,
                                             slim.get_variables_to_restore())
    sess.run([tf.local_variables_initializer()])
    init_fn(sess)
    tf.train.write_graph(sess.graph_def, '.', 'model.pbtxt')

    if FLAGS.tensorrt:
      # We use a built-in TF helper to export variables to constants.
      output_graph_def = tf.graph_util.convert_variables_to_constants(
          sess,  # The session is used to retrieve the weights.
          # The graph_def is used to retrieve the nodes.
          tf.get_default_graph().as_graph_def(),
          # The output node names are used to select the useful nodes.
          ['transformer/expand/conv3/conv/Sigmoid'])
      trt_graph = trt.create_inference_graph(
          input_graph_def=output_graph_def,
          outputs=["transformer/expand/conv3/conv/Sigmoid"],
          max_workspace_size_bytes=5 << 30,
          max_batch_size=1,
          precision_mode="FP16",  # TRT engine precision: "FP32", "FP16" or "INT8".
          minimum_segment_size=10)
      bottleneck_feat_O, content_img_ph_O, stylized_images_O = (
          importer.import_graph_def(
              graph_def=trt_graph,
              return_elements=[
                  "Conv/BiasAdd", "content_img_ph",
                  "transformer/expand/conv3/conv/Sigmoid"
              ]))
      bottleneck_feat_O = bottleneck_feat_O.outputs[0]
      content_img_ph_O = content_img_ph_O.outputs[0]
      stylized_images_O = stylized_images_O.outputs[0]
      print("bottleneck opt:" + str(bottleneck_feat_O))
      print(content_img_ph_O)
      print(stylized_images_O)

    # Gets the list of the input style images.
    # style_img_list = tf.gfile.Glob(FLAGS.style_images_paths)
    # if len(style_img_list) > FLAGS.maximum_styles_to_evaluate:
    #   np.random.seed(1234)
    #   style_img_list = np.random.permutation(style_img_list)
    #   style_img_list = style_img_list[:FLAGS.maximum_styles_to_evaluate]

    # Gets the list of input content images.
    # content_img_list = tf.gfile.Glob(FLAGS.content_images_paths)

    # for style_i, style_img_path in enumerate(style_img_list):
    #   if style_i > FLAGS.maximum_styles_to_evaluate:
    #     break
    #   if style_i % 10 == 0:
    #     tf.logging.info('Stylizing %s with (%d) %s' %
    #                     (content_img_name, style_i, style_img_name))

    interpolation_weight = FLAGS.interpolation_weight
    activate_style = None
    while True:
      start = timer()  # Calculating style isn't the major FPS bottleneck.
      current_style = Style.objects.filter(is_active=True).first()
      if activate_style != current_style:
        activate_style = current_style
        style_img_path = activate_style.source_file.path
        print("current image is " + style_img_path)
        style_img_name = "bricks"
        style_image_np = image_utils.load_np_image_uint8(
            style_img_path)[:, :, :3]
        style_image_np = cv2.resize(style_image_np, (1200, 200))

        # Saves the preprocessed style image.
        style_img_croped_resized_np = sess.run(
            style_img_preprocessed, feed_dict={style_img_ph: style_image_np})
        # image_utils.save_np_image(style_img_croped_resized_np,
        #                           os.path.join(FLAGS.output_dir,
        #                                        '%s.jpg' % (style_img_name)))

        # Computes bottleneck features of the style prediction network for
        # the given style image.
        style_params = sess.run(
            bottleneck_feat, feed_dict={style_img_ph: style_image_np})

      # for content_i, content_img_path in enumerate(content_img_list):
      ret, frame = cap.read()
      print("webcam image: " + str(frame.shape))
      # Crop to get the unusual 1200x200 format.
      content_img_np = frame[500:700, 80:1280]
      # content_img_np = frame
      print("cropped image:" + str(content_img_np.shape))
      # content_img_np = image_utils.load_np_image_uint8(
      #     content_img_path)[:, :, :3]
      # content_img_name = os.path.basename(content_img_path)[:-4]
      content_img_name = "webcam"

      # Saves the preprocessed content image.
      print("Input image:" + str(content_img_np.shape))
      inp_img_croped_resized_np = sess.run(
          content_img_preprocessed,
          feed_dict={content_img_ph: content_img_np})
      # image_utils.save_np_image(inp_img_croped_resized_np,
      #                           os.path.join(FLAGS.output_dir,
      #                                        '%s.jpg' % (content_img_name)))

      # Computes bottleneck features of the style prediction network for the
      # identity transform.
      identity_params = sess.run(
          bottleneck_feat, feed_dict={style_img_ph: content_img_np})

      # Interpolates between the parameters of the identity transform and
      # style parameters of the given style image.
      wi = interpolation_weight
      style_np = identity_params * (1 - wi) + style_params * wi
      if FLAGS.tensorrt:
        style_np = np.reshape(style_np, (1, 100, 1, 1))
        stylized_image_res = sess.run(
            stylized_images_O,
            feed_dict={
                bottleneck_feat_O: style_np,
                content_img_ph_O: content_img_np
            })
      else:
        stylized_image_res = sess.run(
            stylized_images,
            feed_dict={
                bottleneck_feat: style_np,
                content_img_ph: content_img_np
            })
      end = timer()
      print(end - start)
      print(stylized_image_res.shape)
      # Saves the stylized image.
      # image_utils.save_np_image(
      #     stylized_image_res,
      #     os.path.join(FLAGS.output_dir, '%s_stylized_%s_%d.jpg' %
      #                  (content_img_name, style_img_name, interp_i)))
      display_np_image(stylized_image_res, FLAGS.showFullScreen)
      print(stylized_image_res.shape)
      # if cv2.waitKey(1) & 0xFF == ord('q'):
      #   break
      # img_out = np.squeeze(stylized_image_res).astype(np.uint8)
      # img_out = cv2.cvtColor(img_out, cv2.COLOR_BGR2RGB)
      # cv2.imshow('frame', img_out)

      key = cv2.waitKey(10)
      print("Key " + str(key))
      if key == 27:  # Esc: quit.
        break
      elif key == 192:
        FLAGS.showFullScreen = False
        cv2.setWindowProperty("window", cv2.WND_PROP_FULLSCREEN,
                              cv2.WINDOW_NORMAL)
      elif key == 233 or key == 193:
        FLAGS.showFullScreen = True
        cv2.setWindowProperty("window", cv2.WND_PROP_FULLSCREEN,
                              cv2.WINDOW_FULLSCREEN)
      elif key == 60:  # '<': less interpolation.
        interpolation_weight -= 0.25
      elif key == 62:  # '>': more interpolation.
        interpolation_weight += 0.25
      # if cv2.waitKey(1) & 0xFF == ord('q'):
      #   break

  cap.release()
  cv2.destroyAllWindows()
def __init__(self,
             master=None,
             num_cores=0,
             log_device_placement=False,
             gpu_memory_fraction=1,
             tf_random_seed=None,
             save_summary_steps=100,
             save_checkpoints_secs=_USE_DEFAULT,
             save_checkpoints_steps=None,
             keep_checkpoint_max=5,
             keep_checkpoint_every_n_hours=10000,
             evaluation_master='',
             model_dir=None,
             session_config=None):
  """Constructor.

  Note that the superclass `ClusterConfig` may set properties like
  `cluster_spec`, `is_chief`, `master` (if `None` in the args),
  `num_ps_replicas`, `task_id`, and `task_type` based on the `TF_CONFIG`
  environment variable. See `ClusterConfig` for more details.

  Args:
    master: TensorFlow master. Defaults to empty string for local.
    num_cores: Number of cores to be used. If 0, the system picks an
      appropriate number (default: 0).
    log_device_placement: Log the op placement to devices (default: False).
    gpu_memory_fraction: Fraction of GPU memory used by the process on
      each GPU uniformly on the same machine.
    tf_random_seed: Random seed for TensorFlow initializers.
      Setting this value allows consistency between reruns.
    save_summary_steps: Save summaries every this many steps.
    save_checkpoints_secs: Save checkpoints every this many seconds. Can
      not be specified with `save_checkpoints_steps`.
    save_checkpoints_steps: Save checkpoints every this many steps. Can
      not be specified with `save_checkpoints_secs`.
    keep_checkpoint_max: The maximum number of recent checkpoint files to
      keep. As new files are created, older files are deleted. If None or
      0, all checkpoint files are kept. Defaults to 5 (that is, the 5 most
      recent checkpoint files are kept.)
    keep_checkpoint_every_n_hours: Number of hours between each checkpoint
      to be saved. The default value of 10,000 hours effectively disables
      the feature.
    evaluation_master: the master on which to perform evaluation.
    model_dir: directory where model parameters, graph, etc. are saved. If
      `None`, see `Estimator` about where the model will be saved.
    session_config: a ConfigProto used to set session parameters, or None.
      Note - using this argument, it is easy to provide settings which
      break otherwise perfectly good models. Use with care.
  """
  super(RunConfig, self).__init__(
      master=master, evaluation_master=evaluation_master)

  gpu_options = config_pb2.GPUOptions(
      per_process_gpu_memory_fraction=gpu_memory_fraction)
  self._tf_config = config_pb2.ConfigProto(
      log_device_placement=log_device_placement,
      inter_op_parallelism_threads=num_cores,
      intra_op_parallelism_threads=num_cores,
      gpu_options=gpu_options)

  self._tf_random_seed = tf_random_seed
  self._save_summary_steps = save_summary_steps
  self._save_checkpoints_secs = save_checkpoints_secs
  self._session_config = session_config
  if save_checkpoints_secs == RunConfig._USE_DEFAULT:
    if save_checkpoints_steps is None:
      self._save_checkpoints_secs = 600
    else:
      self._save_checkpoints_secs = None
  self._save_checkpoints_steps = save_checkpoints_steps

  # TODO(weiho): Remove these after ModelFn refactoring, when users can
  # create Scaffold and Saver in their model_fn to set these.
  self._keep_checkpoint_max = keep_checkpoint_max
  self._keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours
  self._model_dir = model_dir
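# A hypothetical usage sketch for this contrib-style RunConfig; all values
# below are illustrative, not from the source. gpu_memory_fraction flows
# into the ConfigProto built above as
# GPUOptions.per_process_gpu_memory_fraction.
run_config = RunConfig(
    gpu_memory_fraction=0.5,    # cap each process at half the GPU's memory
    tf_random_seed=42,          # make reruns comparable
    save_checkpoints_secs=300,
    keep_checkpoint_max=3,
    model_dir='/tmp/my_model')  # hypothetical path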
def common_run_context(self, var_list, opt_list, name):
  batch_size = 2
  sample_length = 3
  emb_domain_list = list()
  tws = list()

  cluster = ps_worker_cluster(ps_num=2)
  ps_servers, worker_servers, cluster_def = cluster

  config = config_pb2.ConfigProto(
      cluster_def=cluster_def,
      experimental=config_pb2.ConfigProto.Experimental(
          share_session_state_in_clusterspec_propagation=True,),
      allow_soft_placement=False,
      inter_op_parallelism_threads=2,
      intra_op_parallelism_threads=2,
      gpu_options=config_pb2.GPUOptions(allow_growth=True),
  )

  dev_placement = device_setter.replica_device_setter(
      ps_tasks=2,
      ps_device='/job:ps',
      worker_device='/job:worker',
      cluster=cluster_def,
  )

  with ops.device(dev_placement):
    shared_var_0 = deo.get_variable(
        'distributed_sp_var_0',
        initializer=0.0,
        devices=['/job:worker/task:0'],
        dim=8)
    shared_var_1 = deo.get_variable(
        'distributed_sp_var_1',
        initializer=0.0,
        devices=['/job:worker/task:0'],
        dim=4)
    opt_list = get_multiple_optimizers()

    distributed_var_list = [shared_var_0, shared_var_1]
    for _v in distributed_var_list:
      ids = random_ops.random_uniform(
          (batch_size, sample_length), maxval=1000000, dtype=_v.key_dtype)
      ids = array_ops.reshape(ids, (-1,))

      _, tw = deo.embedding_lookup(_v, ids, return_trainable=True)
      tws.append(tw)
      _collapse = array_ops.reshape(tw, (batch_size, -1))
      _logits = math_ops.reduce_sum(_collapse, axis=1)
      _logits = math_ops.cast(_logits, dtypes.float32)
      emb_domain_list.append(_logits)
    logits = math_ops.add_n(emb_domain_list)

    labels = array_ops.zeros((batch_size,), dtype=dtypes.float32)
    loss = math_ops.reduce_mean(
        nn_impl.sigmoid_cross_entropy_with_logits(
            logits=logits,
            labels=labels,
        ))

    _train_ops = list()
    for _opt in opt_list:
      _train_ops.append(_opt.minimize(loss))
    train_op = control_flow_ops.group(_train_ops)

    restrictor = dvr.VariableRestrictor(
        var_list=distributed_var_list, optimizer_list=opt_list)
    update_op = restrictor.update()

    threshold = int(batch_size * sample_length * 1.5)
    factor = 1.2
    restrict_op = restrictor.restrict(threshold=threshold, factor=factor)

  policies = list(itertools.chain(*restrictor.policy_group.values()))
  tstp_vars = [policy.tstp_var for policy in policies]
  slot_vars = list()
  for tw in tws:
    for opt in opt_list:
      slot_vars += select_slot_vars(tw, opt)

  with session.Session(worker_servers[0].target, config=config) as sess:
    sess.run(variables.global_variables_initializer())
    n, MAX_ITER = 0, 1000
    while n < MAX_ITER:
      n += 1  # Bound the loop; originally `n` was never incremented.
      sess.run([train_op, update_op])
      if all(
          sess.run(var.size()) > threshold * factor
          for var in distributed_var_list):
        break

    s1 = sess.run([var.size() for var in distributed_var_list])
    s2 = sess.run([tv.size() for tv in tstp_vars])
    s3 = sess.run([sv.size() for sv in slot_vars])

    self.assertAllGreater(s1, threshold * factor)
    self.assertAllGreater(s2, threshold * factor)
    if s3:
      self.assertAllGreater(s3, threshold * factor)

    sess.run(restrict_op)

    s1 = sess.run([var.size() for var in distributed_var_list])
    s2 = sess.run([tv.size() for tv in tstp_vars])
    s3 = sess.run([sv.size() for sv in slot_vars])

    self.assertAllLess(s1, threshold * factor + 1)
    self.assertAllLess(s2, threshold * factor + 1)
    if s3:
      self.assertAllLess(s3, threshold * factor + 1)
  sess.close()
import tensorflow as tf

from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import device_lib

virtual_device_gpu_options = config_pb2.GPUOptions(
    visible_device_list='0',
    experimental=config_pb2.GPUOptions.Experimental(virtual_devices=[
        config_pb2.GPUOptions.Experimental.VirtualDevices(
            memory_limit_mb=[200, 300])
    ]))
config = config_pb2.ConfigProto(gpu_options=virtual_device_gpu_options)
device_lib.list_local_devices(session_config=config)

with tf.Session(config=config) as sess:
  with tf.device('/gpu:1'):
    result = sess.run(tf.constant(42))
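# For comparison, a sketch of the TF 2.x way to split one physical GPU
# into two logical devices of 200 MB and 300 MB via the tf.config API.
# This is an addition for illustration, not part of the snippet above;
# it must run before the GPUs are first initialized.
import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  tf.config.set_logical_device_configuration(
      gpus[0],
      [tf.config.LogicalDeviceConfiguration(memory_limit=200),
       tf.config.LogicalDeviceConfiguration(memory_limit=300)])
  print(tf.config.list_logical_devices('GPU'))  # two logical GPUs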
def common_minimize_trainable(self, base_opt, test_opt, name):
  base_opt = de.DynamicEmbeddingOptimizer(base_opt)
  test_opt = de.DynamicEmbeddingOptimizer(test_opt)
  id = 0
  config = config_pb2.ConfigProto(
      allow_soft_placement=True,
      gpu_options=config_pb2.GPUOptions(allow_growth=True),
  )
  for (
      num_shards,
      k_dtype,
      d_dtype,
      initial_mode,
      dim,
      run_step,
  ) in itertools.product(
      [1, 2],
      [dtypes.int64],
      [dtypes.float32],
      ["constant"],
      [1, 10],
      [10],
  ):
    with self.session(config=config, use_gpu=test_util.is_gpu_available()):
      id += 1
      raw_init_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
      raw_init_vals = [
          [
              x,
          ] * dim
          for x in [0.0, 0.1, 0.3, 0.8, 0.16, 0.25, 0.36, 0.49, 0.64, 0.81]
      ]
      raw_ids = constant_op.constant([1, 3, 3, 9], dtype=k_dtype)
      sp_ids = sparse_tensor.SparseTensor(
          indices=[
              [0, 0],
              [0, 1],
              [1, 0],
              [2, 1],
          ],
          values=raw_ids,
          dense_shape=[3, 2],
      )
      x = constant_op.constant([[_x * dim] for _x in [[0.4], [0.5], [0.6]]],
                               dtype=d_dtype)
      x = array_ops.reshape(x, shape=(3 * dim, 1))

      # Prepare the base (dense) variable.
      base_var = variables.Variable(
          np.array(raw_init_vals).reshape([len(raw_init_ids), dim]),
          dtype=d_dtype,
          shape=[len(raw_init_ids), dim],
      )
      base_embedding = embedding_ops.safe_embedding_lookup_sparse(
          base_var, sp_ids, None, combiner="sum")
      base_embedding = array_ops.reshape(base_embedding, shape=[1, 3 * dim])
      pred0 = math_ops.matmul(base_embedding, x)
      loss0 = pred0 * pred0
      base_opt_op = base_opt.minimize(loss0, var_list=[base_var])

      # Prepare the test (dynamic embedding) variable.
      embeddings = de.get_variable(
          "s6030-" + name + str(id),
          key_dtype=k_dtype,
          value_dtype=d_dtype,
          devices=_get_devices() * num_shards,
          initializer=1.0,
          dim=dim,
      )
      self.device_check(embeddings)

      init_ids = constant_op.constant(raw_init_ids, dtype=k_dtype)
      init_vals = constant_op.constant(raw_init_vals, dtype=d_dtype)
      init_op = embeddings.upsert(init_ids, init_vals)
      self.evaluate(init_op)

      # Test branch.
      test_var, trainable = de.safe_embedding_lookup_sparse(
          embeddings,
          sp_ids,
          sparse_weights=None,
          combiner="sum",
          return_trainable=True,
      )
      pred1 = math_ops.matmul(
          array_ops.reshape(test_var, shape=[1, 3 * dim]), x)
      loss1 = pred1 * pred1
      test_opt_op = test_opt.minimize(loss1, var_list=[trainable])

      self.evaluate(variables.global_variables_initializer())
      self.assertAllCloseAccordingToType(
          np.array(raw_init_vals).reshape([len(raw_init_ids), dim]),
          self.evaluate(base_var),
      )

      # Run `run_step` steps for the baseline and the test optimizer.
      for _ in range(run_step):
        self.evaluate(base_opt_op)
      for _ in range(run_step):
        self.evaluate(test_opt_op)

      table_var = array_ops.reshape(
          embeddings.lookup(init_ids), shape=[10, dim])
      # Validate the updated params.
      self.assertAllCloseAccordingToType(
          self.evaluate(base_var),
          self.evaluate(table_var),
          msg="Cond:{},{},{},{},{}".format(num_shards, k_dtype, d_dtype, dim,
                                           run_step),
      )
def __init__(self,
             master=None,
             num_cores=0,
             log_device_placement=False,
             gpu_memory_fraction=1,
             tf_random_seed=None,
             save_summary_steps=100,
             save_checkpoints_secs=_USE_DEFAULT,
             save_checkpoints_steps=None,
             keep_checkpoint_max=5,
             keep_checkpoint_every_n_hours=10000,
             log_step_count_steps=100,
             protocol=None,
             evaluation_master='',
             model_dir=None,
             session_config=None):
  """Constructor.

  The superclass `ClusterConfig` may set properties like `cluster_spec`,
  `is_chief`, `master` (if `None` in the args), `num_ps_replicas`,
  `task_id`, and `task_type` based on the `TF_CONFIG` environment
  variable. See `ClusterConfig` for more details.

  N.B.: If `save_checkpoints_steps` or `save_checkpoints_secs` is set,
  `keep_checkpoint_max` might need to be adjusted accordingly, especially
  in distributed training. For example, setting `save_checkpoints_secs` to
  60 without adjusting `keep_checkpoint_max` (defaults to 5) leads to a
  situation where checkpoints are garbage collected after 5 minutes. In
  distributed training, the evaluation job starts asynchronously and might
  fail to load or find the checkpoint due to a race condition.

  Args:
    master: TensorFlow master. Defaults to empty string for local.
    num_cores: Number of cores to be used. If 0, the system picks an
      appropriate number (default: 0).
    log_device_placement: Log the op placement to devices (default: False).
    gpu_memory_fraction: Fraction of GPU memory used by the process on
      each GPU uniformly on the same machine.
    tf_random_seed: Random seed for TensorFlow initializers.
      Setting this value allows consistency between reruns.
    save_summary_steps: Save summaries every this many steps.
    save_checkpoints_secs: Save checkpoints every this many seconds. Can
      not be specified with `save_checkpoints_steps`.
    save_checkpoints_steps: Save checkpoints every this many steps. Can
      not be specified with `save_checkpoints_secs`.
    keep_checkpoint_max: The maximum number of recent checkpoint files to
      keep. As new files are created, older files are deleted. If None or
      0, all checkpoint files are kept. Defaults to 5 (that is, the 5 most
      recent checkpoint files are kept.)
    keep_checkpoint_every_n_hours: Number of hours between each checkpoint
      to be saved. The default value of 10,000 hours effectively disables
      the feature.
    log_step_count_steps: The frequency, in number of global steps, that
      the global step/sec will be logged during training.
    evaluation_master: the master on which to perform evaluation.
    model_dir: directory where model parameters, graph, etc. are saved. If
      `None`, will use the `model_dir` property in the `TF_CONFIG`
      environment variable. If both are set, they must have the same
      value. If both are `None`, see `Estimator` about where the model
      will be saved.
    session_config: a ConfigProto used to set session parameters, or None.
      Note - using this argument, it is easy to provide settings which
      break otherwise perfectly good models. Use with care.
    protocol: An optional argument which specifies the protocol used when
      starting server. None means default to grpc.
  """
  # Neither parent class calls super().__init__(), so here we have to
  # manually call their __init__() methods.
  ClusterConfig.__init__(
      self, master=master, evaluation_master=evaluation_master)

  # For too long this code didn't call:
  #   core_run_config.RunConfig.__init__(self)
  # so instead of breaking compatibility with that assumption, we
  # just manually initialize this field:
  self._train_distribute = None
  self._eval_distribute = None
  self._experimental_max_worker_delay_secs = None
  self._device_fn = None

  gpu_options = config_pb2.GPUOptions(
      per_process_gpu_memory_fraction=gpu_memory_fraction)
  self._tf_config = config_pb2.ConfigProto(
      log_device_placement=log_device_placement,
      inter_op_parallelism_threads=num_cores,
      intra_op_parallelism_threads=num_cores,
      gpu_options=gpu_options)

  self._tf_random_seed = tf_random_seed
  self._save_summary_steps = save_summary_steps
  self._save_checkpoints_secs = save_checkpoints_secs
  self._log_step_count_steps = log_step_count_steps
  self._protocol = protocol
  self._session_config = session_config
  if save_checkpoints_secs == RunConfig._USE_DEFAULT:
    if save_checkpoints_steps is None:
      self._save_checkpoints_secs = 600
    else:
      self._save_checkpoints_secs = None
  self._save_checkpoints_steps = save_checkpoints_steps

  # TODO(weiho): Remove these after ModelFn refactoring, when users can
  # create Scaffold and Saver in their model_fn to set these.
  self._keep_checkpoint_max = keep_checkpoint_max
  self._keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours
  self._model_dir = _get_model_dir(model_dir)
def common_minimize_trainable(self, base_opt, test_opt, name):
  if test_util.is_gpu_available():
    keys_type_list = [dtypes.int64]
  else:
    keys_type_list = [dtypes.int64, dtypes.string]
  deo.enable_train_mode()
  config = config_pb2.ConfigProto(
      allow_soft_placement=True,
      gpu_options=config_pb2.GPUOptions(allow_growth=True))
  for run_id, num_shards, k_dtype, d_dtype, initial_mode, dim, run_step \
      in _next_run_step_config(keys_type_list):
    with self.session(config=config, use_gpu=test_util.is_gpu_available()):
      raw_init_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
      if k_dtype == dtypes.string:
        raw_init_ids = [str(i) for i in raw_init_ids]
      raw_init_vals = [
          [
              x,
          ] * dim
          for x in [0.0, 0.1, 0.3, 0.8, 0.16, 0.25, 0.36, 0.49, 0.64, 0.81]
      ]

      raw_ids_py = [1, 3, 3, 9]
      raw_ids_nn = constant_op.constant(raw_ids_py, dtype=dtypes.int64)
      sp_ids_nn = sparse_tensor.SparseTensor(
          indices=[
              [0, 0],
              [0, 1],
              [1, 0],
              [2, 1],
          ],
          values=raw_ids_nn,
          dense_shape=[3, 2])
      if k_dtype != dtypes.string:
        raw_ids_de = raw_ids_nn
      else:
        raw_ids_de = constant_op.constant([str(i) for i in raw_ids_py],
                                          dtype=k_dtype)
      sp_ids_de = sparse_tensor.SparseTensor(
          indices=[
              [0, 0],
              [0, 1],
              [1, 0],
              [2, 1],
          ],
          values=raw_ids_de,
          dense_shape=[3, 2])

      x = constant_op.constant([[_x * dim] for _x in [[0.4], [0.5], [0.6]]],
                               dtype=d_dtype)
      x = array_ops.reshape(x, shape=(3 * dim, 1))

      # Prepare the base (dense) variable.
      base_var = variables.Variable(
          np.array(raw_init_vals).reshape([len(raw_init_ids), dim]),
          dtype=d_dtype,
          shape=[len(raw_init_ids), dim])
      base_embedding = embedding_ops.safe_embedding_lookup_sparse(
          base_var, sp_ids_nn, None, combiner='sum')
      base_embedding = array_ops.reshape(base_embedding, shape=[1, 3 * dim])
      pred0 = math_ops.matmul(base_embedding, x)
      loss0 = pred0 * pred0
      base_opt_op = base_opt.minimize(loss0, var_list=[base_var])

      # Prepare the test (dynamic embedding) variable.
      embeddings = deo.get_variable(
          's6030-' + name + str(run_id),
          key_dtype=k_dtype,
          value_dtype=d_dtype,
          devices=_get_devices() * num_shards,
          initializer=1.,
          dim=dim)
      self.device_check(embeddings)

      init_ids = constant_op.constant(raw_init_ids, dtype=k_dtype)
      init_vals = constant_op.constant(raw_init_vals, dtype=d_dtype)
      init_op = embeddings.upsert(init_ids, init_vals)
      self.evaluate(init_op)

      # Test branch.
      test_var, trainable = deo.safe_embedding_lookup_sparse(
          embeddings,
          sp_ids_de,
          sparse_weights=None,
          combiner="sum",
          return_trainable=True)
      pred1 = math_ops.matmul(
          array_ops.reshape(test_var, shape=[1, 3 * dim]), x)
      loss1 = pred1 * pred1
      test_opt_op = test_opt.minimize(loss1, var_list=[trainable])

      self.evaluate(variables.global_variables_initializer())
      self.assertAllCloseAccordingToType(
          np.array(raw_init_vals).reshape([len(raw_init_ids), dim]),
          self.evaluate(base_var))

      # Run `run_step` steps for the baseline and the test optimizer.
      for _ in range(run_step):
        self.evaluate(base_opt_op)
      for _ in range(run_step):
        self.evaluate(test_opt_op)

      table_var = array_ops.reshape(
          embeddings.lookup(init_ids), shape=[10, dim])
      # Validate the updated params.
      self.assertAllCloseAccordingToType(
          self.evaluate(base_var),
          self.evaluate(table_var),
          msg="Cond:{},{},{},{},{}".format(num_shards, k_dtype, d_dtype, dim,
                                           run_step))