Example #1
    def testBasicNcclReduce(self):
        inputs = [[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
                  [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]]
        expected = [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2]
        group_size = len(inputs)
        group_key = 1
        instance_key = 1
        # Configure virtual GPU devices
        device_type = 'GPU'
        virtual_devices = [
            config_pb2.GPUOptions.Experimental.VirtualDevices(
                memory_limit_mb=([1 << 10] * group_size))
        ]  # 1 GB per virtual GPU
        gpu_options = config_pb2.GPUOptions(
            visible_device_list='0',
            experimental=config_pb2.GPUOptions.Experimental(
                virtual_devices=virtual_devices))
        # Configure NCCL
        experimental = config_pb2.ConfigProto.Experimental(
            collective_nccl=True)
        os.environ['NCCL_DEBUG'] = 'INFO'
        os.environ['NCCL_LAUNCH_MODE'] = 'PARALLEL'
        config = config_pb2.ConfigProto(gpu_options=gpu_options,
                                        experimental=experimental)
        devices = ['/{}:{}'.format(device_type, i) for i in range(group_size)]

        with self.session(config=config) as sess:
            if not test_util.is_gpu_available(cuda_only=True):
                self.skipTest('No GPU available')
            colred = []
            for i in range(group_size):
                with ops.device(devices[i]):
                    tensor = constant_op.constant(inputs[i])
                    colred.append(
                        collective_ops.all_reduce(tensor, group_size,
                                                  group_key, instance_key,
                                                  'Add', 'Div'))
            run_options = config_pb2.RunOptions()
            results = sess.run(colred, options=run_options)
        for i in range(group_size):
            self.assertAllClose(results[i], expected, rtol=1e-5, atol=1e-5)
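
As a side note, the same one-GPU-into-many partitioning used above can also be expressed through the newer device configuration API. This is only a minimal sketch, assuming TensorFlow 1.14+/2.x where `tf.config.experimental` is available; the two 1 GB limits mirror the virtual devices in the test:

import tensorflow as tf

# Split the first physical GPU into two 1 GB logical GPUs. This must run
# before any GPU has been initialized by the runtime.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024),
         tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
    print(len(tf.config.experimental.list_logical_devices('GPU')), 'logical GPUs')
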
Example #2
  def testV1Compatibility(self):
    # Ensure we set 1 CPU by default
    context.context()._config = config_pb2.ConfigProto()
    new_config = context.context().config
    self.assertEqual(new_config.device_count['CPU'], 1)
    context.context()._physical_devices = None

    # Ensure CPU is split
    context.context()._config = config_pb2.ConfigProto(device_count={'CPU': 2},)
    new_config = context.context().config
    self.assertEqual(new_config.device_count['CPU'], 2)
    context.context()._physical_devices = None

    # Ensure visible device list parsing is handled
    context.context()._config = config_pb2.ConfigProto(
        gpu_options=config_pb2.GPUOptions(visible_device_list='',),)
    gpus = config.list_physical_devices('GPU')
    new_config = context.context().config
    self.assertEqual(new_config.gpu_options.visible_device_list,
                     ','.join(str(i) for i in range(len(gpus))))
    context.context()._physical_devices = None
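
The `device_count={'CPU': 2}` path exercised above can also be checked end to end by handing the proto to `device_lib`, the same way Example #16 below does for virtual GPUs. A minimal sketch, assuming a standard TensorFlow build:

from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import device_lib

config = config_pb2.ConfigProto(device_count={'CPU': 2})
devices = device_lib.list_local_devices(session_config=config)
# Expect two /device:CPU:N entries when the CPU is split.
print([d.name for d in devices if d.device_type == 'CPU'])
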
Example #3
  def testSetConfiguration(self):
    config = config_pb2.ConfigProto(
        gpu_options=config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.1))

    # Configure a server using the default local server options.
    server = server_lib.Server.create_local_server(config=config, start=False)
    self.assertEqual(0.1, server.server_def.default_session_config.gpu_options.
                     per_process_gpu_memory_fraction)

    # Configure a server using an explicit ServerDef with an
    # overridden config.
    cluster_def = server_lib.ClusterSpec({
        "localhost": ["localhost:0"]
    }).as_cluster_def()
    server_def = tensorflow_server_pb2.ServerDef(
        cluster=cluster_def,
        job_name="localhost",
        task_index=0,
        protocol="grpc")
    server = server_lib.Server(server_def, config=config, start=False)
    self.assertEqual(0.1, server.server_def.default_session_config.gpu_options.
                     per_process_gpu_memory_fraction)
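
Both servers above are created with `start=False`. A hedged sketch of how such a configured in-process server might then be started and used, assuming `session` (tensorflow.python.client) and `constant_op` (tensorflow.python.framework) are imported alongside `server_lib`; the tensor is purely illustrative:

server = server_lib.Server.create_local_server(config=config, start=False)
server.start()
with session.Session(server.target, config=config) as sess:
    # Ops run here execute against the in-process gRPC server.
    print(sess.run(constant_op.constant(3.0)))
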
Example #4
  def _GetConfigProto(self, run_params, graph_state):
    """Get config proto based on specific settings."""
    if graph_state != GraphState.ORIGINAL and run_params.use_optimizer:
      trt_params = self.GetConversionParams(run_params)
      rewriter_cfg = trt_convert.tensorrt_rewriter_config(
          trt_params.max_batch_size, trt_params.max_workspace_size_bytes,
          trt_params.precision_mode, trt_params.minimum_segment_size,
          trt_params.is_dynamic_op, trt_params.maximum_cached_engines,
          trt_params.cached_engine_batch_sizes)

      graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
    else:
      graph_options = config_pb2.GraphOptions()

    gpu_options = config_pb2.GPUOptions()
    gpu_options.allow_growth = True
    if trt_convert.get_linked_tensorrt_version()[0] == 3:
      gpu_options.per_process_gpu_memory_fraction = 0.50

    config = config_pb2.ConfigProto(
        gpu_options=gpu_options, graph_options=graph_options)
    return config
Example #5
    def _GetConfigProto(self, run_params, graph_state):
        """Get config proto based on specific settings."""
        if graph_state != GraphState.ORIGINAL and run_params.use_optimizer:
            rewriter_cfg = rewriter_config_pb2.RewriterConfig()
            rewriter_cfg.optimizers.extend(["constfold", "layout"])
            custom_op = rewriter_cfg.custom_optimizers.add()
            custom_op.name = "TensorRTOptimizer"
            trt_params = self.GetConversionParams(run_params)
            custom_op.parameter_map[
                "max_batch_size"].i = trt_params.max_batch_size
            custom_op.parameter_map["max_workspace_size_bytes"].i = (
                trt_params.max_workspace_size_bytes)
            custom_op.parameter_map[
                "precision_mode"].s = trt_params.precision_mode
            custom_op.parameter_map["minimum_segment_size"].i = (
                trt_params.minimum_segment_size)
            custom_op.parameter_map[
                "is_dynamic_op"].b = trt_params.is_dynamic_op
            custom_op.parameter_map["maximum_cached_engines"].i = (
                trt_params.maximum_cached_engines)
            if trt_params.cached_engine_batches:
                custom_op.parameter_map["cached_engine_batches"].list.i.extend(
                    trt_params.cached_engine_batches)

            graph_options = config_pb2.GraphOptions(
                rewrite_options=rewriter_cfg)
        else:
            graph_options = config_pb2.GraphOptions()

        gpu_options = config_pb2.GPUOptions()
        gpu_options.allow_growth = True
        if trt_convert.get_linked_tensorrt_version()[0] == 3:
            gpu_options.per_process_gpu_memory_fraction = 0.50

        config = config_pb2.ConfigProto(gpu_options=gpu_options,
                                        graph_options=graph_options)
        return config
Example #6
  def __init__(self, target='', graph=None, config=None):
    """Creates a new interactive TensorFlow session.

    If no `graph` argument is specified when constructing the session,
    the default graph will be launched in the session. If you are
    using more than one graph (created with `tf.Graph()`) in the same
    process, you will have to use different sessions for each graph,
    but each graph can be used in multiple sessions. In this case, it
    is often clearer to pass the graph to be launched explicitly to
    the session constructor.

    Args:
      target: (Optional.) The execution engine to connect to.
        Defaults to using an in-process engine.
      graph: (Optional.) The `Graph` to be launched (described above).
      config: (Optional.) `ConfigProto` proto used to configure the session.
    """
    if not config:
      # If config is not provided, choose some reasonable defaults for
      # interactive use:
      #
      #   - Grow GPU memory as needed at the cost of fragmentation.
      gpu_options = config_pb2.GPUOptions(allow_growth=True)
      config = config_pb2.ConfigProto(gpu_options=gpu_options)
    # Interactive sessions always place pruned graphs.
    config.graph_options.place_pruned_graph = True

    super(InteractiveSession, self).__init__(target, graph, config)
    self._default_session = self.as_default()
    self._default_session.enforce_nesting = False
    self._default_session.__enter__()
    self._explicit_graph = graph
    if self._explicit_graph is not None:
      self._default_graph = graph.as_default()
      self._default_graph.enforce_nesting = False
      self._default_graph.__enter__()
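
A minimal usage sketch for the constructor documented above: once an `InteractiveSession` is installed as the default session, tensors can be evaluated with `Tensor.eval()` without passing a session around. Assumes the TF 1.x API (via `tensorflow.compat.v1` here):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
sess = tf.InteractiveSession()  # picks the allow_growth default shown above
a = tf.constant(2.0)
b = tf.constant(3.0)
print((a * b).eval())  # evaluated against the implicit default session
sess.close()
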
Example #7
def auto(multi_engine):
    """Run the conversion as an optimization pass."""
    if multi_engine:
        inp_dims = (2, 3, 7, 5)
        orig_graph = get_multi_engine_graph_def()
    else:
        inp_dims = (100, 24, 24, 2)
        orig_graph = get_simple_graph_def()  # use a frozen graph for inference
    dummy_input = np.random.random_sample(inp_dims)
    opt_config = rwpb2.RewriterConfig()
    opt_config.meta_optimizer_iterations = opt_config.ONE
    opt_config.optimizers.extend(["constfold", "layout"])
    custom_op = opt_config.custom_optimizers.add()
    custom_op.name = "TensorRTOptimizer"
    custom_op.parameter_map["minimum_segment_size"].i = 3
    custom_op.parameter_map["precision_mode"].s = to_bytes("FP32")
    custom_op.parameter_map["max_batch_size"].i = inp_dims[0]
    custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25
    print(custom_op)
    gpu_options = None
    if trt.trt_convert.get_linked_tensorrt_version()[0] == 3:
        gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
    graph_options = cpb2.GraphOptions(rewrite_options=opt_config)
    sessconfig = cpb2.ConfigProto(gpu_options=gpu_options,
                                  graph_options=graph_options)
    print(sessconfig)
    g = ops.Graph()
    ops.reset_default_graph()
    with g.as_default():
        inp, out = importer.import_graph_def(
            graph_def=orig_graph, return_elements=["input", "output"], name="")
        inp = inp.outputs[0]
        out = out.outputs[0]
        with csess.Session(config=sessconfig, graph=g) as sess:
            val = sess.run(out, {inp: dummy_input})
    print(val.shape)
Example #8
    def setUpClass(cls):
        gpu_memory_fraction_opt = ("--gpu_memory_fraction=%f" %
                                   cls.PER_PROC_GPU_MEMORY_FRACTION)

        worker_port = portpicker.pick_unused_port()
        cluster_spec = "worker|localhost:%d" % worker_port
        tf_logging.info("cluster_spec: %s", cluster_spec)

        server_bin = test.test_src_dir_path(
            "python/debug/grpc_tensorflow_server.par")

        cls.server_target = "grpc://localhost:%d" % worker_port

        cls.server_procs = {}
        cls.server_procs["worker"] = subprocess.Popen([
            server_bin,
            "--logtostderr",
            "--cluster_spec=%s" % cluster_spec,
            "--job_name=worker",
            "--task_id=0",
            gpu_memory_fraction_opt,
        ],
                                                      stdout=sys.stdout,
                                                      stderr=sys.stderr)

        # Start debug server in-process, on separate thread.
        (cls.debug_server_port, cls.debug_server_url, _,
         cls.debug_server_thread, cls.debug_server
         ) = grpc_debug_test_server.start_server_on_separate_thread(
             dump_to_filesystem=False)
        tf_logging.info("debug server url: %s", cls.debug_server_url)

        cls.session_config = config_pb2.ConfigProto(
            gpu_options=config_pb2.GPUOptions(
                per_process_gpu_memory_fraction=cls.
                PER_PROC_GPU_MEMORY_FRACTION))
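
A hedged sketch of how a test method in this class would typically consume the attributes prepared by `setUpClass`: open a session against the remote worker using the memory-capped config (the tensor is illustrative):

with session.Session(self.server_target, config=self.session_config) as sess:
    # Graph ops placed here run on the grpc_tensorflow_server worker process.
    print(sess.run(constant_op.constant([1.0, 2.0])))
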
Example #9
  def _GetConfigProto(self):
    """Get ConfigProto for session creation."""
    config = config_pb2.ConfigProto(gpu_options=config_pb2.GPUOptions(
        allow_growth=True))
    return config
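
For reference, the eager/TF 2.x counterpart of `allow_growth` is per-device memory growth; a minimal sketch assuming TensorFlow 1.14+:

import tensorflow as tf

for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)
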
Example #10
from tensorflow_recommenders_addons import dynamic_embedding as de

from tensorflow.core.protobuf import config_pb2
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import test_util
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.training import adam
from tensorflow.python.training import monitored_session
from tensorflow.python.training import training_util

default_config = config_pb2.ConfigProto(
    allow_soft_placement=True,
    gpu_options=config_pb2.GPUOptions(allow_growth=True))


class HorovodTest(test.TestCase):
    @test_util.deprecated_graph_mode_only
    def test_adam_minimize_trainable(self):
        base_opt = adam.AdamOptimizer(1.0)
        test_opt = adam.AdamOptimizer(1.0)
        self.common_minimize_trainable(base_opt, test_opt, name="adam")

    def common_minimize_trainable(self, base_opt, test_opt, name):
        from tensorflow.python.framework.errors_impl import NotFoundError

        # TODO(rhdong): Recover the testing, if the horovod import error is fixed on macOS+TF2.7+.
        try:
            import horovod.tensorflow as hvd
Example #11
  def _GetGPUOptions(self):
    gpu_options = config_pb2.GPUOptions()
    gpu_options.allow_growth = True
    return gpu_options
Example #12
def _poll_server_till_success(max_attempts,
                              sleep_per_poll_sec,
                              debug_server_url,
                              dump_dir,
                              server,
                              gpu_memory_fraction=1.0):
  """Poll server until success or exceeding max polling count.

  Args:
    max_attempts: (int) How many times to poll at maximum
    sleep_per_poll_sec: (float) How many seconds to sleep for after each
      unsuccessful poll.
    debug_server_url: (str) gRPC URL to the debug server.
    dump_dir: (str) Dump directory to look for files in. If None, will directly
      check data from the server object.
    server: The server object.
    gpu_memory_fraction: (float) Fraction of GPU memory to be
      allocated for the Session used in server polling.

  Returns:
    (bool) Whether the polling succeeded within `max_attempts` attempts.
  """
  poll_count = 0

  config = config_pb2.ConfigProto(gpu_options=config_pb2.GPUOptions(
      per_process_gpu_memory_fraction=gpu_memory_fraction))
  with session.Session(config=config) as sess:
    for poll_count in range(max_attempts):
      server.clear_data()
      print("Polling: poll_count = %d" % poll_count)

      x_init_name = "x_init_%d" % poll_count
      x_init = constant_op.constant([42.0], shape=[1], name=x_init_name)
      x = variables.Variable(x_init, name=x_init_name)

      run_options = config_pb2.RunOptions()
      debug_utils.add_debug_tensor_watch(
          run_options, x_init_name, 0, debug_urls=[debug_server_url])
      try:
        sess.run(x.initializer, options=run_options)
      except errors.FailedPreconditionError:
        pass

      if dump_dir:
        if os.path.isdir(
            dump_dir) and debug_data.DebugDumpDir(dump_dir).size > 0:
          shutil.rmtree(dump_dir)
          print("Poll succeeded.")
          return True
        else:
          print("Poll failed. Sleeping for %f s" % sleep_per_poll_sec)
          time.sleep(sleep_per_poll_sec)
      else:
        if server.debug_tensor_values:
          print("Poll succeeded.")
          return True
        else:
          print("Poll failed. Sleeping for %f s" % sleep_per_poll_sec)
          time.sleep(sleep_per_poll_sec)

    return False
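
A hedged sketch of calling this helper from a test, assuming a debug server has been started as in Example #8; the numeric arguments and variable names are illustrative:

ok = _poll_server_till_success(
    max_attempts=10,
    sleep_per_poll_sec=1.0,
    debug_server_url=debug_server_url,  # e.g. "grpc://localhost:<port>"
    dump_dir=None,                      # None: inspect the server object directly
    server=debug_server,
    gpu_memory_fraction=0.1)
assert ok, "debug server did not become ready"
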
Example #13
def main(unused_argv=None):
    tf.logging.set_verbosity(tf.logging.INFO)
    #if not tf.gfile.Exists(FLAGS.output_dir):
    #    tf.gfile.MkDir(FLAGS.output_dir)

    if FLAGS.tensorrt:
        gpu_options = None
        print(trt.trt_convert.get_linked_tensorrt_version())
        gpu_options = cpb2.GPUOptions(
            per_process_gpu_memory_fraction=_GPU_MEM_FRACTION)
        sessconfig = cpb2.ConfigProto(gpu_options=gpu_options)
    else:
        sessconfig = None

    # Instantiate video capture object.
    cap = cv2.VideoCapture(1)

    # Set resolution
    # if resolution is not None:
    x_length, y_length = (1024, 1280)
    cap.set(3, x_length)  # 3 and 4 are OpenCV property IDs.
    cap.set(4, y_length)
    cap.read()
    x_new = int(cap.get(3))
    y_new = int(cap.get(4))
    print('Resolution is: {0} by {1}'.format(x_new, y_new))

    with tf.Graph().as_default(), tf.Session(config=sessconfig) as sess:

        # TODO - calculate these dimensions dynamically (they can't use None,
        # since TensorRT needs precalculated dimensions).

        # Defines place holder for the style image.
        style_img_ph = tf.placeholder(tf.float32,
                                      shape=[200, 1200, 3],
                                      name="style_img_ph")
        if FLAGS.style_square_crop:
            style_img_preprocessed = image_utils.center_crop_resize_image(
                style_img_ph, FLAGS.style_image_size)
        else:
            style_img_preprocessed = image_utils.resize_image(
                style_img_ph, FLAGS.style_image_size)

        # Defines place holder for the content image.
        content_img_ph = tf.placeholder(tf.float32,
                                        shape=[200, 1200, 3],
                                        name="content_img_ph")
        if FLAGS.content_square_crop:
            content_img_preprocessed = image_utils.center_crop_resize_image(
                content_img_ph, FLAGS.image_size)
        else:
            content_img_preprocessed = image_utils.resize_image(
                content_img_ph, FLAGS.image_size)

        # Defines the model.
        stylized_images, _, _, bottleneck_feat = build_model.build_model(
            content_img_preprocessed,
            style_img_preprocessed,
            trainable=False,
            is_training=False,
            inception_end_point='Mixed_6e',
            style_prediction_bottleneck=100,
            adds_losses=False)

        print(stylized_images)
        print(bottleneck_feat)

        if tf.gfile.IsDirectory(FLAGS.checkpoint):
            checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint)
        else:
            checkpoint = FLAGS.checkpoint
            tf.logging.info(
                'loading latest checkpoint file: {}'.format(checkpoint))

        init_fn = slim.assign_from_checkpoint_fn(
            checkpoint, slim.get_variables_to_restore())
        sess.run([tf.local_variables_initializer()])
        init_fn(sess)

        tf.train.write_graph(sess.graph_def, '.', 'model.pbtxt')

        if FLAGS.tensorrt:
            # We use a built-in TF helper to export variables to constants
            output_graph_def = tf.graph_util.convert_variables_to_constants(
                sess,  # The session is used to retrieve the weights
                tf.get_default_graph().as_graph_def(
                ),  # The graph_def is used to retrieve the nodes 
                [
                    'transformer/expand/conv3/conv/Sigmoid'
                ]  # The output node names are used to select the useful nodes
            )

            trt_graph = trt.create_inference_graph(
                input_graph_def=output_graph_def,
                outputs=["transformer/expand/conv3/conv/Sigmoid"],
                max_workspace_size_bytes=5 << 30,
                max_batch_size=1,
                precision_mode=
                "FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
                minimum_segment_size=10)

            bottleneck_feat_O, content_img_ph_O, stylized_images_O = importer.import_graph_def(
                graph_def=trt_graph,
                return_elements=[
                    "Conv/BiasAdd", "content_img_ph",
                    "transformer/expand/conv3/conv/Sigmoid"
                ])
            bottleneck_feat_O = bottleneck_feat_O.outputs[0]
            content_img_ph_O = content_img_ph_O.outputs[0]
            stylized_images_O = stylized_images_O.outputs[0]

            print("bottleneck opt:" + str(bottleneck_feat_O))
            print(content_img_ph_O)
            print(stylized_images_O)

        # Gets the list of the input style images.
        #style_img_list = tf.gfile.Glob(FLAGS.style_images_paths)
        # if len(style_img_list) > FLAGS.maximum_styles_to_evaluate:
        #    np.random.seed(1234)
        #    style_img_list = np.random.permutation(style_img_list)
        #    style_img_list = style_img_list[:FLAGS.maximum_styles_to_evaluate]

        # Gets the list of input content images.
        # content_img_list = tf.gfile.Glob(FLAGS.content_images_paths)

        # if style_i % 10 == 0:
        # tf.logging.info('Stylizing  %s with (%d) %s' %
        #                        ( content_img_name, style_i,
        #                         style_img_name))

        # for style_i, style_img_path in enumerate(style_img_list):
        # if style_i > FLAGS.maximum_styles_to_evaluate:
        #    break
        interpolation_weight = FLAGS.interpolation_weight
        activate_style = None

        while True:
            start = timer()
            #calculating style isn't the major FPS bottleneck
            current_style = Style.objects.filter(is_active=True).first()
            if (activate_style != current_style):
                activate_style = current_style
                style_img_path = activate_style.source_file.path
                print("current image is " + style_img_path)
                style_img_name = "bricks"
                style_image_np = image_utils.load_np_image_uint8(
                    style_img_path)[:, :, :3]
                style_image_np = cv2.resize(style_image_np, (1200, 200))

                # Saves preprocessed style image.
                style_img_croped_resized_np = sess.run(
                    style_img_preprocessed,
                    feed_dict={style_img_ph: style_image_np})
                #image_utils.save_np_image(style_img_croped_resized_np,
                #                          os.path.join(FLAGS.output_dir,
                #                                       '%s.jpg' % (style_img_name)))

                # Computes bottleneck features of the style prediction network for the
                # given style image.
                style_params = sess.run(
                    bottleneck_feat, feed_dict={style_img_ph: style_image_np})

            # for content_i, content_img_path in enumerate(content_img_list):
            ret, frame = cap.read()
            print("webcam image: " + str(frame.shape))
            #crop to get the weird 1200x200 format
            content_img_np = frame[500:700, 80:1280]
            #content_img_np = frame
            print("cropped image:" + str(content_img_np.shape))
            # content_img_np = image_utils.load_np_image_uint8(content_img_path)[:, :, :
            #                                                                        3]

            # content_img_name = os.path.basename(content_img_path)[:-4]
            content_img_name = "webcam"

            # Saves preprocessed content image.
            print("Input image:" + str(content_img_np.shape))
            inp_img_croped_resized_np = sess.run(
                content_img_preprocessed,
                feed_dict={content_img_ph: content_img_np})
            # image_utils.save_np_image(inp_img_croped_resized_np,
            #                          os.path.join(FLAGS.output_dir,
            #                                       '%s.jpg' % (content_img_name)))

            # Computes bottleneck features of the style prediction network for the
            # identity transform.
            identity_params = sess.run(
                bottleneck_feat, feed_dict={style_img_ph: content_img_np})

            # Interpolates between the parameters of the identity transform and
            # style parameters of the given style image.
            wi = interpolation_weight
            style_np = identity_params * (1 - wi) + style_params * wi
            if FLAGS.tensorrt:
                style_np = np.reshape(style_np, (1, 100, 1, 1))

                stylized_image_res = sess.run(stylized_images_O,
                                              feed_dict={
                                                  bottleneck_feat_O: style_np,
                                                  content_img_ph_O:
                                                  content_img_np
                                              })
            else:
                stylized_image_res = sess.run(stylized_images,
                                              feed_dict={
                                                  bottleneck_feat: style_np,
                                                  content_img_ph:
                                                  content_img_np
                                              })

            end = timer()
            print(end - start)
            print(stylized_image_res.shape)
            # Saves stylized image.
            # image_utils.save_np_image(
            #  stylized_image_res,
            #  os.path.join(FLAGS.output_dir, '%s_stylized_%s_%d.jpg' %
            #               (content_img_name, style_img_name, interp_i)))
            display_np_image(stylized_image_res, FLAGS.showFullScreen)
            print(stylized_image_res.shape)
            # if cv2.waitKey(1) & 0xFF == ord('q'):
            #  break
            #img_out = np.squeeze(stylized_image_res).astype(np.uint8)
            #img_out = cv2.cvtColor(img_out, cv2.COLOR_BGR2RGB)
            #cv2.imshow('frame', img_out)

            key = cv2.waitKey(10)
            print("Key " + str(key))
            if key == 27:
                break
            elif key == 192:
                FLAGS.showFullScreen = False
                cv2.setWindowProperty("window", cv2.WND_PROP_FULLSCREEN,
                                      cv2.WINDOW_NORMAL)
            elif (key == 233 or key == 193):
                FLAGS.showFullScreen = True
                cv2.setWindowProperty("window", cv2.WND_PROP_FULLSCREEN,
                                      cv2.WINDOW_FULLSCREEN)
            elif key == 60:  # less
                interpolation_weight -= 0.25
            elif key == 62:  # > more
                interpolation_weight += 0.25

            #if cv2.waitKey(1) & 0xFF == ord('q'):
            #    break

    cap.release()
    cv2.destroyAllWindows()
Example #14
    def __init__(self,
                 master=None,
                 num_cores=0,
                 log_device_placement=False,
                 gpu_memory_fraction=1,
                 tf_random_seed=None,
                 save_summary_steps=100,
                 save_checkpoints_secs=_USE_DEFAULT,
                 save_checkpoints_steps=None,
                 keep_checkpoint_max=5,
                 keep_checkpoint_every_n_hours=10000,
                 evaluation_master='',
                 model_dir=None,
                 session_config=None):
        """Constructor.

    Note that the superclass `ClusterConfig` may set properties like
    `cluster_spec`, `is_chief`, `master` (if `None` in the args),
    `num_ps_replicas`, `task_id`, and `task_type` based on the `TF_CONFIG`
    environment variable. See `ClusterConfig` for more details.

    Args:
      master: TensorFlow master. Defaults to empty string for local.
      num_cores: Number of cores to be used. If 0, the system picks an
        appropriate number (default: 0).
      log_device_placement: Log the op placement to devices (default: False).
      gpu_memory_fraction: Fraction of GPU memory used by the process on
        each GPU uniformly on the same machine.
      tf_random_seed: Random seed for TensorFlow initializers.
        Setting this value allows consistency between reruns.
      save_summary_steps: Save summaries every this many steps.
      save_checkpoints_secs: Save checkpoints every this many seconds. Can not
          be specified with `save_checkpoints_steps`.
      save_checkpoints_steps: Save checkpoints every this many steps. Can not be
          specified with `save_checkpoints_secs`.
      keep_checkpoint_max: The maximum number of recent checkpoint files to
        keep. As new files are created, older files are deleted. If None or 0,
        all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
        checkpoint files are kept.)
      keep_checkpoint_every_n_hours: Number of hours between each checkpoint
        to be saved. The default value of 10,000 hours effectively disables
        the feature.
      evaluation_master: the master on which to perform evaluation.
      model_dir: directory where model parameters, graph etc are saved. If
        `None`, see `Estimator` about where the model will be saved.
      session_config: a ConfigProto used to set session parameters, or None.
         Note - using this argument, it is easy to provide settings which break
         otherwise perfectly good models. Use with care.
    """
        super(RunConfig, self).__init__(master=master,
                                        evaluation_master=evaluation_master)

        gpu_options = config_pb2.GPUOptions(
            per_process_gpu_memory_fraction=gpu_memory_fraction)
        self._tf_config = config_pb2.ConfigProto(
            log_device_placement=log_device_placement,
            inter_op_parallelism_threads=num_cores,
            intra_op_parallelism_threads=num_cores,
            gpu_options=gpu_options)

        self._tf_random_seed = tf_random_seed
        self._save_summary_steps = save_summary_steps
        self._save_checkpoints_secs = save_checkpoints_secs
        self._session_config = session_config
        if save_checkpoints_secs == RunConfig._USE_DEFAULT:
            if save_checkpoints_steps is None:
                self._save_checkpoints_secs = 600
            else:
                self._save_checkpoints_secs = None
        self._save_checkpoints_steps = save_checkpoints_steps

        # TODO(weiho): Remove these after ModelFn refactoring, when users can
        # create Scaffold and Saver in their model_fn to set these.
        self._keep_checkpoint_max = keep_checkpoint_max
        self._keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours
        self._model_dir = model_dir
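
A minimal construction sketch for the class documented above; the argument values (and the model directory) are illustrative, and the resulting object is what estimators in the same library accept through their `config` argument:

run_config = RunConfig(
    gpu_memory_fraction=0.5,    # cap each GPU at half of its memory
    num_cores=4,                # used for inter-/intra-op parallelism threads
    save_checkpoints_secs=300,
    keep_checkpoint_max=3,
    model_dir='/tmp/example_model')  # hypothetical path
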
Example #15
    def common_run_context(self, var_list, opt_list, name):
        batch_size = 2
        sample_length = 3
        emb_domain_list = list()
        tws = list()

        cluster = ps_worker_cluster(ps_num=2)
        ps_servers, worker_servers, cluster_def = cluster

        config = config_pb2.ConfigProto(
            cluster_def=cluster_def,
            experimental=config_pb2.ConfigProto.Experimental(
                share_session_state_in_clusterspec_propagation=True, ),
            allow_soft_placement=False,
            inter_op_parallelism_threads=2,
            intra_op_parallelism_threads=2,
            gpu_options=config_pb2.GPUOptions(allow_growth=True),
        )

        dev_placement = device_setter.replica_device_setter(
            ps_tasks=2,
            ps_device='/job:ps',
            worker_device='/job:worker',
            cluster=cluster_def,
        )

        with ops.device(dev_placement):
            shared_var_0 = deo.get_variable('distributed_sp_var_0',
                                            initializer=0.0,
                                            devices=['/job:worker/task:0'],
                                            dim=8)
            shared_var_1 = deo.get_variable('distributed_sp_var_1',
                                            initializer=0.0,
                                            devices=['/job:worker/task:0'],
                                            dim=4)
            opt_list = get_multiple_optimizers()

            distributed_var_list = [shared_var_0, shared_var_1]
            for _v in distributed_var_list:
                ids = random_ops.random_uniform((batch_size, sample_length),
                                                maxval=1000000,
                                                dtype=_v.key_dtype)
                ids = array_ops.reshape(ids, (-1, ))

                _, tw = deo.embedding_lookup(_v, ids, return_trainable=True)
                tws.append(tw)
                _collapse = array_ops.reshape(tw, (batch_size, -1))
                _logits = math_ops.reduce_sum(_collapse, axis=1)
                _logits = math_ops.cast(_logits, dtypes.float32)
                emb_domain_list.append(_logits)
            logits = math_ops.add_n(emb_domain_list)

            labels = array_ops.zeros((batch_size, ), dtype=dtypes.float32)
            loss = math_ops.reduce_mean(
                nn_impl.sigmoid_cross_entropy_with_logits(
                    logits=logits,
                    labels=labels,
                ))

            _train_ops = list()
            for _opt in opt_list:
                _train_ops.append(_opt.minimize(loss))
            train_op = control_flow_ops.group(_train_ops)

            restrictor = dvr.VariableRestrictor(var_list=distributed_var_list,
                                                optimizer_list=opt_list)
            update_op = restrictor.update()
            threshold = int(batch_size * sample_length * 1.5)
            factor = 1.2
            restrict_op = restrictor.restrict(threshold=threshold,
                                              factor=factor)

        policies = list(itertools.chain(*restrictor.policy_group.values()))
        tstp_vars = [policy.tstp_var for policy in policies]
        slot_vars = list()
        for tw in tws:
            for opt in opt_list:
                slot_vars += select_slot_vars(tw, opt)

        with session.Session(worker_servers[0].target, config=config) as sess:
            sess.run(variables.global_variables_initializer())
            n, MAX_ITER = 0, 1000
            while n < MAX_ITER:
                sess.run([train_op, update_op])
                if all(
                        sess.run(var.size()) > threshold * factor
                        for var in distributed_var_list):
                    break

            s1 = sess.run([var.size() for var in distributed_var_list])
            s2 = sess.run([tv.size() for tv in tstp_vars])
            s3 = sess.run([sv.size() for sv in slot_vars])

            self.assertAllGreater(s1, threshold * factor)
            self.assertAllGreater(s2, threshold * factor)
            if s3:
                self.assertAllGreater(s3, threshold * factor)

            sess.run(restrict_op)
            s1 = sess.run([var.size() for var in distributed_var_list])
            s2 = sess.run([tv.size() for tv in tstp_vars])
            s3 = sess.run([sv.size() for sv in slot_vars])

            self.assertAllLess(s1, threshold * factor + 1)
            self.assertAllLess(s2, threshold * factor + 1)
            if s3:
                self.assertAllLess(s3, threshold * factor + 1)
            sess.close()
Example #16
import tensorflow as tf
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import device_lib

virtual_device_gpu_options = config_pb2.GPUOptions(
    visible_device_list='0',
    experimental=config_pb2.GPUOptions.Experimental(virtual_devices=[
        config_pb2.GPUOptions.Experimental.VirtualDevices(
            memory_limit_mb=[200, 300])
    ]))
config = config_pb2.ConfigProto(gpu_options=virtual_device_gpu_options)

device_lib.list_local_devices(session_config=config)

with tf.Session(config=config) as sess:
    with tf.device('/gpu:1'):
        result = sess.run(tf.constant(42))
Example #17
    def common_minimize_trainable(self, base_opt, test_opt, name):
        base_opt = de.DynamicEmbeddingOptimizer(base_opt)
        test_opt = de.DynamicEmbeddingOptimizer(test_opt)
        id = 0
        config = config_pb2.ConfigProto(
            allow_soft_placement=True,
            gpu_options=config_pb2.GPUOptions(allow_growth=True),
        )
        for (
                num_shards,
                k_dtype,
                d_dtype,
                initial_mode,
                dim,
                run_step,
        ) in itertools.product(
            [1, 2],
            [dtypes.int64],
            [
                dtypes.float32,
            ],
            [
                "constant",
            ],
            [1, 10],
            [10],
        ):
            with self.session(config=config,
                              use_gpu=test_util.is_gpu_available()):
                id += 1
                raw_init_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
                raw_init_vals = [
                    [
                        x,
                    ] * dim for x in
                    [0.0, 0.1, 0.3, 0.8, 0.16, 0.25, 0.36, 0.49, 0.64, 0.81]
                ]
                raw_ids = constant_op.constant([1, 3, 3, 9], dtype=k_dtype)
                sp_ids = sparse_tensor.SparseTensor(
                    indices=[
                        [0, 0],
                        [0, 1],
                        [1, 0],
                        [2, 1],
                    ],
                    values=raw_ids,
                    dense_shape=[3, 2],
                )
                x = constant_op.constant([[_x * dim]
                                          for _x in [[0.4], [0.5], [0.6]]],
                                         dtype=d_dtype)
                x = array_ops.reshape(x, shape=(3 * dim, 1))
                # base var prepare
                base_var = variables.Variable(
                    np.array(raw_init_vals).reshape([len(raw_init_ids), dim]),
                    dtype=d_dtype,
                    shape=[len(raw_init_ids), dim],
                )
                base_embedding = embedding_ops.safe_embedding_lookup_sparse(
                    base_var, sp_ids, None, combiner="sum")
                base_embedding = array_ops.reshape(base_embedding,
                                                   shape=[1, 3 * dim])
                pred0 = math_ops.matmul(base_embedding, x)
                loss0 = pred0 * pred0

                base_opt_op = base_opt.minimize(loss0, var_list=[base_var])

                # test var prepare
                embeddings = de.get_variable(
                    "s6030-" + name + str(id),
                    key_dtype=k_dtype,
                    value_dtype=d_dtype,
                    devices=_get_devices() * num_shards,
                    initializer=1.0,
                    dim=dim,
                )
                self.device_check(embeddings)

                init_ids = constant_op.constant(raw_init_ids, dtype=k_dtype)
                init_vals = constant_op.constant(raw_init_vals, dtype=d_dtype)
                init_op = embeddings.upsert(init_ids, init_vals)
                self.evaluate(init_op)

                # test branch
                test_var, trainable = de.safe_embedding_lookup_sparse(
                    embeddings,
                    sp_ids,
                    sparse_weights=None,
                    combiner="sum",
                    return_trainable=True,
                )

                pred1 = math_ops.matmul(
                    array_ops.reshape(test_var, shape=[1, 3 * dim]), x)
                loss1 = pred1 * pred1
                test_opt_op = test_opt.minimize(loss1, var_list=[trainable])

                self.evaluate(variables.global_variables_initializer())

                self.assertAllCloseAccordingToType(
                    np.array(raw_init_vals).reshape([len(raw_init_ids), dim]),
                    self.evaluate(base_var),
                )

                # run base
                for _ in range(run_step):
                    self.evaluate(base_opt_op)

                # Run `run_step` step of sgd
                for _ in range(run_step):
                    self.evaluate(test_opt_op)

                table_var = array_ops.reshape(embeddings.lookup(init_ids),
                                              shape=[10, dim])
                # Validate updated params
                self.assertAllCloseAccordingToType(
                    self.evaluate(base_var),
                    self.evaluate(table_var),
                    msg="Cond:{},{},{},{},{}".format(num_shards, k_dtype,
                                                     d_dtype, dim, run_step),
                )
Example #18
    def __init__(self,
                 master=None,
                 num_cores=0,
                 log_device_placement=False,
                 gpu_memory_fraction=1,
                 tf_random_seed=None,
                 save_summary_steps=100,
                 save_checkpoints_secs=_USE_DEFAULT,
                 save_checkpoints_steps=None,
                 keep_checkpoint_max=5,
                 keep_checkpoint_every_n_hours=10000,
                 log_step_count_steps=100,
                 protocol=None,
                 evaluation_master='',
                 model_dir=None,
                 session_config=None):
        """Constructor.

    The superclass `ClusterConfig` may set properties like `cluster_spec`,
    `is_chief`, `master` (if `None` in the args), `num_ps_replicas`, `task_id`,
    and `task_type` based on the `TF_CONFIG` environment variable. See
    `ClusterConfig` for more details.

    N.B.: If `save_checkpoints_steps` or `save_checkpoints_secs` is set,
    `keep_checkpoint_max` might need to be adjusted accordingly, especially in
    distributed training. For example, setting `save_checkpoints_secs` to 60
    without adjusting `keep_checkpoint_max` (defaults to 5) leads to a situation
    where checkpoints are garbage collected after 5 minutes. In distributed
    training, the evaluation job starts asynchronously and might fail to load
    or find the checkpoint due to a race condition.

    Args:
      master: TensorFlow master. Defaults to empty string for local.
      num_cores: Number of cores to be used. If 0, the system picks an
        appropriate number (default: 0).
      log_device_placement: Log the op placement to devices (default: False).
      gpu_memory_fraction: Fraction of GPU memory used by the process on
        each GPU uniformly on the same machine.
      tf_random_seed: Random seed for TensorFlow initializers.
        Setting this value allows consistency between reruns.
      save_summary_steps: Save summaries every this many steps.
      save_checkpoints_secs: Save checkpoints every this many seconds. Can not
          be specified with `save_checkpoints_steps`.
      save_checkpoints_steps: Save checkpoints every this many steps. Can not be
          specified with `save_checkpoints_secs`.
      keep_checkpoint_max: The maximum number of recent checkpoint files to
        keep. As new files are created, older files are deleted. If None or 0,
        all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
        checkpoint files are kept.)
      keep_checkpoint_every_n_hours: Number of hours between each checkpoint
        to be saved. The default value of 10,000 hours effectively disables
        the feature.
      log_step_count_steps: The frequency, in number of global steps, that the
        global step/sec will be logged during training.
      evaluation_master: the master on which to perform evaluation.
      model_dir: directory where model parameters, graph etc are saved. If
        `None`, will use `model_dir` property in `TF_CONFIG` environment
        variable. If both are set, must have same value. If both are `None`, see
        `Estimator` about where the model will be saved.
      session_config: a ConfigProto used to set session parameters, or None.
        Note - using this argument, it is easy to provide settings which break
        otherwise perfectly good models. Use with care.
      protocol: An optional argument which specifies the protocol used when
        starting server. None means default to grpc.
    """
        # Neither parent class calls super().__init__(), so here we have to
        # manually call their __init__() methods.
        ClusterConfig.__init__(self,
                               master=master,
                               evaluation_master=evaluation_master)
        # For too long this code didn't call:
        #   core_run_config.RunConfig.__init__(self)
        # so instead of breaking compatibility with that assumption, we
        # just manually initialize this field:
        self._train_distribute = None
        self._eval_distribute = None
        self._experimental_max_worker_delay_secs = None
        self._device_fn = None

        gpu_options = config_pb2.GPUOptions(
            per_process_gpu_memory_fraction=gpu_memory_fraction)
        self._tf_config = config_pb2.ConfigProto(
            log_device_placement=log_device_placement,
            inter_op_parallelism_threads=num_cores,
            intra_op_parallelism_threads=num_cores,
            gpu_options=gpu_options)

        self._tf_random_seed = tf_random_seed
        self._save_summary_steps = save_summary_steps
        self._save_checkpoints_secs = save_checkpoints_secs
        self._log_step_count_steps = log_step_count_steps
        self._protocol = protocol
        self._session_config = session_config
        if save_checkpoints_secs == RunConfig._USE_DEFAULT:
            if save_checkpoints_steps is None:
                self._save_checkpoints_secs = 600
            else:
                self._save_checkpoints_secs = None
        self._save_checkpoints_steps = save_checkpoints_steps

        # TODO(weiho): Remove these after ModelFn refactoring, when users can
        # create Scaffold and Saver in their model_fn to set these.
        self._keep_checkpoint_max = keep_checkpoint_max
        self._keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours
        self._model_dir = _get_model_dir(model_dir)
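
Compared with Example #14, this constructor also exposes `log_step_count_steps`, `protocol`, and the `session_config` override. A minimal hedged sketch of supplying a custom session configuration:

session_config = config_pb2.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=False)
run_config = RunConfig(
    log_step_count_steps=50,
    session_config=session_config)  # used by estimators when creating sessions
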
Example #19
    def common_minimize_trainable(self, base_opt, test_opt, name):
        if test_util.is_gpu_available():
            keys_type_list = [dtypes.int64]
        else:
            keys_type_list = [dtypes.int64, dtypes.string]
        deo.enable_train_mode()
        config = config_pb2.ConfigProto(
            allow_soft_placement=True,
            gpu_options=config_pb2.GPUOptions(allow_growth=True))
        for run_id, num_shards, k_dtype, d_dtype, initial_mode, dim, run_step \
            in _next_run_step_config(keys_type_list):
            with self.session(config=config,
                              use_gpu=test_util.is_gpu_available()):
                raw_init_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
                if k_dtype == dtypes.string:
                    raw_init_ids = [str(i) for i in raw_init_ids]

                raw_init_vals = [
                    [
                        x,
                    ] * dim for x in
                    [0.0, 0.1, 0.3, 0.8, 0.16, 0.25, 0.36, 0.49, 0.64, 0.81]
                ]
                raw_ids_py = [1, 3, 3, 9]
                raw_ids_nn = constant_op.constant(raw_ids_py,
                                                  dtype=dtypes.int64)
                sp_ids_nn = sparse_tensor.SparseTensor(indices=[
                    [0, 0],
                    [0, 1],
                    [1, 0],
                    [2, 1],
                ],
                                                       values=raw_ids_nn,
                                                       dense_shape=[3, 2])
                if k_dtype != dtypes.string:
                    raw_ids_de = raw_ids_nn
                else:
                    raw_ids_de = constant_op.constant(
                        [str(i) for i in raw_ids_py], dtype=k_dtype)

                sp_ids_de = sparse_tensor.SparseTensor(indices=[
                    [0, 0],
                    [0, 1],
                    [1, 0],
                    [2, 1],
                ],
                                                       values=raw_ids_de,
                                                       dense_shape=[3, 2])
                x = constant_op.constant([[_x * dim]
                                          for _x in [[0.4], [0.5], [0.6]]],
                                         dtype=d_dtype)
                x = array_ops.reshape(x, shape=(3 * dim, 1))
                # base var prepare
                base_var = variables.Variable(np.array(raw_init_vals).reshape(
                    [len(raw_init_ids), dim]),
                                              dtype=d_dtype,
                                              shape=[len(raw_init_ids), dim])
                base_embedding = embedding_ops.safe_embedding_lookup_sparse(
                    base_var, sp_ids_nn, None, combiner='sum')
                base_embedding = array_ops.reshape(base_embedding,
                                                   shape=[1, 3 * dim])
                pred0 = math_ops.matmul(base_embedding, x)
                loss0 = pred0 * pred0

                base_opt_op = base_opt.minimize(loss0, var_list=[base_var])

                # test var prepare
                embeddings = deo.get_variable('s6030-' + name + str(run_id),
                                              key_dtype=k_dtype,
                                              value_dtype=d_dtype,
                                              devices=_get_devices() *
                                              num_shards,
                                              initializer=1.,
                                              dim=dim)
                self.device_check(embeddings)

                init_ids = constant_op.constant(raw_init_ids, dtype=k_dtype)
                init_vals = constant_op.constant(raw_init_vals, dtype=d_dtype)
                init_op = embeddings.upsert(init_ids, init_vals)
                self.evaluate(init_op)

                # test branch
                test_var, trainable = deo.safe_embedding_lookup_sparse(
                    embeddings,
                    sp_ids_de,
                    sparse_weights=None,
                    combiner="sum",
                    return_trainable=True)

                pred1 = math_ops.matmul(
                    array_ops.reshape(test_var, shape=[1, 3 * dim]), x)
                loss1 = pred1 * pred1
                test_opt_op = test_opt.minimize(loss1, var_list=[trainable])

                self.evaluate(variables.global_variables_initializer())

                self.assertAllCloseAccordingToType(
                    np.array(raw_init_vals).reshape([len(raw_init_ids), dim]),
                    self.evaluate(base_var))

                # run base
                for _ in range(run_step):
                    self.evaluate(base_opt_op)

                # Run `run_step` step of sgd
                for _ in range(run_step):
                    self.evaluate(test_opt_op)

                table_var = array_ops.reshape(embeddings.lookup(init_ids),
                                              shape=[10, dim])
                # Validate updated params
                self.assertAllCloseAccordingToType(
                    self.evaluate(base_var),
                    self.evaluate(table_var),
                    msg="Cond:{},{},{},{},{}".format(num_shards, k_dtype,
                                                     d_dtype, dim, run_step))