Example #1
  def __call__(self, getter, name, *args, **kwargs):
    staging_ops = self.variable_mgr.staging_vars_on_devices[self.device_num]
    if name in staging_ops:
      put_op, get_op = staging_ops[name]
      return get_op
    real_var = getter(name, *args, **kwargs)
    shape = kwargs['shape']
    dtype = kwargs['dtype']
    trainable = kwargs['trainable']
    if self.cpu_device:
      with tf.device(self.cpu_device):
        # This helps copy the weights from the parameter server to this
        # replica only once.
        if name in self.variable_mgr.staged_vars_on_cpu:
          cpu_var = self.variable_mgr.staged_vars_on_cpu[name]
        else:
          cpu_var = tf.identity(real_var)
          self.variable_mgr.staged_vars_on_cpu[name] = cpu_var
      var_to_stage = cpu_var
    else:
      var_to_stage = tf.identity(real_var)  # de-reference the variable.

    with tf.device(self.devices[self.device_num]):
      staging_area = data_flow_ops.StagingArea([dtype], shapes=[shape])
      put_op = staging_area.put([var_to_stage])
      get_op = staging_area.get()[0]
      staging_ops[name] = (put_op, get_op)
    if trainable:
      # Trainable variables are managed separately through
      # apply_gradients.
      return get_op
    else:
      # For other shadow variables, the access is decoupled through a wrapper
      # class.
      return StagedModelVariable(real_var, get_op, self.variable_mgr)
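A note on usage: getters with this __call__(self, getter, name, *args, **kwargs) signature plug into TF1's custom_getter mechanism. A minimal sketch of the wiring, where the StagedVariableGetter name and its constructor arguments are assumptions (the real __init__ is not shown above):

import tensorflow as tf

# Hypothetical construction; only the __call__ above is known.
getter = StagedVariableGetter(variable_mgr, devices, device_num=0,
                              cpu_device='/cpu:0')
with tf.variable_scope('v0', custom_getter=getter):
    # tf.get_variable now routes through getter.__call__, which stages the
    # variable and returns the StagingArea get op in its place.
    w = tf.get_variable('w', shape=[128, 128], dtype=tf.float32,
                        trainable=True)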
Example #2
def _defer_tensor(tensor):
    """Defers the retrieval of a tensor.

  The tensor is put into a StagingArea, and the return value is the
  retrieval of the tensor from the StagingArea. The effect is that the
  tensor returned from this function is the tensor that was put in the
  StagingArea for the previous Session.run() call.

  Args:
    tensor: The tensor to defer for one step.

  Returns:
    deferred_tensor: The tensor deferred for one step.
    put_op: An op to put `tensor` in the StagingArea. Must be run every step
      that `deferred_tensor` is run.
    warmup_op: A warmup op that should be called before the first step. Puts
      a zero tensor into the StagingArea.
  """
    tensor_stage = data_flow_ops.StagingArea([tensor.dtype], [tensor.shape])
    put_op = tensor_stage.put([tensor])
    warmup_op = tensor_stage.put([tf.zeros(tensor.shape, dtype=tensor.dtype)])

    # Fetch the next tensor to use.
    (tensor, ) = tensor_stage.get()
    return tensor, put_op, warmup_op
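A minimal sketch of how _defer_tensor might be driven from a TF1 session (the placeholder and loop are assumptions): run warmup_op once to fill the StagingArea, then fetch put_op together with the deferred tensor on every step.

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[2])
deferred_x, put_op, warmup_op = _defer_tensor(x)

with tf.Session() as sess:
    sess.run(warmup_op)  # stage a zero tensor before the first step
    for step in range(3):
        # deferred_x evaluates to the value staged on the previous run.
        val, _ = sess.run([deferred_x, put_op],
                          feed_dict={x: [step, step]})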
Example #3
    def testDictionary(self):
        with ops.Graph().as_default() as G:
            with ops.device('/cpu:0'):
                x = array_ops.placeholder(dtypes.float32)
                v = 2. * (array_ops.zeros([128, 128]) + x)
            with ops.device(test.gpu_device_name()):
                stager = data_flow_ops.StagingArea(
                    [dtypes.float32, dtypes.float32],
                    shapes=[[], [128, 128]],
                    names=['x', 'v'])
                stage = stager.put({'x': x, 'v': v})
                ret = stager.get()
                z = ret['x']
                y = ret['v']
                y = math_ops.reduce_max(z * math_ops.matmul(y, y))

        G.finalize()

        with self.session(use_gpu=True, graph=G) as sess:
            sess.run(stage, feed_dict={x: -1})
            for i in range(10):
                _, yval = sess.run([stage, y], feed_dict={x: i})
                self.assertAllClose(4 * (i - 1) * (i - 1) * (i - 1) * 128,
                                    yval,
                                    rtol=1e-4)
Example #4
  def testCapacity(self):
    capacity = 3

    with ops.Graph().as_default() as G:
      with ops.device('/cpu:0'):
        x = array_ops.placeholder(dtypes.int32, name='x')
      with ops.device(test.gpu_device_name()):
        stager = data_flow_ops.StagingArea(
            [
                dtypes.int32,
            ], capacity=capacity, shapes=[[]])
        stage = stager.put([x])
        ret = stager.get()
        size = stager.size()

    G.finalize()

    from six.moves import queue as Queue
    import threading

    queue = Queue.Queue()
    n = 8

    with self.test_session(use_gpu=True, graph=G) as sess:
      # Stage data in a separate thread which will block
      # when it hits the staging area's capacity and thus
      # not fill the queue with n tokens
      def thread_run():
        for i in range(n):
          sess.run(stage, feed_dict={x: i})
          queue.put(0)

      t = threading.Thread(target=thread_run)
      t.daemon = True
      t.start()

      # Get tokens from the queue until a timeout occurs
      try:
        for i in range(n):
          queue.get(timeout=TIMEOUT)
      except Queue.Empty:
        pass

      # Should've timed out on the iteration 'capacity'
      if i != capacity:
        self.fail("Expected to timeout on iteration '{}' "
                  "but instead timed out on iteration '{}' "
                  "Staging Area size is '{}' and configured "
                  "capacity is '{}'.".format(capacity, i, sess.run(size),
                                             capacity))

      # Should have capacity elements in the staging area
      self.assertTrue(sess.run(size) == capacity)

      # Clear the staging area completely
      for i in range(n):
        self.assertTrue(sess.run(ret) == [i])

      # It should now be empty
      self.assertTrue(sess.run(size) == 0)
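The blocking behavior this test exercises can be shown outside a test harness; a minimal sketch, with the placeholder and loop as assumptions:

import tensorflow as tf
from tensorflow.python.ops import data_flow_ops

x = tf.placeholder(tf.int32, shape=[])
area = data_flow_ops.StagingArea([tf.int32], capacity=3, shapes=[[]])
put = area.put([x])
get = area.get()

with tf.Session() as sess:
    for i in range(3):
        sess.run(put, feed_dict={x: i})  # fills the area to capacity
    # A fourth put would block until a get() frees a slot:
    sess.run(get)
    sess.run(put, feed_dict={x: 3})  # now succeeds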
Example #5
    def assign_sub(self, delta, name=None):
        """Mimic the updates to the variable.

    Args:
      delta: is pushed into a staging buffer and will be pumped later.
      name: currently ignored; names of ops and the StagingArea are
            computed without using this pass name.
    Returns:
      The actual updates. The colocation constraint will be reapplied.
    """
        # This parameter is ignored: the StagingArea only supports setting
        # the shared name, not the names of individual ops it uses.
        del name

        # colocate_with(None, True) clears the colocation constraints.
        # Push the delta into a staging buffer.
        with ops.colocate_with(None,
                               True), tf.device(self.var_stage_get.device):
            delta_staging_area = data_flow_ops.StagingArea(
                [self.var_stage_get.dtype], shapes=[self.var_stage_get.shape])
            delta_put_op = delta_staging_area.put([delta])
            self.variable_mgr.staging_delta_ops.append(delta_put_op)
            delta_get_op = delta_staging_area.get()[0]
        # Return the actual updates. The colocation constraint will be reapplied.
        return self.real_var.assign_sub(delta_get_op)
Example #6
        def slice_batch(x, ngpus, part, dev):
            '''Divide the input batch into [ngpus] slices along axis 0 and
            return slice no. [part]. E.g. if len(x) == 10 and ngpus == 2,
            then part == 1 yields x[5:].
            '''
            sh = KB.shape(x)
            L = sh[0] // ngpus
            if part == ngpus - 1:
                xslice = x[part * L:]
            else:
                xslice = x[part * L:(part + 1) * L]

            # tf.split fails if batch size is not divisible by ngpus. Error:
            #     InvalidArgumentError (see above for traceback): Number of
            #         ways to split should evenly divide the split dimension
            # xslice = tf.split(x, ngpus)[part]

            if not self._enqueue:
                return xslice

            # Did not see any benefit.
            with tf.device(dev):
                # if self._stager is None:
                stager = data_flow_ops.StagingArea(dtypes=[xslice.dtype],
                                                   shapes=[xslice.shape])
                stage = stager.put([xslice])
                gpucopy_ops.append(stage)
                # xslice_stage = stager.get()
            return stager.get()
Example #7
    def __call__(self, getter, name, *args, **kwargs):
        name_split = name.split('/', 2)
        worker_index = int(name_split[1].split('_')[1])
        gpu_index = int(name_split[1].split('_')[2])
        name_without_tower = name_split[0] + '/' + name_split[2]

        if (name_without_tower in self._global_variable):
            global_var = self._global_variable[name_without_tower]
        else:
            min_size_device, _ = min(enumerate(self._local_sizes),
                                     key=operator.itemgetter(1))
            with tf.device(self._cpu_device[min_size_device]):
                global_var = getter(name_without_tower, *args, **kwargs)
            self._local_sizes[min_size_device] += global_var.get_shape(
            ).num_elements()
            self._global_variable[name_without_tower] = global_var

        if self._use_staging:
            shape = kwargs['shape']
            dtype = kwargs['dtype']
            with tf.name_scope("Benchmark_Net/Input_Staging/Staging"):
                staging_var = data_flow_ops.StagingArea([dtype], [shape])
                put_op = staging_var.put(tf.identity(global_var))
                get_op = staging_var.get()[0]
                self._staging_put_ops.append(put_op)
                self._local_variable[worker_index][gpu_index][
                    name_without_tower] = get_op
        else:
            self._local_variable[worker_index][gpu_index][
                name_without_tower] = global_var

        return self._local_variable[worker_index][gpu_index][
            name_without_tower]
Example #8
    def _defer_gradient(self, grad, tower_num):
        """Defers the retrieval of a gradient.

    The gradient is put into a StagingArea, and the return value is the
    retrieval of the gradient from the StagingArea. The effect is that the
    gradient returned from this function is the gradient computed from the
    previous step.

    The put op is put in self._gradient_put_ops[tower_num], which must be run
    every step. self._gradient_put_ops must be set to a list of lists before
    this function is run. A warmup op to fill the StagingArea with a zero
    gradient is added to self._warmup_ops, which must be run before the first
    step.

    Args:
      grad: The gradient tensor to defer for one step.
      tower_num: The tower that computed the gradient.

    Returns:
      The gradient, deferred for one step.
    """
        gradient_stage = data_flow_ops.StagingArea([grad.dtype], [grad.shape])

        # Push the gradient into the staging area.
        gradient_put_op = gradient_stage.put([grad])
        self._gradient_put_ops[tower_num].append(gradient_put_op)

        # Push an empty gradient into the staging area.
        warmup_op = gradient_stage.put(
            [tf.zeros(grad.shape, dtype=grad.dtype)])
        self._warmup_ops.append(warmup_op)

        # Fetch the next gradient to use.
        (grad, ) = gradient_stage.get()
        return grad
Example #9
    def testMemoryLimit(self):
        memory_limit = 512 * 1024  # 512K
        chunk = 200 * 1024  # 200K
        capacity = memory_limit // chunk

        with ops.Graph().as_default() as G:
            with ops.device('/cpu:0'):
                x = array_ops.placeholder(dtypes.uint8, name='x')
            with ops.device(test.gpu_device_name()):
                stager = data_flow_ops.StagingArea([
                    dtypes.uint8,
                ],
                                                   memory_limit=memory_limit,
                                                   shapes=[[]])
                stage = stager.put([x])
                ret = stager.get()
                size = stager.size()

        G.finalize()

        value_queue = queue.Queue()
        n = 8

        with self.session(graph=G) as sess:
            # Stage data in a separate thread which will block when it hits the
            # staging area's capacity and thus not fill the value_queue with n tokens
            def thread_run():
                for i in range(n):
                    sess.run(stage,
                             feed_dict={x: np.full(chunk, i, dtype=np.uint8)})
                    value_queue.put(0)

            t = threading.Thread(target=thread_run)
            t.daemon = True
            t.start()

            # Get tokens from the value_queue until a timeout occurs
            try:
                for i in range(n):
                    value_queue.get(timeout=TIMEOUT)
            except queue.Empty:
                pass

            # Should've timed out on the iteration 'capacity'
            if i != capacity:
                self.fail("Expected to timeout on iteration '{}' "
                          "but instead timed out on iteration '{}' "
                          "Staging Area size is '{}' and configured "
                          "capacity is '{}'.".format(capacity, i,
                                                     sess.run(size), capacity))

            # Should have capacity elements in the staging area
            self.assertTrue(sess.run(size) == capacity)

            # Clear the staging area completely
            for i in range(n):
                self.assertTrue(np.all(sess.run(ret)[0] == i))

            self.assertTrue(sess.run(size) == 0)
Example #10
  def testMemoryLimit(self):
    memory_limit = 512*1024  # 512K
    chunk = 200*1024  # 200K
    capacity = memory_limit // chunk

    with ops.device('/cpu:0'):
      x = array_ops.placeholder(dtypes.uint8, name='x')
    with ops.device(test.gpu_device_name()):
      stager = data_flow_ops.StagingArea([dtypes.uint8, ],
        memory_limit=memory_limit, shapes=[[]])
      stage = stager.put([x])
      ret = stager.get()
      size = stager.size()

    from six.moves import queue as Queue
    import threading
    import numpy as np

    queue = Queue.Queue()
    n = 5
    missed = 0

    with self.test_session(use_gpu=True) as sess:
      # Stage data in a separate thread which will block
      # when it hits the staging area's capacity and thus
      # not fill the queue with n tokens
      def thread_run():
        for i in range(n):
          sess.run(stage, feed_dict={x: np.full(chunk, i, dtype=np.uint8)})
          queue.put(0)

      t = threading.Thread(target=thread_run)
      t.start()

      # Get tokens from the queue, making notes of when we timeout
      for i in range(n):
        try:
          queue.get(timeout=0.05)
        except Queue.Empty:
          missed += 1

      # We timed out n - capacity times waiting for queue puts
      self.assertTrue(missed == n - capacity)

      # Clear the staging area out a bit
      for i in range(n - capacity):
        self.assertTrue(np.all(sess.run(ret)[0] == i))

      # Thread should be able to join now
      t.join()

      self.assertTrue(sess.run(size) == capacity)

      # Clear the staging area completely
      for i in range(capacity):
        self.assertTrue(np.all(sess.run(ret)[0] == i + (n - capacity)))

      self.assertTrue(sess.run(size) == 0)
Example #11
def input_fn(tf_glob,
             one_hot=True,
             classes=None,
             is_training=None,
             batch_shape=[32, 224, 224, 3],
             parallelism=1):
    """ Return tensor to read from TFRecord """
    print('Creating graph for loading %s TFRecords...' % tf_glob)
    with tf.variable_scope("TFRecords"):
        record_input = data_flow_ops.RecordInput(tf_glob,
                                                 batch_size=batch_shape[0],
                                                 parallelism=parallelism)
        records_op = record_input.get_yield_op()
        records_op = tf.split(records_op, batch_shape[0], 0)
        records_op = [tf.reshape(record, []) for record in records_op]
        progbar = Progbar(len(records_op))

        images = []
        labels = []
        for i, serialized_example in enumerate(records_op):
            progbar.update(i)
            with tf.variable_scope("parse_images", reuse=True):
                features = tf.parse_single_example(
                    serialized_example,
                    features={
                        'image': tf.FixedLenFeature([], tf.string),
                        'label': tf.FixedLenFeature([], tf.int64),
                    })
                image_decoded = tf.image.decode_jpeg(features['image'],
                                                     channels=3)
                image = tf.image.convert_image_dtype(image_decoded, tf.float32)
                resized_image = tf.image.resize_images(
                    image, [batch_shape[1], batch_shape[2]])
                label = tf.cast(features['label'], tf.int32)
                if one_hot and classes:
                    label = tf.one_hot(label, classes)

                images.append(resized_image)
                labels.append(label)

        images = tf.parallel_stack(images, 0)
        labels = tf.parallel_stack(labels, 0)
        #         images = tf.cast(images, tf.float32)

        #         images = tf.reshape(images, shape=batch_shape)

        # StagingArea will store tensors
        # across multiple steps to
        # speed up execution
        images_shape = images.get_shape()
        labels_shape = labels.get_shape()
        copy_stage = data_flow_ops.StagingArea(
            [tf.float32, tf.float32], shapes=[images_shape, labels_shape])
        copy_stage_op = copy_stage.put([images, labels])
        staged_images, staged_labels = copy_stage.get()

        # Note: the staged tensors are fetched but never returned; this
        # function returns the unstaged `images` and `labels`.
        return images, labels
Example #12
    def compute_gradient_and_apply(self, gradients_list, global_step,
                                   learning_rate, optimizer):

        if self._use_staging:
            input_staging_op = tf.group(self._staging_put_ops)
            gradients_put_op = []
            gradients_get_op = [list() for _ in self._gpu_devices]
            for index, gradients in enumerate(gradients_list):
                with tf.device(self._gpu_devices[index]), tf.name_scope(
                        "Gradient_Staging/Staging"):
                    if isinstance(gradients[0], ops.IndexedSlices):
                        gradients[0] = tf.convert_to_tensor(gradients[0])
                    dtypes = [g.dtype for g in gradients]
                    shapes = [g.shape for g in gradients]
                    staging_var = data_flow_ops.StagingArea(dtypes, shapes)
                    gradients_put_op.append(staging_var.put(gradients))
                    gradients_get_op[index] = staging_var.get()
            gradients_list = gradients_get_op

        with tf.name_scope('Gradient_Update'):
            global_varis = self.get_global_variable()
            # print(global_varis[0])
            # for gg in gradients_list:
            #     print(gg[0])
            #     gg[0] = tf.convert_to_tensor(gg[0])
            #     print(gg[0])
            #global_varis = global_varis[1:]
            print(len(global_varis))
            if self._num_gpus > 1:
                apply_list = []
                for g_v in zip(*gradients_list, global_varis):
                    grads = g_v[:self._num_gpus]
                    varis = g_v[self._num_gpus]
                    #print(varis)
                    # Some variables in BatchNorm do not have gradients.
                    if grads[0] is not None:
                        with tf.device(varis.device):
                            if isinstance(grads[0], ops.IndexedSlices):
                                print(grads)
                            #     grads = tf.convert_to_tensor(grads)
                            average_grad = tf.multiply(tf.add_n(grads),
                                                       1.0 / self._num_gpus)
                            apply = optimizer.apply_gradients([(average_grad,
                                                                varis)])
                            apply_list.append(apply)
                with tf.device(global_step.device):
                    apply_list.append(global_step.assign_add(1))
                apply_op = tf.group(apply_list)
            else:
                grads_and_varis = list(zip(gradients_list[0], global_varis))
                apply_op = optimizer.apply_gradients(grads_and_varis,
                                                     global_step)

        if self._use_staging:
            return [input_staging_op, gradients_put_op, apply_op]
        else:
            return [apply_op]
Example #13
  def testPeekBadIndex(self):
    stager = data_flow_ops.StagingArea([
        dtypes.int32,
    ], shapes=[[10]])
    stager.put([array_ops.zeros([10], dtype=dtypes.int32)])

    with self.assertRaisesRegex((ValueError, errors.InvalidArgumentError),
                                'must be scalar'):
      self.evaluate(stager.peek([]))
Example #14
def stage(tensors):
    """Stages the given tensors in a StagingArea for asynchronous put/get.
    """
    stage_area = data_flow_ops.StagingArea(
        dtypes=[tensor.dtype for tensor in tensors],
        shapes=[tensor.get_shape() for tensor in tensors])
    put_op = stage_area.put(tensors)
    get_tensors = stage_area.get()
    tf.add_to_collection('STAGING_AREA_PUTS', put_op)
    return put_op, get_tensors
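A sketch of how this helper is typically used in a TF1 training loop (the input tensors and the stand-in loss are assumptions): fetch put_op together with the consuming ops on every step so the StagingArea never runs dry.

import tensorflow as tf

images = tf.zeros([32, 224, 224, 3])
labels = tf.zeros([32], dtype=tf.int32)
put_op, (staged_images, staged_labels) = stage([images, labels])

loss = tf.reduce_mean(staged_images)  # stand-in for a real model
with tf.Session() as sess:
    sess.run(put_op)              # warm up: stage the first item
    for _ in range(10):
        sess.run([loss, put_op])  # consume one item, stage the next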
Example #15
def read_and_decode_recordinput(tf_glob, one_hot=True, classes=None, is_train=None,
                                batch_shape=[1000, 28, 28, 1], parallelism=1):
    """ Return tensor to read from TFRecord """
    print('Creating graph for loading %s TFRecords...' % tf_glob)
    with tf.variable_scope("TFRecords"):
        record_input = data_flow_ops.RecordInput(
            tf_glob, batch_size=batch_shape[0], parallelism=parallelism)
        records_op = record_input.get_yield_op()
        records_op = tf.split(records_op, batch_shape[0], 0)
        records_op = [tf.reshape(record, []) for record in records_op]
        progbar = Progbar(len(records_op))

        images = []
        labels = []
        for i, serialized_example in enumerate(records_op):
            progbar.update(i)
            with tf.variable_scope("parse_images", reuse=True):
                features = tf.parse_single_example(
                    serialized_example,
                    features={
                        'label': tf.FixedLenFeature([], tf.int64),
                        'image_raw': tf.FixedLenFeature([], tf.string),
                    })
                img = tf.decode_raw(features['image_raw'], tf.uint8)
                img.set_shape(batch_shape[1] * batch_shape[2])
                img = tf.reshape(img, [1] + batch_shape[1:])

                img = tf.cast(img, tf.float32) * (1. / 255) - 0.5

                label = tf.cast(features['label'], tf.int32)
                if one_hot and classes:
                    label = tf.one_hot(label, classes)

                images.append(img)
                labels.append(label)

        images = tf.parallel_stack(images, 0)
        labels = tf.parallel_stack(labels, 0)
        images = tf.cast(images, tf.float32)

        images = tf.reshape(images, shape=batch_shape)

        # StagingArea will store tensors
        # across multiple steps to
        # speed up execution
        images_shape = images.get_shape()
        labels_shape = labels.get_shape()
        copy_stage = data_flow_ops.StagingArea(
            [tf.float32, tf.float32],
            shapes=[images_shape, labels_shape])
        copy_stage_op = copy_stage.put(
            [images, labels])
        staged_images, staged_labels = copy_stage.get()
        print(images, labels)
        return images, labels
Example #16
  def testColocation1(self):
    with ops.device('/cpu:0'):
      x = array_ops.placeholder(dtypes.float32)
      v = 2. * (array_ops.zeros([128, 128]) + x)
    with ops.device('/gpu:0'):
      stager = data_flow_ops.StagingArea([dtypes.float32])
      y = stager.put([v])
      self.assertEqual(y.device, '/device:GPU:0')
    with ops.device('/cpu:0'):
      x = stager.get()
      self.assertEqual(x.device, '/device:CPU:0')
Example #17
def stage(tensors):
    stage_area = data_flow_ops.StagingArea(
        dtypes=[tensor.dtype for tensor in tensors],
        shapes=[tensor.get_shape() for tensor in tensors])
    put_op = stage_area.put(tensors)
    get_tensors = stage_area.get()

    get_tensors = [
        tf.reshape(gt, t.get_shape()) for (gt, t) in zip(get_tensors, tensors)
    ]
    return put_op, get_tensors
Example #18
    def cpu_gpu_copy(cpu_device, raw_device, host_images, host_labels,
                     gpu_copy_stage_ops, gpu_compute_stage_ops):
        with tf.device(cpu_device):
            images_shape = host_images.get_shape()
            labels_shape = host_labels.get_shape()
            gpu_copy_stage = df_ops.StagingArea(
                [tf.float32, tf.int32], shapes=[images_shape, labels_shape])
            gpu_copy_stage_op = gpu_copy_stage.put([host_images, host_labels])
            gpu_copy_stage_ops.append(gpu_copy_stage_op)
            host_images, host_labels = gpu_copy_stage.get()

        with tf.device(raw_device):
            gpu_compute_stage = df_ops.StagingArea(
                [tf.float32, tf.int32], shapes=[images_shape, labels_shape])
            # The CPU-to-GPU copy is triggered here.
            gpu_compute_stage_op = gpu_compute_stage.put(
                [host_images, host_labels])
            images, labels = gpu_compute_stage.get()
            images = tf.reshape(images, shape=images_shape)
            gpu_compute_stage_ops.append(gpu_compute_stage_op)
        return images, labels
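A sketch of how this two-stage helper might be invoked for a single GPU (the device strings, input tensors, and tf.group call are assumptions); both op lists must be run every step:

import tensorflow as tf

gpu_copy_stage_ops, gpu_compute_stage_ops = [], []
host_images = tf.zeros([32, 224, 224, 3])
host_labels = tf.zeros([32], dtype=tf.int32)
images, labels = cpu_gpu_copy('/cpu:0', '/gpu:0', host_images, host_labels,
                              gpu_copy_stage_ops, gpu_compute_stage_ops)
# Run the staging ops alongside the training op on every step.
step_stage_op = tf.group(*(gpu_copy_stage_ops + gpu_compute_stage_ops))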
Example #19
    def compute_gradient_and_apply(self, gradients_list, global_step,
                                   learning_rate):
        optimizer = self.get_optimizer(learning_rate)

        if self._use_staging:
            input_staging_op = tf.group(self._staging_put_ops)
            gradients_put_op = []
            gradients_get_op = [list() for _ in range(self._total_gpus)]
            for index, gradients in enumerate(gradients_list):
                worker_index = index // self._num_gpus
                gpu_index = index % self._num_gpus
                with tf.device(
                        self._gpu_devices[worker_index]
                    [gpu_index]), tf.name_scope("Gradient_Staging/Staging"):
                    dtypes = [g.dtype for g in gradients]
                    shapes = [g.shape for g in gradients]
                    staging_var = data_flow_ops.StagingArea(dtypes, shapes)
                    gradients_put_op.append(staging_var.put(gradients))
                    gradients_get_op[index] = staging_var.get()
            gradients_list = gradients_get_op

        with tf.name_scope('Gradient_Update'):
            global_varis = self.get_global_variable()

            apply_list = []
            for g_v in zip(*gradients_list, global_varis):
                grads = g_v[:self._total_gpus]
                varis = g_v[self._total_gpus]

                grad_sum_list = []
                for i in range(self._num_workers):
                    grads_in_worker = grads[i * self._num_gpus:(i + 1) *
                                            self._num_gpus]
                    if grads_in_worker[0] is not None:
                        with tf.device(self._cpu_device[i]):
                            grad_sum_in_worker = tf.add_n(grads_in_worker)
                            grad_sum_list.append(grad_sum_in_worker)
                if len(grad_sum_list) > 0:
                    with tf.device(varis.device):
                        average_grad = tf.multiply(tf.add_n(grad_sum_list),
                                                   1.0 / self._total_gpus)
                        apply = optimizer.apply_gradients([(average_grad,
                                                            varis)])
                        apply_list.append(apply)

            with tf.device(global_step.device):
                apply_list.append(global_step.assign_add(1))
            apply_op = tf.group(apply_list)

        if self._use_staging:
            return [input_staging_op, gradients_put_op, apply_op]
        else:
            return [apply_op]
Example #20
def stage(tensors):
    """Stages the given tensors in a StagingArea for asynchronous put/get.
    """
    stage_area = data_flow_ops.StagingArea(
        dtypes=[tensor.dtype       for tensor in tensors],
        shapes=[tensor.get_shape() for tensor in tensors])
    put_op      = stage_area.put(tensors)
    get_tensors = stage_area.get()

    get_tensors = [tf.reshape(gt, t.get_shape())
                   for (gt,t) in zip(get_tensors, tensors)]
    return put_op, get_tensors
Example #21
  def get_staged_next(self):
    next_tensor = self.multi_device_iterator.get_next()

    # StagingArea expects flat lists of dtypes/shapes, so flatten the
    # per-device (image, label) pairs before staging them.
    flat_tensors = [t for pair in next_tensor for t in pair]
    tensor_dtypes = [t.dtype for t in flat_tensors]
    tensor_shapes = [t.shape for t in flat_tensors]

    with tf.device('/cpu:0'):
      stage_area = data_flow_ops.StagingArea(dtypes=tensor_dtypes,
                                             shapes=tensor_shapes)
      put_op = stage_area.put(flat_tensors)
      get_tensors = stage_area.get()

    return put_op, get_tensors
Example #22
  def testSimple(self):
    with self.test_session(use_gpu=True) as sess:
      with ops.device('/cpu:0'):
        x = array_ops.placeholder(dtypes.float32)
        v = 2. * (array_ops.zeros([128, 128]) + x)
      with ops.device(test.gpu_device_name()):
        stager = data_flow_ops.StagingArea([dtypes.float32])
        stage = stager.put([v])
        y = stager.get()
        y = math_ops.reduce_max(math_ops.matmul(y, y))
      sess.run(stage, feed_dict={x: -1})
      for i in range(10):
        _, yval = sess.run([stage, y], feed_dict={x: i})
        self.assertAllClose(4 * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
Example #23
    def _build_image_processing(self, shift_ratio=0):
        """"Build the image (pre)processing portion of the model graph."""
        if self.use_synthetic_gpu_images:
            return (None, None)

        with tf.device('/cpu:0'):
            if self.params.eval:
                subset = 'validation'
            else:
                subset = 'train'
            image_producer_ops = []
            images_splits, labels_splits = self.image_preprocessor.minibatch(
                self.dataset,
                subset=subset,
                use_datasets=self.params.use_datasets,
                cache_data=self.params.cache_data,
                shift_ratio=shift_ratio)
            images_shape = images_splits[0].get_shape()
            labels_shape = labels_splits[0].get_shape()

        with tf.device('/gpu:0'):
            if self.params.eval:
                image_producer_stage = data_flow_ops.StagingArea(
                    [images_splits[0].dtype, labels_splits[0].dtype],
                    shapes=[images_shape, labels_shape],
                    capacity=1)
            else:
                image_producer_stage = data_flow_ops.StagingArea(
                    [images_splits[0].dtype, labels_splits[0].dtype],
                    shapes=[images_shape, labels_shape],
                    capacity=self.batch_group_size)

            put_op = image_producer_stage.put(
                [images_splits[0], labels_splits[0]])
            image_producer_ops.append(put_op)
        return (image_producer_ops, image_producer_stage)
Example #24
    def noise_queue(self, shuffle=True):
        with tf.device('/cpu:0'):
            noise_filename_queue, noise_filequeue_enqueue_op = self.string_input_queue(
                self.noise_filename, shuffle=shuffle, capacity=16384)

            audio, seq_len = self.noise_filequeue_reader(noise_filename_queue)

        stager = data_flow_ops.StagingArea([tf.float32, tf.int32],
                                           shapes=[(self.config.batch_size,
                                                    None, self.last_dim),
                                                   (self.config.batch_size,)])

        stage_op = stager.put((audio, seq_len))

        return stager, stage_op, noise_filequeue_enqueue_op
Example #25
  def testPeek(self):
    with ops.device('/cpu:0'):
      x = array_ops.placeholder(dtypes.int32, name='x')
      p = array_ops.placeholder(dtypes.int32, name='p')
    with ops.device(test.gpu_device_name()):
      stager = data_flow_ops.StagingArea([dtypes.int32, ], shapes=[[]])
      stage = stager.put([x])
      peek = stager.peek(p)
      ret = stager.get()

    with self.test_session(use_gpu=True) as sess:
      for i in range(10):
        sess.run(stage, feed_dict={x:i})

      for i in range(10):
        self.assertTrue(sess.run(peek, feed_dict={p:i}) == i)
Example #26
def _stage(tensors):
    """Stages the given tensors in a StagingArea for asynchronous put/get.
    """
    stage_area = data_flow_ops.StagingArea(
        dtypes=[tensor.dtype for tensor in tensors],
        shapes=[tensor.get_shape() for tensor in tensors])
    put_op = stage_area.put(tensors)
    get_tensors = stage_area.get()
    tf.add_to_collection('STAGING_AREA_PUTS', put_op)
    # This is a WAR for shape inference not working through Stage ops in the
    # backend, which prevents optimization of Sum ops in the bwd pass, which in
    # turn prevents the auto_mixed_precision pass from optimizing several ops.
    get_tensors = [
        tf.reshape(get_tensor, shape_for_reshape(tensor.get_shape()))
        for (tensor, get_tensor) in zip(tensors, get_tensors)
    ]
    return put_op, get_tensors
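shape_for_reshape is a project-local helper that is not shown in this snippet; a plausible minimal stand-in, purely an assumption, would map unknown dimensions to -1 so tf.reshape accepts a partially defined shape:

def shape_for_reshape(shape):
    # Hypothetical helper: assumes a known rank and at most one unknown
    # dimension, which tf.reshape infers from -1.
    return [-1 if dim is None else dim for dim in shape.as_list()]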
Example #27
    def testColocation(self):
        gpu_dev = test.gpu_device_name()

        with ops.Graph().as_default() as G:
            with ops.device('/cpu:0'):
                x = array_ops.placeholder(dtypes.float32)
                v = 2. * (array_ops.zeros([128, 128]) + x)
            with ops.device(gpu_dev):
                stager = data_flow_ops.StagingArea([dtypes.float32])
                y = stager.put([v])
                expected_name = gpu_dev if 'gpu' not in gpu_dev else '/device:GPU:0'
                self.assertEqual(y.device, expected_name)
            with ops.device('/cpu:0'):
                x = stager.get()[0]
                self.assertEqual(x.device, '/device:CPU:0')

        G.finalize()
Example #28
    def stage(tensors):
        """
        Stages the given tensors in a StagingArea for asynchronous put/get
        :param tensors: tf.Tensor
        :return: get and put tf.Op operations.
        """
        staging_area = data_flow_ops.StagingArea(
            dtypes=[tensor.dtype for tensor in tensors],
            shapes=[tensor.get_shape() for tensor in tensors])
        load_op = staging_area.put(tensors)
        get_tensors = staging_area.get()

        get_tensors = [
            tf.reshape(get_t, t.get_shape())
            for (get_t, t) in zip(get_tensors, tensors)
        ]
        return load_op, get_tensors
Example #29
    def __call__(self, getter, name, *args, **kwargs):
        name_split = name.split('/', 2)
        device_index = int(name_split[1].split('_')[1])
        name_without_tower = name_split[0] + '/' + name_split[2]

        if (name_without_tower in self._global_variable):
            global_var = self._global_variable[name_without_tower]
        else:
            if (self._param_server_device == self._cpu_device):
                with tf.device(self._cpu_device):
                    global_var = getter(name_without_tower, *args, **kwargs)
            else:
                min_size_device, _ = min(enumerate(self._local_sizes),
                                         key=operator.itemgetter(1))
                with tf.device(self._gpu_devices[min_size_device]):
                    global_var = getter(name_without_tower, *args, **kwargs)
                self._local_sizes[min_size_device] += global_var.get_shape(
                ).num_elements()
            self._global_variable[name_without_tower] = global_var

        if self._use_staging:
            shape = kwargs['shape']
            dtype = kwargs['dtype']
            # with tf.name_scope("Benchmark_Net/Input_Staging/Staging"):
            #     staging_var = data_flow_ops.StagingArea([dtype], [shape])
            #     put_op = staging_var.put(tf.identity(global_var))
            #     get_op = staging_var.get()[0]
            #     self._staging_put_ops.append(put_op)
            #     self._local_variable[device_index][name_without_tower] = get_op
            with tf.name_scope("Benchmark_Net/Input_Staging/Staging"):
                staging_var = data_flow_ops.StagingArea([dtype], [shape])
                # Tensor object has no attribute 'assign_sub'
                if ('moving' in name_without_tower):
                    self._local_variable[device_index][
                        name_without_tower] = global_var
                else:
                    put_op = staging_var.put(tf.identity(global_var))
                    get_op = staging_var.get()[0]
                    self._staging_put_ops.append(put_op)
                    self._local_variable[device_index][
                        name_without_tower] = get_op
        else:
            self._local_variable[device_index][name_without_tower] = global_var

        return self._local_variable[device_index][name_without_tower]
Example #30
  def testMultiple(self):
    with ops.Graph().as_default() as G:
      with ops.device('/cpu:0'):
        x = array_ops.placeholder(dtypes.float32)
        v = 2. * (array_ops.zeros([128, 128]) + x)
      with ops.device(test.gpu_device_name()):
        stager = data_flow_ops.StagingArea([dtypes.float32, dtypes.float32])
        stage = stager.put([x, v])
        z, y = stager.get()
        y = math_ops.reduce_max(z * math_ops.matmul(y, y))

    G.finalize()

    with self.session(graph=G) as sess:
      sess.run(stage, feed_dict={x: -1})
      for i in range(10):
        _, yval = sess.run([stage, y], feed_dict={x: i})
        self.assertAllClose(
            4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)