Exemplo n.º 1
0
def inference_gru_block_vs_gru_cell(batch_size,
                                    cell_size,
                                    input_size,
                                    time_steps,
                                    use_gpu=False,
                                    iters=30):
    """Time forward passes of GRUCell and GRUBlockCell under dynamic_rnn.

    Both cells are unrolled with `rnn.dynamic_rnn` over the same
    `[time_steps, batch_size, input_size]` input variable, starting from the
    same `[batch_size, cell_size]` state variable. One comma-separated row
    is printed: the arguments, both timings, and the difference between the
    timings as a percentage of the GRUCell timing.

    Args:
      batch_size: Second dimension of the input, first of the state.
      cell_size: Number of units passed to each cell constructor.
      input_size: Last dimension of the input.
      time_steps: First dimension of the (time-major) input.
      use_gpu: Forwarded to `benchmarking.device`.
      iters: Forwarded to `benchmarking.seconds_per_run`.

    Returns:
      A `(basic_time, block_time)` tuple holding the values reported by
      `benchmarking.seconds_per_run` for GRUCell and GRUBlockCell.
    """
    ops.reset_default_graph()
    with session.Session(graph=ops.Graph()) as sess:
        with benchmarking.device(use_gpu):

            # One fixed seed drives both the weight initializer and NumPy.
            seed = 1994
            initializer = init_ops.random_uniform_initializer(-1, 1, seed=seed)
            np.random.seed(seed)

            # Input sequence and initial state, shared by both cells.
            concat_x = vs.get_variable(
                "concat_x", [time_steps, batch_size, input_size])
            h = vs.get_variable("h", [batch_size, cell_size])

            def _time_cell(scope_name, cell_cls):
                """Build `cell_cls` under `scope_name` and time its outputs."""
                with vs.variable_scope(scope_name, initializer=initializer):
                    rnn_outputs, _ = rnn.dynamic_rnn(
                        cell_cls(cell_size),
                        inputs=concat_x,
                        initial_state=h,
                        time_major=True,
                        dtype=dtypes.float32)
                    # The cell's variables exist only after dynamic_rnn has
                    # built it, so initialization happens here.
                    sess.run([variables.global_variables_initializer()])
                    return benchmarking.seconds_per_run(
                        rnn_outputs, sess, iters)

            # Basic GRU cell first, then the block GRU cell.
            basic_time_inference = _time_cell("basic", rnn_cell.GRUCell)
            block_time_inference = _time_cell("block", gru_ops.GRUBlockCell)

        # Positive when the block cell is the faster of the two.
        performance_inference = (
            basic_time_inference - block_time_inference
        ) * 100 / basic_time_inference

        row = (batch_size, cell_size, input_size, time_steps, use_gpu,
               basic_time_inference, block_time_inference,
               performance_inference)
        print(",".join(map(str, row)))

        return basic_time_inference, block_time_inference
Exemplo n.º 2
0
def inference_gru_block_vs_gru_cell(batch_size,
                                    cell_size,
                                    input_size,
                                    time_steps,
                                    use_gpu=False,
                                    iters=30):
  """Compare dynamic_rnn forward-pass timings of GRUCell and GRUBlockCell.

  The two cells consume the same time-major input variable and the same
  initial-state variable. A comma-separated row with the arguments, the two
  timings and their relative difference (in percent of the GRUCell timing)
  is printed.

  Args:
    batch_size: Batch dimension of the input and state variables.
    cell_size: Number of units passed to each cell constructor.
    input_size: Last dimension of the input variable.
    time_steps: Leading dimension of the input variable.
    use_gpu: Forwarded to `benchmarking.device`.
    iters: Forwarded to `benchmarking.seconds_per_run`.

  Returns:
    Tuple `(basic_time, block_time)` of the values reported by
    `benchmarking.seconds_per_run`.
  """
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    with benchmarking.device(use_gpu):

      # Seed the weight initializer and NumPy with the same constant.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=seed)
      np.random.seed(seed)

      # Shared input sequence and initial state.
      input_shape = [time_steps, batch_size, input_size]
      state_shape = [batch_size, cell_size]
      concat_x = vs.get_variable("concat_x", input_shape)
      h = vs.get_variable("h", state_shape)

      # Reference implementation: GRUCell.
      with vs.variable_scope("basic", initializer=initializer):
        basic_cell = rnn_cell.GRUCell(cell_size)
        basic_outputs, _ = rnn.dynamic_rnn(
            basic_cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        sess.run([variables.global_variables_initializer()])
        basic_time_inference = benchmarking.seconds_per_run(
            basic_outputs, sess, iters)

      # Implementation under test: GRUBlockCell.
      with vs.variable_scope("block", initializer=initializer):
        block_cell = gru_ops.GRUBlockCell(cell_size)
        block_outputs, _ = rnn.dynamic_rnn(
            block_cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        sess.run([variables.global_variables_initializer()])
        block_time_inference = benchmarking.seconds_per_run(
            block_outputs, sess, iters)

    # Positive when the block cell is the faster of the two.
    time_saved = basic_time_inference - block_time_inference
    performance_inference = time_saved * 100 / basic_time_inference

    csv_fields = [
        batch_size, cell_size, input_size, time_steps, use_gpu,
        basic_time_inference, block_time_inference, performance_inference
    ]
    print(",".join(str(field) for field in csv_fields))

    return basic_time_inference, block_time_inference
Exemplo n.º 3
0
  def benchmarkLSTMBlockCellBpropWithDynamicRNN(self):
    """Benchmark gradient computation through an LSTMBlockCell dynamic_rnn.

    Sweeps batch size, cell size and device. For every combination a fresh
    graph is built in which the input size equals the cell size, and the
    gradients of the value returned by `rnn.dynamic_rnn` with respect to
    the input, kernel and bias variables are timed with
    `benchmarking.seconds_per_run`. Each timing is printed as a CSV row and
    passed to `self.report_benchmark`.
    """
    print("BlockLSTMCell backward propagation via dynamic_rnn().")
    print("--------------------------------------------------------------")
    print("LSTMBlockCell Seconds per inference.")
    print("batch_size,cell_size,input_size,time_steps,use_gpu,wall_time")
    iters = 10

    # Filled in from each config dict when the result is reported.
    benchmark_name_template = "_".join([
        "LSTMBlockCell_bprop", "BS%(batch_size)i", "CS%(cell_size)i",
        "IS%(cell_size)i", "TS%(time_steps)i", "gpu_%(use_gpu)s"
    ])

    sweep = {
        "batch_size": [1, 8, 13, 32, 67, 128],
        "cell_size": [128, 250, 512, 650, 1024, 1350],
        "time_steps": [40],
        "use_gpu": [True, False]
    }
    for config in benchmarking.dict_product(sweep):
      use_gpu = config["use_gpu"]
      time_steps = config["time_steps"]
      batch_size = config["batch_size"]
      # The input width is tied to the cell size in this benchmark.
      cell_size = input_size = config["cell_size"]

      with ops.Graph().as_default():
        with benchmarking.device(use_gpu=use_gpu):
          inputs = variable_scope.get_variable(
              "x", [time_steps, batch_size, cell_size],
              trainable=False,
              dtype=dtypes.float32)
          with variable_scope.variable_scope(
              "rnn", reuse=variable_scope.AUTO_REUSE):
            # NOTE(review): presumably these names, together with
            # AUTO_REUSE, make `kernel`/`bias` the same variables the cell
            # creates inside dynamic_rnn - confirm against the cell's
            # variable naming.
            kernel = variable_scope.get_variable(
                "rnn/lstm_cell/kernel",
                shape=[input_size + cell_size, cell_size * 4],
                dtype=dtypes.float32)
            bias = variable_scope.get_variable(
                "rnn/lstm_cell/bias",
                shape=[cell_size * 4],
                dtype=dtypes.float32,
                initializer=init_ops.zeros_initializer())
            cell = lstm_ops.LSTMBlockCell(cell_size)
            outputs = rnn.dynamic_rnn(
                cell, inputs, time_major=True, dtype=dtypes.float32)
          grads = gradients_impl.gradients(outputs, [inputs, kernel, bias])
          init_op = variables.global_variables_initializer()

        with session.Session() as sess:
          sess.run(init_op)
          wall_time = benchmarking.seconds_per_run(grads, sess, iters)

        # Print to stdout. If the TEST_REPORT_FILE_PREFIX environment variable
        # is set, this will produce a copy-paste-able CSV file.
        row = (batch_size, cell_size, cell_size, time_steps, use_gpu,
               wall_time)
        print(",".join(str(value) for value in row))

        self.report_benchmark(
            name=benchmark_name_template % config,
            iters=iters,
            wall_time=wall_time,
            extras=config)
Exemplo n.º 4
0
  def benchmarkLSTMBlockCellBpropWithDynamicRNN(self):
    """Time backward propagation of LSTMBlockCell run through dynamic_rnn.

    For each combination of batch size, cell size and device, builds a new
    graph (input size equal to cell size), takes the gradients of the value
    returned by `rnn.dynamic_rnn` with respect to the input, kernel and
    bias variables, and times them with `benchmarking.seconds_per_run`.
    Results are printed as CSV rows and sent to `self.report_benchmark`.
    """
    for banner_line in (
        "BlockLSTMCell backward propagation via dynamic_rnn().",
        "--------------------------------------------------------------",
        "LSTMBlockCell Seconds per inference.",
        "batch_size,cell_size,input_size,time_steps,use_gpu,wall_time"):
      print(banner_line)

    iters = 10
    for config in benchmarking.dict_product({
        "batch_size": [1, 8, 13, 32, 67, 128],
        "cell_size": [128, 250, 512, 650, 1024, 1350],
        "time_steps": [40],
        "use_gpu": [True, False]
    }):
      with ops.Graph().as_default():
        with benchmarking.device(use_gpu=config["use_gpu"]):
          time_steps = config["time_steps"]
          batch_size = config["batch_size"]
          # Input width and cell width are the same value here.
          cell_size = input_size = config["cell_size"]

          input_shape = [time_steps, batch_size, cell_size]
          inputs = variable_scope.get_variable(
              "x", input_shape, trainable=False, dtype=dtypes.float32)

          with variable_scope.variable_scope(
              "rnn", reuse=variable_scope.AUTO_REUSE):
            # NOTE(review): these names look chosen to match the variables
            # the cell creates under dynamic_rnn, so that AUTO_REUSE shares
            # them - confirm.
            kernel_shape = [input_size + cell_size, cell_size * 4]
            w = variable_scope.get_variable(
                "rnn/lstm_cell/kernel",
                shape=kernel_shape,
                dtype=dtypes.float32)
            b = variable_scope.get_variable(
                "rnn/lstm_cell/bias",
                shape=[cell_size * 4],
                dtype=dtypes.float32,
                initializer=init_ops.zeros_initializer())
            lstm_cell = lstm_ops.LSTMBlockCell(cell_size)
            rnn_result = rnn.dynamic_rnn(
                lstm_cell, inputs, time_major=True, dtype=dtypes.float32)

          grads = gradients_impl.gradients(rnn_result, [inputs, w, b])
          init_op = variables.global_variables_initializer()

        with session.Session() as sess:
          sess.run(init_op)
          wall_time = benchmarking.seconds_per_run(grads, sess, iters)

        # Print to stdout. If the TEST_REPORT_FILE_PREFIX environment variable
        # is set, this will produce a copy-paste-able CSV file.
        csv_values = [
            batch_size, cell_size, cell_size, time_steps, config["use_gpu"],
            wall_time
        ]
        print(",".join([str(value) for value in csv_values]))

        name_parts = [
            "LSTMBlockCell_bprop", "BS%(batch_size)i", "CS%(cell_size)i",
            "IS%(cell_size)i", "TS%(time_steps)i", "gpu_%(use_gpu)s"
        ]
        benchmark_name_template = "_".join(name_parts)

        self.report_benchmark(
            name=benchmark_name_template % config,
            iters=iters,
            wall_time=wall_time,
            extras=config)
Exemplo n.º 5
0
def single_bprop_step_gru_block_vs_gru_cell(batch_size,
                                            cell_size,
                                            input_size,
                                            use_gpu=False,
                                            iters=30):
    """Time a single-step gradient for GRUCell and GRUBlockCell.

    Each cell is applied once to identity copies of a
    `[batch_size, input_size]` input variable and a
    `[batch_size, cell_size]` state variable; the gradient of the cell's
    result with respect to the state variable is what gets timed. A
    comma-separated row with the arguments, both timings and their relative
    difference (in percent of the GRUCell timing) is printed.

    Args:
      batch_size: First dimension of the input and state variables.
      cell_size: Number of units passed to each cell constructor.
      input_size: Last dimension of the input variable.
      use_gpu: Forwarded to `benchmarking.device`.
      iters: Forwarded to `benchmarking.seconds_per_run`.

    Returns:
      Tuple `(basic_time, block_time)` of the values reported by
      `benchmarking.seconds_per_run`.
    """
    ops.reset_default_graph()
    with session.Session(graph=ops.Graph()) as sess:
        with benchmarking.device(use_gpu):
            initializer = init_ops.random_uniform_initializer(-1, 1, seed=1989)

            # Input and state variables shared by both cells.
            x = vs.get_variable("x", [batch_size, input_size])
            h = vs.get_variable("h", [batch_size, cell_size])

            # Reference implementation: GRUCell.
            with vs.variable_scope("basic", initializer=initializer):
                basic_cell = rnn_cell.GRUCell(cell_size)
                basic_x = array_ops.identity(x)
                basic_h = array_ops.identity(h)
                basic_output = basic_cell(basic_x, basic_h)
                sess.run([variables.global_variables_initializer()])
                basic_grads = gradients_impl.gradients([basic_output], h)
                basic_time_bprop = benchmarking.seconds_per_run(
                    basic_grads, sess, iters)

            # Implementation under test: GRUBlockCell.
            with vs.variable_scope("block", initializer=initializer):
                block_cell = gru_ops.GRUBlockCell(cell_size)
                block_x = array_ops.identity(x)
                block_h = array_ops.identity(h)
                block_output = block_cell(block_x, block_h)
                sess.run([variables.global_variables_initializer()])
                block_grads = gradients_impl.gradients([block_output], h)
                block_time_bprop = benchmarking.seconds_per_run(
                    block_grads, sess, iters)

    # Positive when the block cell is the faster of the two.
    performance_bprop = (basic_time_bprop -
                         block_time_bprop) * 100 / basic_time_bprop

    row = (batch_size, cell_size, input_size, use_gpu, basic_time_bprop,
           block_time_bprop, performance_bprop)
    print(",".join(map(str, row)))

    return basic_time_bprop, block_time_bprop
Exemplo n.º 6
0
    def benchmarkLSTMBlockCellFpropWithDynamicRNN(self):
        """Benchmark LSTMBlockCell forward passes run through dynamic_rnn.

        Sweeps batch size, cell size, device and dtype (float32/float16).
        For each combination a fresh graph is built with a
        `[time_steps, batch_size, cell_size]` input variable, the value
        returned by `rnn.dynamic_rnn` is timed with
        `benchmarking.seconds_per_run`, a CSV row is printed and the timing
        is passed to `self.report_benchmark`.

        The CSV header lists `dtype` first so that it has the same seven
        columns, in the same order, as the rows printed below it.
        """
        print("BlockLSTMCell forward propagation via dynamic_rnn().")
        print("--------------------------------------------------------------")
        print("LSTMBlockCell Seconds per inference.")
        # Must stay in sync with the row printed inside the loop.
        print("dtype,batch_size,cell_size,input_size,time_steps,use_gpu,"
              "wall_time")
        iters = 10
        for config in benchmarking.dict_product({
                "batch_size": [1, 8, 13, 32, 67, 128],
                "cell_size": [128, 250, 512, 650, 1024, 1350],
                "time_steps": [40],
                "use_gpu": [True, False],
                "dtype": ["float32", "float16"],
        }):
            # The config carries the dtype as a string so it can be used
            # directly in the CSV row and the benchmark name.
            dtype = dtypes.float32 if config[
                "dtype"] == "float32" else dtypes.float16
            with ops.Graph().as_default():
                with benchmarking.device(use_gpu=config["use_gpu"]):
                    inputs = variable_scope.get_variable(
                        "x",
                        dtype=dtype,
                        shape=[
                            config["time_steps"], config["batch_size"],
                            config["cell_size"]
                        ])
                    cell = lstm_ops.LSTMBlockCell(config["cell_size"],
                                                  dtype=dtype)
                    outputs = rnn.dynamic_rnn(cell,
                                              inputs,
                                              time_major=True,
                                              dtype=dtype)
                    init_op = variables.global_variables_initializer()

                with session.Session() as sess:
                    sess.run(init_op)
                    wall_time = benchmarking.seconds_per_run(
                        outputs, sess, iters)

                # Print to stdout. If the TEST_REPORT_FILE_PREFIX environment variable
                # is set, this will produce a copy-paste-able CSV file.
                # cell_size appears twice because the input size equals the
                # cell size in this benchmark.
                print(",".join(
                    map(str, [
                        config["dtype"], config["batch_size"],
                        config["cell_size"], config["cell_size"],
                        config["time_steps"], config["use_gpu"], wall_time
                    ])))
                benchmark_name_template = "_".join([
                    "LSTMBlockCell_fprop", "DT_%(dtype)s", "BS%(batch_size)i",
                    "CS%(cell_size)i", "IS%(cell_size)i", "TS%(time_steps)i",
                    "gpu_%(use_gpu)s"
                ])

                self.report_benchmark(name=benchmark_name_template % config,
                                      iters=iters,
                                      wall_time=wall_time,
                                      extras=config)
Exemplo n.º 7
0
def single_bprop_step_gru_block_vs_gru_cell(batch_size,
                                            cell_size,
                                            input_size,
                                            use_gpu=False,
                                            iters=30):
  """Compare one-step backprop timings of GRUCell and GRUBlockCell.

  Each cell is called once on identity copies of the input variable `x`
  and the state variable `h`; the gradient of the cell's result with
  respect to `h` is timed. One comma-separated row is printed with the
  arguments, both timings, and the difference between the timings as a
  percentage of the GRUCell timing.

  Args:
    batch_size: First dimension of `x` and `h`.
    cell_size: Number of units passed to each cell constructor.
    input_size: Last dimension of `x`.
    use_gpu: Forwarded to `benchmarking.device`.
    iters: Forwarded to `benchmarking.seconds_per_run`.

  Returns:
    Tuple `(basic_time, block_time)` of the values reported by
    `benchmarking.seconds_per_run`.
  """
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    with benchmarking.device(use_gpu):
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=1989)

      # Input and state variables shared by both cells.
      x = vs.get_variable("x", [batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])

      def _time_bprop(scope_name, cell_cls):
        """Build `cell_cls` under `scope_name` and time d(result)/d(h)."""
        with vs.variable_scope(scope_name, initializer=initializer):
          step_result = cell_cls(cell_size)(array_ops.identity(x),
                                            array_ops.identity(h))
          # Variables are initialized before the gradient ops are added.
          sess.run([variables.global_variables_initializer()])
          grad_wrt_state = gradients_impl.gradients([step_result], h)
          return benchmarking.seconds_per_run(grad_wrt_state, sess, iters)

      # Basic GRU cell first, then the block GRU cell.
      basic_time_bprop = _time_bprop("basic", rnn_cell.GRUCell)
      block_time_bprop = _time_bprop("block", gru_ops.GRUBlockCell)

  # Positive when the block cell is the faster of the two.
  time_saved = basic_time_bprop - block_time_bprop
  performance_bprop = time_saved * 100 / basic_time_bprop

  csv_fields = [
      batch_size, cell_size, input_size, use_gpu, basic_time_bprop,
      block_time_bprop, performance_bprop
  ]
  print(",".join(str(field) for field in csv_fields))

  return basic_time_bprop, block_time_bprop
Exemplo n.º 8
0
  def benchmarkLSTMBlockCellFpropWithDynamicRNN(self):
    """Benchmark LSTMBlockCell forward passes run through dynamic_rnn.

    Sweeps batch size, cell size, device and dtype (float32/float16). For
    each combination a fresh graph is built with a
    `[time_steps, batch_size, cell_size]` input variable, the value returned
    by `rnn.dynamic_rnn` is timed with `benchmarking.seconds_per_run`, a CSV
    row is printed and the timing is passed to `self.report_benchmark`.

    The CSV header lists `dtype` first so that it has the same seven
    columns, in the same order, as the rows printed below it.
    """
    print("BlockLSTMCell forward propagation via dynamic_rnn().")
    print("--------------------------------------------------------------")
    print("LSTMBlockCell Seconds per inference.")
    # Must stay in sync with the row printed inside the loop.
    print("dtype,batch_size,cell_size,input_size,time_steps,use_gpu,"
          "wall_time")
    iters = 10
    for config in benchmarking.dict_product({
        "batch_size": [1, 8, 13, 32, 67, 128],
        "cell_size": [128, 250, 512, 650, 1024, 1350],
        "time_steps": [40],
        "use_gpu": [True, False],
        "dtype": ["float32", "float16"],
    }):
      # The config carries the dtype as a string so it can be used directly
      # in the CSV row and the benchmark name.
      dtype = dtypes.float32 if config["dtype"] == "float32" else dtypes.float16
      with ops.Graph().as_default():
        with benchmarking.device(use_gpu=config["use_gpu"]):
          inputs = variable_scope.get_variable(
              "x",
              dtype=dtype,
              shape=[
                  config["time_steps"], config["batch_size"],
                  config["cell_size"]
              ])
          cell = lstm_ops.LSTMBlockCell(config["cell_size"], dtype=dtype)
          outputs = rnn.dynamic_rnn(cell, inputs, time_major=True, dtype=dtype)
          init_op = variables.global_variables_initializer()

        with session.Session() as sess:
          sess.run(init_op)
          wall_time = benchmarking.seconds_per_run(outputs, sess, iters)

        # Print to stdout. If the TEST_REPORT_FILE_PREFIX environment variable
        # is set, this will produce a copy-paste-able CSV file.
        # cell_size appears twice because the input size equals the cell
        # size in this benchmark.
        print(",".join(
            map(str, [
                config["dtype"], config["batch_size"], config["cell_size"],
                config["cell_size"], config["time_steps"], config["use_gpu"],
                wall_time
            ])))
        benchmark_name_template = "_".join([
            "LSTMBlockCell_fprop", "DT_%(dtype)s", "BS%(batch_size)i",
            "CS%(cell_size)i", "IS%(cell_size)i", "TS%(time_steps)i",
            "gpu_%(use_gpu)s"
        ])

        self.report_benchmark(
            name=benchmark_name_template % config,
            iters=iters,
            wall_time=wall_time,
            extras=config)
Exemplo n.º 9
0
def training_gru_block_vs_gru_cell(batch_size,
                                   cell_size,
                                   input_size,
                                   time_steps,
                                   use_gpu=False,
                                   iters=30):
  """Time a gradient-descent training step for GRUCell and GRUBlockCell.

  Each cell is unrolled with `rnn.dynamic_rnn` over the same time-major
  input variable and initial-state variable. The cost is the mean squared
  difference between the RNN outputs and a target variable `y`, and the
  timed op is the `GradientDescentOptimizer.minimize` op for that cost.
  One comma-separated row is printed with the arguments, both timings, and
  the difference between the timings as a percentage of the GRUCell timing.

  Args:
    batch_size: Batch dimension of the input, state and target variables.
    cell_size: Number of units passed to each cell constructor; also the
      last dimension of the target variable.
    input_size: Last dimension of the input variable.
    time_steps: Leading dimension of the input and target variables.
    use_gpu: Forwarded to `benchmarking.device`.
    iters: Forwarded to `benchmarking.seconds_per_run`.

  Returns:
    Tuple `(basic_time, block_time)` of the values reported by
    `benchmarking.seconds_per_run`.
  """
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    # Build the whole graph inside the device context chosen by `use_gpu`.
    with benchmarking.device(use_gpu):

      # One fixed seed drives both the weight initializer and NumPy.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=seed)
      np.random.seed(seed)

      # Input sequence, initial state and regression target, shared by both
      # cells.
      concat_x = vs.get_variable(
          "concat_x", [time_steps, batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])
      y = vs.get_variable("y", [time_steps, batch_size, cell_size])

      def _time_train_step(scope_name, cell_cls):
        """Build `cell_cls` under `scope_name` and time its training op."""
        with vs.variable_scope(scope_name, initializer=initializer):
          gru = cell_cls(cell_size)

          rnn_outputs, _ = rnn.dynamic_rnn(
              gru,
              inputs=concat_x,
              initial_state=h,
              time_major=True,
              dtype=dtypes.float32)
          # Variables are initialized before the cost and optimizer ops are
          # added to the graph.
          sess.run([variables.global_variables_initializer()])
          cost = math_ops.reduce_mean(math_ops.square(rnn_outputs - y))
          learning_rate = 0.01
          train_op = gradient_descent.GradientDescentOptimizer(
              learning_rate).minimize(cost)

          # Time for a training step.
          return benchmarking.seconds_per_run(train_op, sess, iters)

      # Basic GRU cell first, then the block GRU cell.
      basic_time_training = _time_train_step("basic", rnn_cell.GRUCell)
      block_time_training = _time_train_step("block", gru_ops.GRUBlockCell)

    # Positive when the block cell is the faster of the two.
    time_saved = basic_time_training - block_time_training
    performance_training = time_saved * 100 / basic_time_training

    row = (batch_size, cell_size, input_size, time_steps, use_gpu,
           basic_time_training, block_time_training, performance_training)
    print(",".join(map(str, row)))

    return basic_time_training, block_time_training