Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--hidden_size", type=int, default=4096)
    parser.add_argument("--timesteps", type=int, default=256)
    parser.add_argument("--max_examples", type=int, default=2048)
    parser.add_argument("--path", type=str, default=join(DATA_DIR, "amazon_reviews", "reviews_Movies_and_TV_5.json"))
    add_bool_flag(parser, "xla", False)
    args = parser.parse_args()
    hps = HParams(
        nhidden=args.hidden_size,
        nembd=64,
        nbatch=args.batch_size,
        nstates=2,
        nvocab=256,
        out_wn=False,
        rnn_wn=True,
        rnn_type='mlstm',
        embd_wn=True,
    )

    def build_model(x, y):
        cells, states, logits = model(hps, x, reuse=False)
        loss = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=y)
        mean_loss = tf.reduce_mean(loss)
        train_op = tf.train.GradientDescentOptimizer(0.01).minimize(mean_loss)
        loss = tf.reduce_sum(loss)
        with tf.control_dependencies([train_op]):
            train_op_loss = tf.identity(loss)
        return train_op_loss, loss

    X = tf.placeholder(tf.int32, [None, args.timesteps])
    Y = tf.placeholder(tf.int32, [None, args.timesteps])

    if args.xla:
        train_op, loss = xla.compile(computation=build_model, inputs=(X, Y))
    else:
        train_op, loss = build_model(X, Y)

    config = tf.ConfigProto()
    if args.xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    tf.global_variables_initializer().run(session=sess)

    # load some data
    x = load_training_data(path=args.path, timesteps=args.timesteps, max_examples=args.max_examples)

    for epoch in range(args.epochs):
        t0 = time.time()
        epoch_loss = 0.0
        for i in range(0, len(x), args.batch_size):
            batch_x = x[i:i + args.batch_size, 0:-1]
            batch_y = x[i:i + args.batch_size, 1:]
            _, batch_cost = sess.run((train_op, loss), {X: batch_x, Y: batch_y})
            epoch_loss += batch_cost
        t1 = time.time()
        print("%.3f\t%.3f" % (t1 - t0, epoch_loss))
Example #2
def create_and_run_graph(xla_enabled):
    # config = tf.ConfigProto()
    # config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    # tf.reset_default_graph()
    with tf.Graph().as_default() as graph:
        with tf.Session(graph=graph) as sess:

            x = tf.placeholder(tf.float32, shape=(None, 256, 256, 3), name='x')
            y = tf.placeholder(tf.float32, shape=(None, 64), name='y')
            z = tf.placeholder(tf.float32, shape=(None, 64), name='z')

            if xla_enabled:
                result = xla.compile(computation=model_fn, inputs=(x, y, z))[0]
                # result = tf.add(result, result)

            else:
                result = model_fn(x, y, z)

            # `result` is a normal Tensor (albeit one that is computed by an XLA
            # compiled executable) and can be used like any other Tensor.

            start = time.time()
            sess.run(tf.global_variables_initializer())
            for _ in range(epoch):
                output = sess.run(
                    result, feed_dict={
                        x: x_val,
                        y: y_val,
                        z: z_val
                    }
                )  # you can add memory info by adding options=run_opts to the sess.run
            end = time.time()
            p_t = end - start
    gc.collect()
    return output, p_t
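model_fn, epoch, and the x_val/y_val/z_val arrays come from elsewhere in the original script. A minimal stand-in consistent with the placeholder shapes (purely illustrative; the real model_fn is not shown):

import numpy as np
import tensorflow.compat.v1 as tf

epoch = 10
x_val = np.random.rand(8, 256, 256, 3).astype(np.float32)
y_val = np.random.rand(8, 64).astype(np.float32)
z_val = np.random.rand(8, 64).astype(np.float32)

def model_fn(x, y, z):
    # Collapse the image to per-channel means, project to 64 dims,
    # then combine with the two auxiliary inputs.
    features = tf.reduce_mean(x, axis=[1, 2])   # (batch, 3)
    features = tf.layers.dense(features, 64)    # (batch, 64)
    return features + y * z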
Example #3
def validation_graph(model, opts):
    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=False),
            feed_name='validation_feed',
            replication_factor=opts['replicas'] * opts['shards'])

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_accuracy, image, label):
                    accuracy = validation_graph_builder(
                        model, image, label, opts)
                    return total_accuracy + (
                        tf.cast(accuracy, tf.float32) /
                        opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['replicas'] > 1:
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=1,
        number_of_replicas=opts['replicas'] * opts['shards'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"])
    ipu.utils.configure_ipu_system(ipu_options)

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    return train.GraphOps(valid_graph, valid_sess, valid_init, [accuracy],
                          None, valid_iterator, None, valid_saver, None)
Example #4
        def get_grad_fn():
            ctx = get_current_tower_context()
            inputs = input.get_input_tensors()

            def compute_grad_from_inputs(*inputs):
                cost = get_cost_fn(*inputs)
                assert isinstance(cost, tf.Tensor), \
                    "Expect the given function to return a cost, but got {} instead".format(str(cost))
                assert cost.shape.ndims == 0, "Cost must be a scalar, but found {}!".format(
                    cost)

                if not ctx.is_training:
                    return None  # this is the tower function, could be called for inference

                if ctx.has_own_variables:
                    varlist = ctx.get_collection_in_tower(
                        tfv1.GraphKeys.TRAINABLE_VARIABLES)
                else:
                    varlist = tfv1.trainable_variables()
                opt = get_opt_fn()
                if is_tfv2() and isinstance(opt, tf.optimizers.Optimizer):
                    grads = opt.get_gradients(cost, varlist)
                    grads = list(zip(grads, varlist))
                else:
                    grads = opt.compute_gradients(
                        cost,
                        var_list=varlist,
                        gate_gradients=self.GATE_GRADIENTS,
                        colocate_gradients_with_ops=self.COLOCATE_GRADIENTS_WITH_OPS,
                        aggregation_method=self.AGGREGATION_METHOD)
                grads = FilterNoneGrad().process(grads)
                return grads

            if not self.XLA_COMPILE:
                return compute_grad_from_inputs(*inputs)
            else:
                try:
                    from tensorflow.contrib.compiler import xla  # deprecated
                except ImportError:
                    from tensorflow.python.compiler.xla import xla

                def xla_func():
                    grads = compute_grad_from_inputs(*inputs)
                    # unpack, because the return value
                    # of xla function cannot have nested structure
                    grads = [x[0] for x in grads]
                    return grads

                grads_no_vars = xla.compile(xla_func)
                if ctx.has_own_variables:
                    varlist = ctx.get_collection_in_tower(
                        tf.GraphKeys.TRAINABLE_VARIABLES)
                else:
                    varlist = tf.trainable_variables()
                return list(zip(grads_no_vars, varlist))
Example #5
def main():
    print("## start ##")
    if not os.path.exists(model_root_path):
        print("## create model folder ##")
        os.mkdir(model_root_path)

    if not os.path.exists(model_path):
        print("## download model ##")
        download(
            "https://s3.amazonaws.com/onnx-model-zoo/resnet/resnet50v2/resnet50v2.onnx",
            model_path, False)

    print("## load onnx model ##")
    onnx_model = onnx.load_model(model_path)

    print("## convert onnx -> tf_graph ##")
    tf_model = onnx_tf.backend.prepare(onnx_model, device="CPU")
    tf_graph = tf_model.graph
    placeholders = tf.contrib.framework.get_placeholders(tf_graph)

    print("## prepare input ##")
    x = np.reshape(
        np.arange(1 * 3 * 224 * 224, dtype="float32") * 0.5, (1, 3, 224, 224))

    print("## prepare tf model ##")
    y = tf_graph.get_tensor_by_name("add_1:0")  # for ResNet50
    feed_dict = {placeholders[0]: x}

    def create_graph(xx):
        # Returns the pre-built tensor `y` rather than building ops from `xx`,
        # so xla.compile has nothing to compile (hence the TODO below).
        return [y]

    print("## start session ##")
    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    with tf.device("device:XLA_CPU:0"):
        with tf.Session(graph=tf_graph, config=config) as sess:
            print("## compile xla ##")
            y_ = xla.compile(
                create_graph,
                [x])  # TODO: even with xla.compile, inference speed is unchanged, so this usage is probably wrong.

            print("## start run ##")
            start_time = time.time()
            out = sess.run(y_, feed_dict=feed_dict)
            exec_time = time.time() - start_time
            print(out)
            print(exec_time)
Example #6
def create_and_run_graph(xla_enabled):
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    # tf.reset_default_graph()
    with tf.Graph().as_default() as graph:
        with tf.Session(graph=graph) as sess:

            x = tf.placeholder(tf.float32, shape=(None, 256, 256, 3), name='x')
            y = tf.placeholder(tf.float32, shape=(None, 64), name='y')
            z = tf.placeholder(tf.float32, shape=(None, 64), name='z')

            if xla_enabled:
                result = xla.compile(computation=model_fn, inputs=(x, y, z))[0]

            else:
                result = model_fn(x, y, z)

            # `result` is a normal Tensor (albeit one that is computed by an XLA
            # compiled executable) and can be used like any other Tensor.

            sess.run(tf.global_variables_initializer())
            x_val1 = x_val.swapaxes(1, 3)
            start = time.time()
            for _ in range(epoch):
                output = sess.run(
                    result, feed_dict={
                        x: x_val1,
                        y: y_val,
                        z: z_val
                    }
                )  # you can add memory info by adding options=run_opts to the sess.run
            end = time.time()
            p_t = end - start
            # retrieve all the variables and delete them manually
            # a = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            # print(tf.get_default_graph().get_name_scope())
            # print(a)
            # for x in a:
            #     l = tf.get_variable(x.name, x.shape)
            #     print(l)
            print('maxrss: ',
                  resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        print('maxrss: ', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    print('maxrss: ', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    return output, p_t
Example #7
def validation_graph(model, opts):
    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=False),
            feed_name='validation_feed',
            replication_factor=opts['replicas'] * opts['shards'])

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_accuracy, image, label):
                    accuracy = validation_graph_builder(
                        model, image, label, opts)
                    return total_accuracy + (
                        tf.cast(accuracy, tf.float32) /
                        opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['replicas'] > 1:
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    return train.GraphOps(valid_graph, valid_sess, valid_init, [accuracy],
                          None, valid_iterator, None, valid_saver)
Example #8
def run(gemm, M, N, K, repeat=10):
    A = tf.placeholder(name="A", dtype=tf.float32, shape=(M, K))
    B = tf.placeholder(name="B", dtype=tf.float32, shape=(K, N))
    create_graph = functools.partial(gemm)
    [C] = xla.compile(create_graph, inputs=[A, B])
    # C = create_graph(A, B)
    A_np = np.random.uniform(0, 1, (M, K)).astype(np.float32)
    B_np = np.random.uniform(0, 1, (K, N)).astype(np.float32)  # must match placeholder B's (K, N) shape

    # A_np = np.array([[1, 2], [3, 4]], dtype=np.float32)
    # B_np = np.array([[5, 6], [7, 8]], dtype=np.float32)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # warm-up
        C_np = sess.run(C, feed_dict={A: A_np, B: B_np})
        beg = time.time()
        for i in range(repeat):
            C_np = sess.run(C, feed_dict={A: A_np, B: B_np})
        end = time.time()

    return (end - beg) * 1e3 / repeat
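gemm is supplied by the caller; it presumably builds the matrix product of its two placeholder inputs. A sketch of a compatible definition:

import tensorflow.compat.v1 as tf

def gemm(A, B):
    # Plain dense matmul; under xla.compile this lowers to a single
    # XLA computation.
    return tf.matmul(A, B)

# run(gemm, M=1024, N=1024, K=1024) then returns the mean latency
# per iteration in milliseconds.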
Example #9
def build_model(hidden_layers, use_xla):
    config = BertConfig(num_hidden_layers=hidden_layers,
                        num_attention_heads=4,
                        intermediate_size=1024,
                        hidden_size=512,
                        vocab_size=32000,
                        hidden_act="gelu")

    def build_model_fn(input_ids, token_type_ids, attention_mask, labels):
        (last_layer, ), output = bert_model(config, input_ids, token_type_ids,
                                            attention_mask, False)
        predictions = tf.contrib.layers.fully_connected(last_layer,
                                                        config.vocab_size,
                                                        activation_fn=None)
        loss = tf.losses.sparse_softmax_cross_entropy(logits=predictions,
                                                      labels=labels)
        mean_loss = tf.reduce_mean(loss)
        train_op = tf.train.GradientDescentOptimizer(0.01).minimize(mean_loss)
        loss = tf.reduce_sum(loss)
        # the control dependency ensures fetching the loss also runs train_op
        with tf.control_dependencies([train_op]):
            train_op_loss = tf.identity(loss)
        return train_op_loss, output, loss

    input_ids = tf.placeholder(tf.int32, [None, None])
    token_type_ids = tf.placeholder(tf.int32, [None, None])
    attention_mask = tf.placeholder(tf.int32, [None, None])
    labels = tf.placeholder(tf.int32, [None, None])

    if use_xla:
        train_op, output, loss = xla.compile(computation=build_model_fn,
                                             inputs=(input_ids, token_type_ids,
                                                     attention_mask, labels))
    else:
        train_op, output, loss = build_model_fn(input_ids, token_type_ids,
                                                attention_mask, labels)
    return BertModel(input_ids, token_type_ids, attention_mask, output, labels,
                     train_op, loss)
Example #10
            weights_initializer=slim.initializers.xavier_initializer())
        predictions = slim.softmax(logits, scope="softmax")

    return predictions


if __name__ == "__main__":
    batch_size = 128
    data_shape = (batch_size, 224, 224, 3)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    trials = 400
    inputs = tf.placeholder(name="input", dtype=tf.float32, shape=data_shape)
    # preds = mobilenet("mobilenet", inputs)
    create_net = functools.partial(mobilenet, "mobilenet")
    [preds] = xla.compile(create_net, inputs=[inputs])
    inputs_np = np.random.uniform(1e9, 1e10, data_shape).astype(np.float32)

    with tf.Session(config=config) as sess:
        # writer = tf.summary.FileWriter("graph", sess.graph)

        # options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        sess.run(tf.global_variables_initializer())

        # # warm up, no record
        output = sess.run(preds, feed_dict={inputs: inputs_np})

        # record
        beg = time.time()
        for i in range(trials):
            # run_metadata = tf.RunMetadata()
Example #11
    def attack(self, image_clean, label, model_func):
        # target_label = self._create_random_target(label)
        target_label = label  # untargeted attack

        def fp16_getter(getter, *args, **kwargs):
            name = args[0] if len(args) else kwargs['name']
            if not name.endswith('/W') and not name.endswith('/b'):
                """
                Following convention, convolution & fc are quantized.
                BatchNorm (gamma & beta) are not quantized.
                """
                return getter(*args, **kwargs)
            else:
                if kwargs['dtype'] == tf.float16:
                    kwargs['dtype'] = tf.float32
                    ret = getter(*args, **kwargs)
                    ret = tf.cast(ret, tf.float16)
                    log_once("Variable {} casted to fp16 ...".format(name))
                    return ret
                else:
                    return getter(*args, **kwargs)

        def one_step_attack(adv):
            if not self.USE_FP16:
                logits = model_func(adv)
            else:
                adv16 = tf.cast(adv, tf.float16)
                with custom_getter_scope(fp16_getter):
                    logits = model_func(adv16)
                    logits = tf.cast(logits, tf.float32)
            # Note we don't add any summaries here when creating losses, because
            # summaries don't work in conditionals.
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=target_label
            )  # we want to minimize it in targeted attack
            if not self.USE_FP16:
                g, = tf.gradients(losses, adv)
            else:
                """
                We perform loss scaling to prevent underflow:
                https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                (We have not yet tried training without scaling)
                """
                g, = tf.gradients(losses * 128., adv)
                g = g / 128.
            """
            Feature Denoising, Sec 5:
            We use the Projected Gradient Descent (PGD)
            (implemented at https://github.com/MadryLab/cifar10_challenge )
            as the white-box attacker for adversarial training
            """
            adv = tf.clip_by_value(adv + tf.sign(g) * self.step_size,
                                   lower_bound, upper_bound)
            return adv

        """
        Feature Denoising, Sec 6:
        Adversarial perturbation is considered under L∞ norm (i.e., maximum difference for each pixel).
        """
        lower_bound = tf.clip_by_value(image_clean - self.epsilon, -1., 1.)
        upper_bound = tf.clip_by_value(image_clean + self.epsilon, -1., 1.)
        """
        Feature Denoising Sec. 5:
        We randomly choose from both initializations in the
        PGD attacker during adversarial training: 20% of training
        batches use clean images to initialize PGD, and 80% use
        random points within the allowed ε.
        """
        init_start = tf.random_uniform(tf.shape(image_clean),
                                       minval=-self.epsilon,
                                       maxval=self.epsilon)
        start_from_noise_index = tf.cast(
            tf.greater(tf.random_uniform(shape=[]),
                       self.prob_start_from_clean), tf.float32)
        start_adv = image_clean + start_from_noise_index * init_start

        if self.USE_XLA:
            assert tuple(map(int, tf.__version__.split('.')[:2])) >= (1, 12)
            from tensorflow.contrib.compiler import xla
        with tf.name_scope('attack_loop'):
            adv_final = tf.while_loop(
                lambda _: True,
                one_step_attack if not self.USE_XLA else
                lambda adv: xla.compile(lambda: one_step_attack(adv))[0],
                [start_adv],
                back_prop=False,
                maximum_iterations=self.num_iter,
                parallel_iterations=1)
        return adv_final, target_label
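For reference, each iteration of the while_loop above is one PGD step under the L∞ constraint; in NumPy terms (a sketch, with epsilon and step_size as the attack hyper-parameters):

import numpy as np

def pgd_step(adv, grad, x_clean, epsilon, step_size):
    # Ascend the loss along sign(grad), then project back into the
    # L-infinity ball of radius epsilon around the clean image.
    adv = adv + step_size * np.sign(grad)
    adv = np.clip(adv, x_clean - epsilon, x_clean + epsilon)
    return np.clip(adv, -1., 1.)  # keep pixels in the valid range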
Example #12
        logits = tf.keras.layers.Dense(units=LABELS)(net)
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=labels,
                                                    logits=logits))
        global_step = tf.train.get_or_create_global_step()
        boundaries = [100000, 110000]
        values = [1.0, 0.5, 0.1]
        learning_rate = tf.train.piecewise_constant_decay(
            global_step, boundaries, values)
        train_step = tf.train.GradientDescentOptimizer(
            learning_rate, name="final_node").minimize(cross_entropy)
        with tf.control_dependencies([train_step]):
            return tf.identity(cross_entropy, name="results")


x = tf.placeholder(tf.float32, [BATCH_SIZE, FEAT_DIM], name='x')
y = tf.placeholder(tf.float32, [BATCH_SIZE, LABELS], name='y')

if xla_flag:
    (xla_loss, ) = compile(model_fn, [x, y])
else:
    xla_loss = model_fn(x, y)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # initialize_all_variables() is deprecated
    ans = sess.run(xla_loss,
                   feed_dict={
                       x: np.ones((BATCH_SIZE, FEAT_DIM)),
                       y: np.ones((BATCH_SIZE, LABELS))
                   })
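The excerpt omits its setup. compile here is presumably the contrib XLA entry point, and the constants are defined earlier in the script; an assumed preamble with illustrative values:

import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.contrib.compiler.xla import compile  # TF 1.x contrib path

BATCH_SIZE, FEAT_DIM, LABELS = 32, 128, 10
xla_flag = True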
Example #13
                                           train_ds.output_shapes)
images, labels = iterator.get_next()
images = tf.reshape(images, [-1, IMAGE_SIZE])
images, labels = tf.cast(images, tf.float32), tf.cast(labels, tf.int64)


def build_mnist_model(x, y_):
    y = tf.keras.layers.Dense(NUM_CLASSES).apply(x)

    cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

    return y, train_step


# XLA path: ask TensorFlow to execute the model in an XLA-friendly manner.
# Operations returned by the computation (here train_step) run through
# control dependencies, so only the tensor y is returned.
[y] = xla.compile(build_mnist_model, inputs=[images, labels])

# Non-XLA alternative (use one path or the other, not both): build the model
# directly and wire up the control dependency by hand.
y, train_step = build_mnist_model(images, labels)
with tf.control_dependencies([train_step]):
    y = tf.identity(y)

# Creates session and initialize all variables.
# xla.compile() doesn't work with Keras model.fit() API or TF eager mode yet.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Feeds training dataset
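The excerpt ends here; in the original tutorial the training loop continues roughly as follows (a sketch; TRAIN_STEPS is illustrative):

sess.run(iterator.make_initializer(train_ds))
TRAIN_STEPS = 1000
for _ in range(TRAIN_STEPS):
    # Fetching y also runs train_step through the control dependency.
    sess.run(y)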
Example #14
# To test effect of bootstrap_results in run_single_steps(), run bootstrap_results in isolation
def test_bootstrap_results():
    def _step(i, state):
        new_state = hmc.bootstrap_results(state).proposed_state
        return [i + 1, new_state]

    _, s = tf.while_loop(cond=lambda i, _: i < N_STEPS_PER_REPEAT,
                         body=_step,
                         loop_vars=[tf.constant(0), 1.])

    return s


if __name__ == '__main__':
    with scope(device):
        ss = xla.compile(run_single_steps, ())
        # br = xla.compile(test_bootstrap_results, ())

    conf = tf.ConfigProto(log_device_placement=True)
    sess = tf.Session(config=conf)
    sess.run(tf.global_variables_initializer())

    # Run once to compile
    sess.run(ss)
    # sess.run(br)

    t_total = 0.
    t_total_br = 0.

    print('Running HMC.')
    for itr in range(N_REPEATS):
Example #15
        '--save-graph',
        action="store_true",
        help="Save default graph to 'logs' directory (used by TensorBoard)")
    parser.add_argument('--report',
                        action="store_true",
                        help="Save execution and compilation reports as JSON")
    options = parser.parse_args()

    # Initialize the HMC transition kernel.
    hmc = tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=unnormalized_log_prob,
        num_leapfrog_steps=options.leapfrog_steps,
        step_size=1.)

    with ipu.scopes.ipu_scope('/device:IPU:0'):
        ss = xla.compile(lambda: run_single_steps(hmc, options.hmc_steps), ())

    # Report
    report = gen_ipu_ops.ipu_event_trace()

    # Dump the graph to a logdir
    if options.save_graph:
        writer = tf.summary.FileWriter(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), 'logs',
                         time.strftime('%Y%m%d_%H%M%S_%Z')))
        writer.add_graph(tf.get_default_graph())

    config = utils.create_ipu_config(profiling=options.report,
                                     profile_execution=options.report,
                                     report_every_nth_execution=1,
                                     max_report_size=0x100000000)
Example #16
def maybe_xla_compile(hparams, fn, *args):
    pure_fn = lambda: fn(*args)
    if hparams and hparams.xla_compile:
        return xla.compile(pure_fn)
    else:
        return pure_fn()
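A hedged usage sketch for maybe_xla_compile (the HParams object below is illustrative). Note the asymmetry: with xla_compile=True the result is xla.compile's output, a list of tensors, while with False it is whatever fn returns directly:

import tensorflow.compat.v1 as tf
from tensorflow.contrib.training import HParams  # TF 1.x

hparams = HParams(xla_compile=True)

def tower(x):
    return tf.nn.relu(x) * 2.0

x = tf.placeholder(tf.float32, [None, 8])
out = maybe_xla_compile(hparams, tower, x)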
Example #17
    def batch_all_reduce(self,
                         all_device_tensors,
                         num_splits,
                         compact_tensors,
                         defer_tensors,
                         xla_compile=False):
        """Performs a batch all-reduce.

    The reduction done is a sum.

    `all_device_tensors` is a list of lists of tensors that will be batch
    all-reduced. All tensors within a single inner list must be on the same
    device. The nth element in each list, for any n, will be reduced together.
    The return value is in the same form as `all_device_tensors`, except that
    each tensor is reduced.

    For example, if `all_device_tensors` is:
    [[ A,  B  ],     # A and B are on GPU 0
     [ C,  D  ]]     # C and D are on GPU 1

    Then the return value will be:
    [[ A+C,  B+D ],  # These two tensors are on GPU 0
     [ A+C,  B+D ]]  # These two tensors are on GPU 1

    Arguments:
      all_device_tensors: A list of lists of tensors. `all_device_tensors[i][j]`
        is a tensor where `i` is the device index and `j` is the tensor index.
      num_splits: If not None, tensors will be concatenated and split into this
        many pieces during the all-reduce, then split back into their original
        shapes afterwards. Has no impact on correctness and can improve
        performance. Requires all tensors to be the same type.
      compact_tensors: If True, tensors are cast to fp16 before being all-
        reduced. Improves performance, but hurts numerical stability.
      defer_tensors: If True, every time the return value
        `reduced_all_device_tensors` is evaluated, the result will be the
        reduced tensors values of `all_device_tensors` from the previous session
        run instead of the current session run, or zero on the first session
        run. This can improve performance. When training neural networks,
        deferring gradients often does not harm training, so this can be used to
        improve performance.
      xla_compile: If True, use XLA to compile gradients packing and unpacking
        ops.

    Returns:
      reduced_all_device_tensors: A list in the same form as
        `all_device_tensors`, except each tensor has been reduced.
      warmup_ops: A list of ops needed to be run once before the all-reduce can
        occur.
    """

        # Before all-reducing tensors, we do several preprocessing functions that
        # can speed up the all-reduce. We undo these functions after all-reducing
        # the tensors.

        # all_device_packed_tensors is a 2-d list of tensors indexed by
        # [device_id][tensor_id], holding packed tensors from all devices involved
        # in all-reduce.
        all_device_packed_tensors = []

        # all_device_warmup_ops is a 2-d list of ops indexed by
        # [device_id][tensor_id], holding warmup_ops that need to be run once before
        # all-reduce can occur.
        all_device_warmup_ops = []

        # all_device_put_ops is a 2-d list of ops indexed by
        # [device_id][tensor_id], holding put ops for deferred tensors. They will be
        # called in each all-reduce step automatically due to control dependency.
        all_device_put_ops = []

        # packers is a list of _TensorPacker, one for each device involved in
        # all-reduce.
        packers = [
            _TensorPacker(num_splits, compact_tensors)
            for _ in all_device_tensors
        ]

        for packer, device_tensors in zip(packers, all_device_tensors):

            def pack_single_device_tensors(packer=packer,
                                           device_tensors=device_tensors):
                """Pack gradient tensors of a device."""
                packed_tensors = packer.maybe_concat_tensors(device_tensors)
                packed_tensors = packer.maybe_compact_tensors(packed_tensors)
                # When xla_compile=False, defer tensors after concat for better
                # performance.
                if defer_tensors and not xla_compile:
                    packed_tensors, put_ops, warmup_ops = defer_single_device_tensors(
                        packed_tensors)
                    all_device_put_ops.append(put_ops)
                    all_device_warmup_ops.append(warmup_ops)
                packed_tensors = packer.maybe_split_tensors(packed_tensors)
                return packed_tensors

            with tf.device(device_tensors[0].device):
                if xla_compile:
                    packed_tensors = xla.compile(pack_single_device_tensors)
                    # When xla_compile=True, intermediate tensors in packing process are
                    # not materialized. Thus, we defer tensors after packing process is
                    # completed instead of in the middle of it.
                    if defer_tensors:
                        packed_tensors, put_ops, warmup_ops = defer_single_device_tensors(
                            packed_tensors)
                        all_device_put_ops.append(put_ops)
                        all_device_warmup_ops.append(warmup_ops)
                else:
                    packed_tensors = pack_single_device_tensors()

            all_device_packed_tensors.append(packed_tensors)

        # Perform all-reduce on packed tensors.
        all_device_tensors = self._do_batch_all_reduce(
            all_device_packed_tensors)

        all_device_unpacked_tensors = []
        for packer, device_tensors in zip(packers, all_device_tensors):

            def unpack_single_device_tensors(packer=packer,
                                             device_tensors=device_tensors):
                """Unpack gradient tensors of a device."""
                unpacked_tensors = packer.undo_maybe_split_tensors(
                    device_tensors)
                unpacked_tensors = packer.undo_maybe_compact_tensors(
                    unpacked_tensors)
                unpacked_tensors = packer.undo_maybe_concat_tensors(
                    unpacked_tensors)
                return unpacked_tensors

            with tf.device(device_tensors[0].device):
                if xla_compile:
                    unpacked_device_tensor = xla.compile(
                        unpack_single_device_tensors)
                else:
                    unpacked_device_tensor = unpack_single_device_tensors()

            all_device_unpacked_tensors.append(unpacked_device_tensor)

        # Note: There is no undo operation for deferring tensors. But we do need to
        # call _add_put_op_control_deps at the end if we deferred the tensors.
        if defer_tensors:
            all_device_unpacked_tensors = _add_put_op_control_deps(
                all_device_unpacked_tensors, num_splits, all_device_put_ops)

        return all_device_unpacked_tensors, all_device_warmup_ops
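The docstring's example can be sanity-checked with plain NumPy: the nth tensors across devices are summed, and every device receives all of the sums:

import numpy as np

# all_device_tensors[i][j]: tensor j on device i (toy scalars here).
all_device_tensors = [[np.float32(1), np.float32(2)],   # GPU 0: A, B
                      [np.float32(3), np.float32(4)]]   # GPU 1: C, D
num_tensors = len(all_device_tensors[0])
sums = [sum(dev[j] for dev in all_device_tensors) for j in range(num_tensors)]
reduced = [list(sums) for _ in all_device_tensors]
print(reduced)  # [[4.0, 6.0], [4.0, 6.0]], i.e. [[A+C, B+D], [A+C, B+D]]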
Example #18
        def get_grad_fn():
            ctx = get_current_tower_context()
            inputs = input.get_input_tensors()

            def compute_grad_from_inputs(*inputs):
                cost = get_cost_fn(*inputs)
                assert isinstance(cost, tf.Tensor), cost
                assert cost.shape.ndims == 0, "Cost must be a scalar, but found {}!".format(cost)

                if not ctx.is_training:
                    return None     # this is the tower function, could be called for inference

                if ctx.has_own_variables:
                    varlist = ctx.get_collection_in_tower(tf.GraphKeys.TRAINABLE_VARIABLES)
                else:
                    varlist = tf.trainable_variables()

                if os.getenv("TENSORPACK_FP16"):

                    loss_scale = 1024.0

                    if os.getenv("CUSTOM_LOSS_SCALE"):
                        loss_scale = float(os.getenv("CUSTOM_LOSS_SCALE"))

                    print(f'TENSORPACK_FP16 set. Using FP16 loss scaling of {loss_scale}')
                    cost *= loss_scale

                opt = get_opt_fn()
                grads = opt.compute_gradients(
                    cost, var_list=varlist,
                    gate_gradients=self.GATE_GRADIENTS,
                    colocate_gradients_with_ops=self.COLOCATE_GRADIENTS_WITH_OPS,
                    aggregation_method=self.AGGREGATION_METHOD)
                grads = FilterNoneGrad().process(grads)

                if os.getenv("TENSORPACK_FP16"):
                    grads = [(g * 1.0 / loss_scale, v) for g, v in grads]

                if os.getenv("TENSORPACK_SUMMARY_GRADIENT"):
                    grads = SummaryGradient().process(grads)

                if os.getenv("TENSORPACK_FREEZE_VARS"):
                    grads = [(g - g, v) for g, v in grads]  # zero gradients so the variables stay frozen

                return grads

            if not self.XLA_COMPILE:
                return compute_grad_from_inputs(*inputs)
            else:
                from tensorflow.contrib.compiler import xla

                def xla_func():
                    grads = compute_grad_from_inputs(*inputs)
                    # unpack, because the return value
                    # of xla function cannot have nested structure
                    grads = [x[0] for x in grads]
                    return grads

                grads_no_vars = xla.compile(xla_func)
                if ctx.has_own_variables:
                    varlist = ctx.get_collection_in_tower(tf.GraphKeys.TRAINABLE_VARIABLES)
                else:
                    varlist = tf.trainable_variables()
                return list(zip(grads_no_vars, varlist))
Example #19
    def call(self, inputs):

        def get_kernel_fn(dau_w, dau_mu1, dau_mu2, dau_sigma, max_kernel_size, mu_learning_rate_factor=1):

            # add mu1/mu2 gradient multiplier
            if mu_learning_rate_factor != 1:
                dau_mu1 = mu_learning_rate_factor * dau_mu1 + (1 - mu_learning_rate_factor) * tf.stop_gradient(dau_mu1)
                dau_mu2 = mu_learning_rate_factor * dau_mu2 + (1 - mu_learning_rate_factor) * tf.stop_gradient(dau_mu2)

            [X,Y] = np.meshgrid(np.arange(max_kernel_size),np.arange(max_kernel_size))

            X = np.reshape(X,(max_kernel_size*max_kernel_size,1,1,1)) - int(max_kernel_size/2)
            Y = np.reshape(Y,(max_kernel_size*max_kernel_size,1,1,1)) - int(max_kernel_size/2)

            X = X.astype(np.float32)
            Y = Y.astype(np.float32)

            # Gaussian kernel
            X = tf.convert_to_tensor(X,name='X',dtype=tf.float32)
            Y = tf.convert_to_tensor(Y,name='Y',dtype=tf.float32)

            gauss_kernel = tf.exp(-1 * (tf.pow(X - dau_mu1, 2.0) + tf.pow(Y - dau_mu2, 2.0)) / (2.0 * tf.pow(dau_sigma, 2.0)), name='gauss_kernel')

            gauss_kernel_sum = tf.reduce_sum(gauss_kernel, axis=0, keep_dims=True, name='gauss_kernel_sum')

            gauss_kernel_norm = tf.divide(gauss_kernel, gauss_kernel_sum, name='gauss_kernel_norm')

            # normalize to sum of 1 and add weight
            gauss_kernel_norm = tf.multiply(dau_w, gauss_kernel_norm,name='gauss_kernel_weight')

            # sum over Gaussian units
            gauss_kernel_norm = tf.reduce_sum(gauss_kernel_norm, axis=2, keep_dims=True,name='gauss_kernel_sum_units')

            # convert to [Kw,Kh,S,F] shape
            gauss_kernel_norm = tf.reshape(gauss_kernel_norm, (max_kernel_size, max_kernel_size, gauss_kernel_norm.shape[1], gauss_kernel_norm.shape[3]),name='gauss_kernel_reshape')

            return gauss_kernel_norm

        try:
            # try with XLA if exists
            from tensorflow.contrib.compiler import xla

            gauss_kernel_norm = xla.compile(computation=get_kernel_fn, inputs=(self.dau_weights, self.dau_mu1, self.dau_mu2, self.dau_sigma, self.max_kernel_size, self.mu_learning_rate_factor))[0]

        except Exception:
            # otherwise (XLA unavailable or compilation failed) revert to the direct call
            gauss_kernel_norm = get_kernel_fn(self.dau_weights, self.dau_mu1, self.dau_mu2, self.dau_sigma, self.max_kernel_size, self.mu_learning_rate_factor)

        outputs = self._convolution_op(inputs, gauss_kernel_norm)

        if self.use_bias:
            if self.data_format == 'channels_first':
                if self.rank == 1:
                    # nn.bias_add does not accept a 1D input tensor.
                    bias = array_ops.reshape(self.bias, (1, self.filters, 1))
                    outputs += bias
                if self.rank == 2:
                    outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
                if self.rank == 3:
                    # As of Mar 2017, direct addition is significantly slower than
                    # bias_add when computing gradients. To use bias_add, we collapse Z
                    # and Y into a single dimension to obtain a 4D input tensor.
                    outputs_shape = outputs.shape.as_list()
                    if outputs_shape[0] is None:
                        outputs_shape[0] = -1
                    outputs_4d = array_ops.reshape(outputs,
                                                   [outputs_shape[0], outputs_shape[1],
                                                    outputs_shape[2] * outputs_shape[3],
                                                    outputs_shape[4]])
                    outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
                    outputs = array_ops.reshape(outputs_4d, outputs_shape)
            else:
                outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')

        if self.activation is not None:
            return self.activation(outputs)
        return outputs
Example #20
def compile(computation, inputs=None):
    """Builds an operator that compiles and runs `computation` with the Graphcore
     IPU XLA backend.

  Args:
    computation: A Python function that builds a computation to apply to the
      input. If the function takes n inputs, `inputs` should be a list of n
      tensors.

      `computation` may return a list of operations and tensors.  Tensors must
      come before operations in the returned list.  The return value of
      `compile` is a list of tensors corresponding to the tensors from the
      output of `computation`.

      All `Operation`s returned from `computation` will be executed when
      evaluating any of the returned output tensors.
    inputs: A list of inputs or `None` (equivalent to an empty list). Each input
      can be a nested structure containing values that are convertible to
      tensors. Note that passing an N-dimension list of compatible values will
      result in an N-dimension list of scalar tensors rather than a single
      Rank-N tensor. If you need different behaviour, convert part of inputs to
      tensors with `tf.convert_to_tensor`.

  Returns:
    Same data structure as if computation(*inputs) were called directly, with
    some exceptions for correctness:

    1. None output: a NoOp is returned, which control-depends on computation.
    2. Single value output: a tuple containing the value is returned.
    3. Operation-only outputs: a NoOp is returned, which control-depends on
       computation.

  Raises:
    Exception: If the computation was not compiled for an IPU device.
  """
    old_op_list = ops.get_default_graph().get_operations()
    result = xla.compile(computation, inputs)

    new_op_list = ops.get_default_graph().get_operations()

    added_ops = set(old_op_list) ^ set(new_op_list)
    # Go over all the new added ops, check that they have been placed on an IPU
    # device.
    placed_on_ipu = False
    all_no_ops = True
    for o in added_ops:
        if o.device.startswith('/device:IPU'):
            placed_on_ipu = True
            break
        elif o.type != 'NoOp':
            all_no_ops = False

    if not placed_on_ipu and not all_no_ops:
        raise Exception("""\
A computation has been compiled, however it was not placed on an IPU device. \
This computation will not be executed on an IPU.
To execute it on an IPU use the `ipu_scope` from `tensorflow.contrib.ipu.ops`, \
for example:

  with ipu_scope('/device:IPU:0'):
    result = ipu_compiler.compile(comp, inputs)
""")
    return result
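Usage follows the pattern quoted in the error message; a minimal sketch (the exact ipu_scope import path varies across Graphcore SDK releases; tensorflow.contrib.ipu.ops is the one named above, so treat it as an assumption):

import tensorflow.compat.v1 as tf
from tensorflow.contrib.ipu.ops import ipu_scope  # assumed path, per the error text

def comp(a, b):
    return a + b

with ipu_scope('/device:IPU:0'):
    (result,) = compile(comp, inputs=[tf.constant(1.0), tf.constant(2.0)])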
Example #21
def validation_graph(model, opts):
    reconfigure = not opts.get('reuse_IPUs', False)
    if opts['use_popdist'] and reconfigure:
        hvd.init()

    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_dataset = dataset.data(
            opts, is_training=False).map(lambda x: {'data_dict': x})

        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            valid_dataset, prefetch_depth=opts['prefetch_depth'])

        if opts['latency']:
            timestamp_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_accuracy, data_dict):
                    accuracy = validation_graph_builder(model, data_dict, opts)
                    if opts['latency']:
                        timestamp_enqueue = timestamp_queue.enqueue(
                            data_dict['timestamp'])
                        return (total_accuracy +
                                (tf.cast(accuracy, tf.float32) /
                                 opts["validation_batches_per_step"]),
                                timestamp_enqueue)
                    else:
                        return total_accuracy + (
                            tf.cast(accuracy, tf.float32) /
                            opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['total_replicas'] * opts['shards'] > 1 and not opts.get(
                        'inference', False):
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['total_replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        if opts['latency']:
            print(f'relative_timer start {relative_timer.get_start()}')
            timestamp = tf.cast(tf.timestamp() - relative_timer.get_start(),
                                tf.float32)
            latency_per_batch = tf.reshape(
                timestamp - timestamp_queue.dequeue(), [-1])
        else:
            latency_per_batch = None

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

        if opts['use_popdist']:
            broadcast_weights = []
            for var in tf.global_variables():
                broadcast_weights.append(
                    var.assign(hvd.broadcast(var, root_rank=0)))
            global_batch_size_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_global_batch_size = hvd.broadcast(global_batch_size_ph,
                                                        root_rank=0)
            num_files_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_num_files = hvd.broadcast(num_files_ph, root_rank=0)
            iteration_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_iteration = hvd.broadcast(iteration_ph, root_rank=0)
        else:
            broadcast_weights = None
            broadcast_global_batch_size, global_batch_size_ph = None, None
            broadcast_num_files, num_files_ph = None, None
            broadcast_iteration, iteration_ph = None, None

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=False,  # disable Stochastic Rounding for validation
        shards=opts['shards'],
        number_of_replicas=opts['total_replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        half_partials=opts["enable_half_partials"],
        conv_dithering=opts["enable_conv_dithering"],
        enable_recomputation=opts["enable_recomputation"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"],
        compile_only=opts["compile_only"],
        internalExchangeOptimisationTarget=opts[
            "internal_exchange_optimisation_target"],
        num_io_tiles=opts["num_io_tiles"],
        number_of_distributed_batch_norm_replicas=opts.get("BN_span", 1),
        nanoo=not opts["saturate_on_overflow"],
    )

    if opts['use_popdist'] and reconfigure:
        ipu_options = popdist.tensorflow.set_ipu_config(ipu_options,
                                                        opts['shards'],
                                                        configure_device=False)

    if opts['on_demand'] and reconfigure:
        ipu_options.device_connection.enable_remote_buffers = True
        ipu_options.device_connection.type = ipu.utils.DeviceConnectionType.ON_DEMAND

    if reconfigure:
        ipu_options.configure_ipu_system()

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    ops = {
        'accuracy': accuracy,
        'broadcast_weights': broadcast_weights,
        'broadcast_global_batch_size': broadcast_global_batch_size,
        'broadcast_num_files': broadcast_num_files,
        'broadcast_iteration': broadcast_iteration,
        'latency_per_batch': latency_per_batch
    }

    placeholders = {
        'global_batch_size': global_batch_size_ph,
        'num_files': num_files_ph,
        'iteration': iteration_ph
    }

    valid_graph.finalize()

    return train.GraphOps(valid_graph, valid_sess, valid_init, ops,
                          placeholders, valid_iterator, None, valid_saver)