Example #1
def training_graph(model, opts, iterations_per_step=1):

    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = dict()
        # "precision" is assumed to be a string such as "16.16"; the first
        # field selects the compute datatype.
        datatype = tf.float16 if opts["precision"].split('.')[0] == '16' else tf.float32
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # Datasets must be defined outside the IPU device scope.
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)
    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver)
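A minimal driver sketch for the GraphOps tuple returned above. The field names, step counts, and learning-rate value are assumptions for illustration only (GraphOps is taken to be a namedtuple whose fields follow the constructor order: graph, session, init, ops, placeholders, iterator, outfeed, saver); this is not part of the original example.

# Hypothetical usage of training_graph(); names below are illustrative assumptions.
graph_ops = training_graph(model, opts, iterations_per_step=100)

with graph_ops.graph.as_default():
    graph_ops.session.run(graph_ops.init)                  # initialise variables
    graph_ops.session.run(graph_ops.iterator.initializer)  # initialise the infeed dataset

for step in range(100):  # illustrative number of host-side steps
    # Each run of the compiled op executes iterations_per_step iterations on the IPU.
    graph_ops.session.run(
        graph_ops.ops[0],
        feed_dict={graph_ops.placeholders['learning_rate']: 1e-3})
    results = graph_ops.session.run(graph_ops.outfeed)     # dequeue accumulated outputs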
Example #2
def build_graph(bert_config,
                opts,
                iterations_per_step=1,
                is_training=True,
                feed_name=None):
    """Build the graph for training.

    Args:
        bert_config: configuration for the BERT model.
        opts: a dictionary containing all global options.
        iterations_per_step: number of iterations per step.
        is_training (bool): if true return a graph with trainable variables.
        feed_name: name of the IPU infeed.

    Returns:
        a GraphOps containing a BERT graph and session prepared for inference or training.
    """
    train_graph = tf.Graph()
    with train_graph.as_default():

        placeholders = dict()
        placeholders['learning_rate'] = tf.placeholder(bert_config.dtype,
                                                       shape=[])
        learning_rate = placeholders['learning_rate']

        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=is_training),
            feed_name=feed_name + "_in",
            replication_factor=opts['replicas'])

        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name=feed_name + "_out", replication_factor=opts['replicas'])

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                bert_config,
                train_iterator,
                outfeed_queue,
                opts,
                learning_rate,
                iterations_per_step,
                is_training=is_training)

        outfeed = outfeed_queue.dequeue()

        bert_logging.print_trainable_variables(opts['logs_path'])

        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        model_and_optimiser_variables = tf.global_variables()

        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_ckpt'] else model_variables)

        # We store two savers: one for standard training checkpoints and one for the best checkpoint
        savers = {
            "train_saver":
            tf.train.Saver(var_list=model_variables if opts['ckpt_model_only']
                           else model_and_optimiser_variables,
                           name='latest',
                           max_to_keep=5),
            "best_saver":
            tf.train.Saver(var_list=model_variables if opts['ckpt_model_only']
                           else model_and_optimiser_variables,
                           name='best',
                           max_to_keep=1)
        }

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate number of IPUs required for pretraining pipeline.
    num_embedding_ipu = {
        'two_ipus': 2,
        'same_ipu': 1,
        'same_as_hidden_layers': 0
    }[opts['embeddings_placement']]

    num_hidden_layer_stages = len(bert_config.hidden_layers_per_stage)
    num_ipus_required = opts['replicas'] * next_power_of_two(
        num_hidden_layer_stages + num_embedding_ipu)

    # Configure the IPU options.
    ipu_options = get_ipu_config(
        fp_exceptions=opts["fp_exceptions"],
        stochastic_rounding=opts['stochastic_rounding'],
        xla_recompute=opts["xla_recompute"],
        available_memory_proportion=opts['available_memory_proportion'],
        disable_graph_outlining=opts["disable_graph_outlining"],
        num_ipus_required=num_ipus_required,
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        partials_type=opts['partials_type'])
    ipu.utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, savers, restore, tvars)
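Example #2 rounds the IPU count up with next_power_of_two, a helper that is not shown in the snippet. A minimal sketch consistent with how it is used here (IPU devices are acquired in power-of-two sized groups) might look like the following; this is an assumption, not the original implementation.

# Hypothetical helper: round a positive integer up to the nearest power of two.
def next_power_of_two(x):
    return 1 if x < 1 else 2 ** (x - 1).bit_length()

# e.g. 3 hidden-layer stages + 1 embedding IPU, 2 replicas: 2 * next_power_of_two(4) == 8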
Example #3
def training_graph(model, opts, iterations_per_step=1):
    train_graph = tf.Graph()
    sess_config = tf.ConfigProto()
    sess_target = None
    strategy = None

    if opts['distributed_cluster']:
        strategy, sess_target, sess_config = configure_distribution(
            opts, sess_config)

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        placeholders = dict()
        # "precision" is assumed to be a string such as "16.16"; the first
        # field selects the compute datatype.
        datatype = tf.float16 if opts["precision"].split('.')[0] == '16' else tf.float32
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # Datasets must be defined outside the IPU device scope.
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()
        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                for v in outfeed
            ]

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)
        with tf.device('cpu'):
            profile_report = gen_ipu_ops.ipu_event_trace()
        ipu.utils.move_variable_initialization_to_cpu(graph=None)
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)
    train_sess = tf.Session(graph=train_graph,
                            config=sess_config,
                            target=sess_target)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, profile_report)
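Example #3 additionally builds a profile_report op from gen_ipu_ops.ipu_event_trace() and returns it in the GraphOps tuple. A hedged sketch of how a driver might consume it, assuming the op yields a batch of serialized IPU trace events and reusing the illustrative graph_ops field names from the earlier sketch:

# Hypothetical: fetch accumulated IPU trace events and dump them for offline inspection.
if opts['profile']:
    raw_events = graph_ops.session.run(graph_ops.profile_report)
    with open('ipu_trace_events.bin', 'wb') as f:  # illustrative output path
        for event in raw_events:
            f.write(event)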
Example #4
def build_graph(opts, is_training=True):
    train_graph = tf.Graph()
    strategy = None

    if opts['use_popdist']:
        strategy = create_popdist_strategy()

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        if opts["groupbert"]:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.GroupBertConfig(vocab_size=None))
        else:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.BertConfig(vocab_size=None))

        bert_config.dtype = tf.float32 if opts[
            "precision"] == '32' else tf.float16

        # define placeholders
        placeholders = {
            'learning_rate': tf.placeholder(tf.float32, shape=[]),
            'loss_scaling': tf.placeholder(tf.float32, shape=[])
        }
        learning_rate = placeholders['learning_rate']
        loss_scaling = placeholders['loss_scaling']

        # Define the input; datasets must be defined outside the IPU device scope.
        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        # define output
        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()

        # Build the network with pipelining.
        def bert_net():
            return build_network(train_iterator, outfeed_queue, bert_config,
                                 opts, learning_rate, loss_scaling,
                                 is_training)

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, bert_config, opts,
                learning_rate, loss_scaling, is_training)

        # get result from outfeed queue
        outfeed = outfeed_queue.dequeue()

        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                for v in outfeed
            ]

        if opts['distributed_worker_index'] == 0 or opts['log_all_workers']:
            log.print_trainable_variables(opts)

        model_and_optimiser_variables = tf.global_variables()
        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_checkpoint'] else model_variables)

        train_saver = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['save_optimiser_to_checkpoint'] else model_variables,
            max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate the number of IPUs required.
    num_ipus = (max(opts['device_mapping']) + 1) * opts['replicas']
    num_ipus = ipu_utils.next_power_of_two(num_ipus)

    ipu_config = ipu_utils.get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        minimum_remote_tensor_size=opts['min_remote_tensor_size'],
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        ipu_id=opts['select_ipu'])

    if opts['use_popdist']:
        ipu_config = popdist.tensorflow.set_ipu_config(ipu_config,
                                                       opts['shards'],
                                                       configure_device=False)

    # Do not acquire a device, compile only.
    if opts["compile_only"]:
        ipu_config.device_connection.version = "ipu2"
        ipu_config.device_connection.enable_remote_buffers = True
        # PRE_COMPILE allows executables to be generated for the graph without an IPU device being attached.
        ipu_config.device_connection.type = DeviceConnectionType.PRE_COMPILE

        # Enforce using an executable cache dir, defaulting if not given
        if "TF_POPLAR_FLAGS" in os.environ:
            if "--executable_cache_path" not in os.environ["TF_POPLAR_FLAGS"]:
                print("Warning: --executable_cache_path in TF_POPLAR_FLAGS " +
                      "(for 'poprun --mpi_local_args') not set. Setting to default " +
                      "path: /tmp/tf_cache")
                # Append rather than overwrite so any existing flags are preserved.
                os.environ["TF_POPLAR_FLAGS"] += " --executable_cache_path=/tmp/tf_cache"

        # Sometimes TF_POPLAR_FLAGS might not even exist
        else:
            print("Warning: TF_POPLAR_FLAGS environment variable (for 'poprun " +
                  "--mpi_local_args') not set. --executable_cache_path must be " +
                  "defined when using --compile-only. Setting to default path: " +
                  "/tmp/tf_cache")
            os.environ["TF_POPLAR_FLAGS"] = "--executable_cache_path=/tmp/tf_cache"

    ipu_config.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, restore, tvars)
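Examples #4, #5, and #6 derive the number of IPUs from the pipeline's device_mapping and the replication factor. A short illustration of that arithmetic with made-up option values:

# Illustrative values only: pipeline stages mapped onto IPUs 0..2, replicated twice.
opts_example = {'device_mapping': [0, 1, 1, 2], 'replicas': 2}
ipus_per_replica = max(opts_example['device_mapping']) + 1   # -> 3
num_ipus = ipus_per_replica * opts_example['replicas']       # -> 6
# 6 is not a power of two, so it is rounded up to 8 before the IPU system is configured.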
Example #5
def build_graph(opts, iterations_per_step=1, is_training=True):

    train_graph = tf.Graph()
    with train_graph.as_default():
        if opts["groupbert"]:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.GroupBertConfig(vocab_size=None))
        else:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts[
            "precision"] == '32' else tf.float16
        placeholders = dict()

        if is_training:
            placeholders['learning_rate'] = tf.placeholder(bert_config.dtype,
                                                           shape=[])
            learning_rate = placeholders['learning_rate']
        else:
            learning_rate = None

        # Load the GLUE data file for the current task.
        label_list = opts["pass_in"][1]
        bert_config.num_lables = len(label_list)
        if opts['do_training'] and opts['current_mode'] == 'train':
            input_file = os.path.join(opts["output_dir"],
                                      f"train_{opts['task_type']}.tf_record")
        elif opts['do_eval'] and opts['current_mode'] == 'eval':
            input_file = os.path.join(opts["output_dir"],
                                      f"eval_{opts['task_type']}.tf_record")
        elif opts['do_predict'] and opts['current_mode'] == 'predict':
            input_file = os.path.join(
                opts["output_dir"], f"predict_{opts['task_type']}.tf_record")
        else:
            raise NotImplementedError()

        opts['input_file'] = input_file
        opts['drop_remainder'] = True

        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        def bert_net():
            return build_network(train_iterator, outfeed_queue,
                                 iterations_per_step, bert_config, opts,
                                 learning_rate, is_training)

        with ipu_scope('/device:IPU:0'):
            train = ipu.ipu_compiler.compile(bert_net, [])

        outfeed = outfeed_queue.dequeue()

        log.print_trainable_variables(opts)

        restore = tf.train.Saver(var_list=tf.global_variables())
        train_saver = tf.train.Saver(max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()
    """calculate the number of required IPU"""
    num_ipus = (max(opts['device_mapping']) + 1) * int(opts['replicas'])
    # The number of acquired IPUs must be the power of 2.
    if num_ipus & (num_ipus - 1) != 0:
        num_ipus = 2**int(math.ceil(math.log(num_ipus) / math.log(2)))
    ipu_config = get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection='CLUSTERING',
        compile_only=False,
        ipu_id=None,
        available_memory_proportion=opts["available_memory_proportion"])

    ipu_config.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, restore, tvars)
Example #6
def build_graph(opts, is_training=True, feed_name=None):
    train_graph = tf.Graph()
    strategy = None

    if opts['use_popdist']:
        strategy = create_popdist_strategy()

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        bert_config = bert_ipu.BertConfig.from_dict(opts)
        bert_config.dtype = tf.float32 if opts[
            "precision"] == '32' else tf.float16

        # define placeholders
        placeholders = {
            'learning_rate': tf.placeholder(bert_config.dtype, shape=[]),
            'loss_scaling': tf.placeholder(bert_config.dtype, shape=[])
        }
        learning_rate = placeholders['learning_rate']
        loss_scaling = placeholders['loss_scaling']

        # Define the input; datasets must be defined outside the IPU device scope.
        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            dataset.load(opts, is_training=is_training),
            feed_name=feed_name + "_in",
            replication_factor=opts['replicas'])
        # define output
        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name=feed_name + "_out", replication_factor=opts['replicas'])

        # Build the network with pipelining.
        def bert_net():
            return build_network(train_iterator, outfeed_queue, bert_config,
                                 opts, learning_rate, loss_scaling,
                                 is_training)

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, bert_config, opts,
                learning_rate, loss_scaling, is_training)

        # get result from outfeed queue
        outfeed = outfeed_queue.dequeue()

        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                for v in outfeed
            ]

        if opts['distributed_worker_index'] == 0 or opts['log_all_workers']:
            log.print_trainable_variables(opts)

        model_and_optimiser_variables = tf.global_variables()
        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_checkpoint'] else model_variables)

        train_saver = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['save_optimiser_to_checkpoint'] else model_variables,
            max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate the number of IPUs required.
    num_ipus = (max(opts['device_mapping']) + 1) * opts['replicas']
    num_ipus = ipu_utils.next_power_of_two(num_ipus)

    ipu_options = ipu_utils.get_config(
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        ipu_id=opts['select_ipu'])

    if opts['use_popdist']:
        ipu_options = popdist.tensorflow.set_ipu_config(ipu_options,
                                                        opts['shards'],
                                                        configure_device=False)

    ipu.utils.configure_ipu_system(ipu_options)

    # This is a workaround for https://github.com/tensorflow/tensorflow/issues/23780
    from tensorflow.core.protobuf import rewriter_config_pb2
    sess_cfg = tf.ConfigProto()
    sess_cfg.graph_options.rewrite_options.memory_optimization = (
        rewriter_config_pb2.RewriterConfig.OFF)

    train_sess = tf.Session(graph=train_graph, config=sess_cfg)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, restore, tvars)
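The later examples return both a restore saver and a train_saver. Below is a hypothetical checkpointing sketch for the GraphOps returned by Example #6; the field names, the 'init_checkpoint' option key, and the paths are assumptions for illustration only.

# Hypothetical save/restore flow for the tuple returned by build_graph().
graph_ops = build_graph(opts, is_training=True, feed_name='bert')

with graph_ops.graph.as_default():
    graph_ops.session.run(graph_ops.init)
    if opts.get('init_checkpoint'):
        # Restores model-only or model+optimiser variables, depending on
        # opts['restore_optimiser_from_checkpoint'] at graph-construction time.
        graph_ops.restore.restore(graph_ops.session, opts['init_checkpoint'])

# ... run training steps ...
graph_ops.saver.save(graph_ops.session, '/tmp/bert_ckpt/model', global_step=0)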