Example #1
def set_distribution_defaults(opts):
    if opts['distributed'] and opts['use_popdist']:
        raise ValueError("Cannot use popdist with --distributed")

    if opts['distributed']:
        # Read the cluster config from the `TF_CONFIG` environment variable
        cluster = tf.distribute.cluster_resolver.TFConfigClusterResolver()

        # Allow `mpirun` to override the task index
        cluster.task_id = os.getenv("OMPI_COMM_WORLD_RANK")
        cluster.task_type = "worker"

        opts['distributed_worker_count'] = cluster.cluster_spec().num_tasks(
            "worker")
        opts['distributed_worker_index'] = cluster.task_id
        opts['distributed_cluster'] = cluster.cluster_spec().as_dict()

        opts['summary_str'] += 'Distribution\n'
        opts['summary_str'] += ' Worker count: {distributed_worker_count}\n'
        opts['summary_str'] += ' Worker index: {distributed_worker_index}\n'
        opts['summary_str'] += ' Cluster: {distributed_cluster}\n'
    elif opts['use_popdist']:
        opts['distributed_worker_count'] = int(popdist.getNumTotalReplicas() /
                                               popdist.getNumLocalReplicas())
        opts['distributed_worker_index'] = int(
            popdist.getReplicaIndexOffset() / popdist.getNumLocalReplicas())
        opts['distributed_cluster'] = None

        opts['summary_str'] += 'Popdist\n'
        opts['summary_str'] += ' Process count: {distributed_worker_count}\n'
        opts['summary_str'] += ' Process index: {distributed_worker_index}\n'
    else:
        opts['distributed_worker_count'] = 1
        opts['distributed_worker_index'] = 0
        opts['distributed_cluster'] = None
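
A minimal usage sketch for the helper above, assuming it is called after argument parsing; the dict carries only the keys the function reads, and the flag values shown are placeholders:

opts = {
    'distributed': False,                      # would normally come from a --distributed flag
    'use_popdist': popdist.isPopdistEnvSet(),  # true when launched via poprun
    'summary_str': '',
}
set_distribution_defaults(opts)
print(opts['summary_str'].format(**opts))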
Example #2
def set_popdist_args(args):
    if not popdist.isPopdistEnvSet():
        args.use_popdist = False
        args.popdist_size = 1
        args.popdist_rank = 0
        return

    if args.inference:
        raise RuntimeError("Distributed execution is only supported for training")

    try:
        import horovod.popart as hvd
        hvd.init()
    except ImportError:
        raise ImportError("Could not find the PopART horovod extension. "
                          "Please install the horovod .whl provided in the Poplar SDK.")

    args.use_popdist = True
    popdist_local_factor = popdist.getNumLocalReplicas()
    if args.replication_factor > 1 and args.replication_factor != popdist_local_factor:
        logger.warning(f"Overwriting the local replication factor {args.replication_factor} to {popdist_local_factor}")
    args.replication_factor = popdist_local_factor

    args.popdist_size = popdist.getNumTotalReplicas() // popdist.getNumLocalReplicas()
    args.popdist_rank = popdist.getReplicaIndexOffset() // popdist.getNumLocalReplicas()
    args.checkpoint_dir = args.checkpoint_dir + "_rank_" + str(args.popdist_rank)

    from mpi4py import MPI
    setup_comm(MPI.COMM_WORLD)
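
A sketch of how set_popdist_args might be driven; the argparse.Namespace fields are placeholders for whatever the surrounding training script actually defines:

import argparse

args = argparse.Namespace(inference=False,
                          replication_factor=1,
                          checkpoint_dir="checkpoints")
set_popdist_args(args)
# Outside poprun this only sets the defaults:
# use_popdist=False, popdist_size=1, popdist_rank=0.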
Example #3
def init_popdist(args):
    hvd.init()
    args.use_popdist = True
    if popdist.getNumTotalReplicas() != args.replication_factor:
        print(f"The number of replicas is overridden by PopRun. "
              f"The new value is {popdist.getNumTotalReplicas()}.")
    args.replication_factor = int(popdist.getNumLocalReplicas())
    args.popdist_rank = popdist.getInstanceIndex()
    args.popdist_size = popdist.getNumInstances()
Example #4
def init_popdist(args):
    hvd.init()
    args.use_popdist = True
    if popdist.getNumTotalReplicas() != args.replicas:
        logging.warn(f"The number of replicas is overridden by poprun. The new value is {popdist.getNumTotalReplicas()}.")
    args.replicas = int(popdist.getNumLocalReplicas())
    args.popdist_rank = popdist.getInstanceIndex()
    args.popdist_size = popdist.getNumInstances()
    args.popdist_local_rank = hvd.local_rank()
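
For reference, the effective global batch size after init_popdist can be derived as below; the attribute names follow Example #4, while micro_batch_size and gradient_accumulation are assumed to be set elsewhere on args:

def global_batch_size(args):
    # Local replicas per instance multiplied by the number of instances
    # gives the total replica count across all instances.
    total_replicas = args.replicas * args.popdist_size
    return args.micro_batch_size * args.gradient_accumulation * total_replicas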
Example #5
    # Add pretraining-specific command line options here.
    return parser


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.ERROR)

    opts = make_global_options([add_pretraining_options])

    opts['shards'] = ipu_utils.next_power_of_two(
        max(opts["device_mapping"]) + 1)

    if popdist.isPopdistEnvSet():
        opts['use_popdist'] = True
        opts['replicas'] = popdist.getNumLocalReplicas()
        opts['total_replicas'] = popdist.getNumTotalReplicas()
        if opts['compile_only']:
            opts['select_ipu'] = None
        else:
            opts['select_ipu'] = popdist.getDeviceId()
    else:
        opts['use_popdist'] = False
        opts['total_replicas'] = opts['replicas']
        opts['select_ipu'] = None

    set_defaults(opts)

    set_poplar_engine_options(execution_profile=opts['execution_profile'],
                              memory_profile=opts['memory_profile'],
                              profile_dir=str(opts['profile_dir']),
                              sync_replicas_independently=opts['replicas'] > 1)
Example #6
    parser.add_argument("--init_weight",
                        type=str,
                        default="./ckpt_init/yolov3_coco_converted.fp16.ckpt",
                        help="ckpt init weight")

    arguments = parser.parse_args()
    with open(arguments.config) as f:
        opts = json.load(f)

    opts['train']['annot_path'] = arguments.train_path
    opts['train']['initial_weight'] = arguments.init_weight
    opts['test']['annot_path'] = arguments.test_path
    if popdist.isPopdistEnvSet():
        opts["use_popdist"] = True
        opts["train"]["replicas"] = popdist.getNumLocalReplicas()
        opts["train"]["total_replicas"] = popdist.getNumTotalReplicas()
        opts["select_ipu"] = popdist.getDeviceId(
            len(opts["train"]["device_mapping"]))
        opts["distributed_worker_count"] = int(popdist.getNumTotalReplicas() /
                                               popdist.getNumLocalReplicas())
        opts["distributed_worker_index"] = int(
            popdist.getReplicaIndexOffset() / popdist.getNumLocalReplicas())
    else:
        opts["use_popdist"] = False
        opts["train"]["total_replicas"] = opts["train"]["replicas"]
        opts["select_ipu"] = -1
        opts["distributed_worker_count"] = 1
        opts["distributed_worker_index"] = 0
Example #7
def bert_session_options(args, model):
    engine_options = {}
    options = popart.SessionOptions()
    options.virtualGraphMode = popart.VirtualGraphMode.Manual
    options.enableFloatingPointChecks = args.floating_point_exceptions
    options.enableStochasticRounding = args.stochastic_rounding
    options.enablePrefetchDatastreams = not args.minimum_latency_inference

    # These options are necessary to allow Poplar to overlap the processing of
    # multiple iterations on the host side.
    options.defaultPrefetchBufferingDepth = 3
    options.rearrangeAnchorsOnHost = False
    engine_options["exchange.streamBufferOverlap"] = "hostRearrangeOnly"

    options.enableOutlining = not args.no_outlining
    options.subgraphCopyingStrategy = popart.SubgraphCopyingStrategy.JustInTime
    partials_type = "half" if args.enable_half_partials else "float"
    options.partialsTypeMatMuls = partials_type
    options.convolutionOptions = {'partialsType': partials_type}
    if args.replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = args.replication_factor
        engine_options["target.syncReplicasIndependently"] = "true"
    if args.use_popdist:
        popdist.popart.configureSessionOptions(options)
    # Increasing the outlineThreshold prevents the creation of subgraphs of cheap
    # ops such as add or reshapeInplace; instead, only ops with a high subgraph
    # value, such as matmul or normalisation, are reused.
    options.outlineThreshold = 10.0
    if args.pipeline:
        options.enablePipelining = True
        options.autoRecomputation = popart.RecomputationType.Pipeline
        if args.recompute_checkpoint_every_layer and any(
                map(lambda l: l > 1, args.layers_per_ipu)):
            options.scheduleNonWeightUpdateGradientConsumersEarly = True

    options.optimizerStateTensorLocationSettings = bert_optimizer_location_settings(
        args)

    # RTS to shard optimizer states with multiple IPU Pods
    num_local_replicas = popdist.getNumLocalReplicas()
    num_total_replicas = popdist.getNumTotalReplicas()

    if num_total_replicas > num_local_replicas and args.replicated_tensor_sharding:
        # Sharding tensors with fewer elements than the number of local replicas would not make sense
        options.optimizerStateTensorLocationSettings.minElementsForReplicatedTensorSharding = num_local_replicas
        sharding_domain = popart.CommGroup(popart.CommGroupType.Consecutive,
                                           num_local_replicas)

        # Ensure all related tensors have the same sharding domain set
        options.weightTensorLocationSettings.location.shardingDomain = sharding_domain
        options.optimizerStateTensorLocationSettings.location.shardingDomain = sharding_domain
        options.accumulatorTensorLocationSettings.location.shardingDomain = sharding_domain

    if "Mean" in args.gradient_reduction_type:
        options.accumulationAndReplicationReductionType = popart.ReductionType.Mean
        options.meanAccumulationAndReplicationReductionStrategy = popart.MeanReductionStrategy.Post
        if args.gradient_reduction_type == "RunningMean":
            options.meanAccumulationAndReplicationReductionStrategy = popart.MeanReductionStrategy.Running

    if args.gradient_accumulation_factor > 1:
        options.enableGradientAccumulation = True
        options.accumulationFactor = args.gradient_accumulation_factor

        # When not replicated, SyncPattern.SinglePipeline provides better overlap
        # than this option.
        if device_is_replicated(args):
            if args.optimizer_state_offchip:
                options.accumulateOuterFragmentSettings = popart.AccumulateOuterFragmentSettings(
                    popart.AccumulateOuterFragmentSchedule.
                    OverlapMemoryOptimized, [0])
            elif args.replicated_tensor_sharding:
                # With OnChip + RTS this clusters optimizer steps into schedule
                # bins, improving outlining and scheduling time.
                options.accumulateOuterFragmentSettings = popart.AccumulateOuterFragmentSettings(
                    popart.AccumulateOuterFragmentSchedule.
                    OverlapMemoryOptimized)

    if args.engine_cache is not None:
        options.enableEngineCaching = True
        options.cachePath = args.engine_cache
    if args.profile:
        options.enableEngineCaching = False
    options.instrumentWithHardwareCycleCounter = args.report_hw_cycle_count
    options.disableGradAccumulationTensorStreams = not args.save_initializers_externally
    if args.max_copy_merge_size == -1:
        logger.debug("No copy merge size limit applied")
    else:
        logger.warning(
            f"Copy merge size limit set to {args.max_copy_merge_size}")
        engine_options["opt.maxCopyMergeSize"] = str(args.max_copy_merge_size)

    # Adding {"fullyConnectedPass", "TRAINING_BWD"} to some matmuls causes large
    # transposes before operations.
    if args.disable_fully_connected_pass:
        if args.task == "SQUAD" and args.sequence_length == 384:
            logger.warning(
                "Fully connected pass has been disabled. This may cause SQuAD 384 12-layer to go OOM."
            )
        options.enableFullyConnectedPass = False

    if args.inference and args.engine_cache is not None and not args.variable_weights_inference:
        logger.warning(
            "Using engine cache with constant weights. Checkpoint weights will be ignored. "
            "Use the `--variable-weights-inference` flag if checkpoint weights should be used."
        )

    if args.variable_weights_inference:
        options.constantWeights = False

    if args.group_host_syncs:
        options.groupHostSync = True

    if args.internal_exchange_optimisation_target is not None:
        engine_options["opt.internalExchangeOptimisationTarget"] = str(
            args.internal_exchange_optimisation_target)

    options.engineOptions = engine_options

    # Set synthetic data mode (if active)
    if args.synthetic_data:
        if args.synthetic_data_initializer == "zeros":
            options.syntheticDataMode = popart.SyntheticDataMode.Zeros
        else:
            options.syntheticDataMode = popart.SyntheticDataMode.RandomNormal
        logger.info(
            f"Running with Synthetic Data Type '{options.syntheticDataMode}'")
    return options
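
The returned options are typically paired with a device acquired through popdist when it is enabled; a rough sketch, where ipus_per_replica and the surrounding args and model are assumptions:

ipus_per_replica = 4  # assumption; depends on how the model is pipelined
options = bert_session_options(args, model)
if args.use_popdist:
    device = popdist.popart.getDevice(ipus_per_replica)
else:
    device = popart.DeviceManager().acquireAvailableDevice(ipus_per_replica)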
Example #8
    checkpoint_dir = args.checkpoint_dir
    label_smoothing = args.label_smoothing
    optimizer_name = args.optimizer
    optimizer_params = args.optimizer_params
    seed = args.seed
    internal_exchange_optimization_target = args.internal_exchange_optimization_target
    max_cross_replica_buffer_size = args.max_cross_replica_buffer_size
    max_reduce_many_buffer_size = args.max_reduce_many_buffer_size
    gather_conv_output = args.gather_conv_output
    pipeline_num_parallel = args.pipeline_num_parallel

    # check if the script has been called by poprun
    distributed_training = popdist.isPopdistEnvSet()

    if distributed_training:
        if num_replicas != popdist.getNumTotalReplicas():
            logging.warning(
                f'Replication factor given to poprun (=={popdist.getNumTotalReplicas()}) '
                f'does not match the config (=={num_replicas}). Poprun will override the config.'
            )
            num_replicas = popdist.getNumTotalReplicas()

        max_threads_per_instance = os.cpu_count() // popdist.getNumInstances()
        if pipeline_num_parallel > max_threads_per_instance:
            logging.warning(
                f'The chosen number of threads ({pipeline_num_parallel}) exceeds the total number of physical threads '
                f'divided by the number of instances. Poprun will override the config.'
            )
            # Limit the number of threads to the total number of physical
            # threads divided by the number of instances.
            pipeline_num_parallel = max_threads_per_instance
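
When several instances are launched, each instance is usually also given a disjoint shard of the input data; a sketch assuming a tf.data.Dataset called dataset built elsewhere:

    if distributed_training:
        dataset = dataset.shard(num_shards=popdist.getNumInstances(),
                                index=popdist.getInstanceIndex())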
Example #9
            def gradient_normalizer(grads_and_vars):
                return [(grad / popdist.getNumTotalReplicas() /
                         batch_config.gradient_accumulation_count, var)
                        for grad, var in grads_and_vars]

        optimizer_params['learning_rate'] = lr_scheduler
Example #10
def replicated_tensor_sharding_core():
    parser = argparse.ArgumentParser(description="Parse launch parameters.")
    parser.add_argument("--tensors", nargs="*")
    parser.add_argument("--optim", nargs="?")
    parser.add_argument("--tmpdir", nargs="?")
    parser.add_argument("--filename", nargs="?")
    parser.add_argument("--compute_batch", nargs="?")
    args = parser.parse_args(sys.argv[2:])

    ipus_per_replica = 1

    batches_per_step = 10
    accumulation_factor = 4
    compute_batch = int(args.compute_batch)
    hidden_size = 4
    reduction = popart.ReductionType.Sum

    deviceInfo = popdist.popart.getDevice(ipus_per_replica)
    num_local_replicas = popdist.getNumLocalReplicas()
    num_total_replicas = popdist.getNumTotalReplicas()

    builder = popart.Builder()

    np.random.seed(12321)
    weight_data = np.random.rand(hidden_size, hidden_size).astype(np.float32)

    input_data = []
    label_data = []

    for i in range(
            0, batches_per_step * num_local_replicas * accumulation_factor *
            compute_batch):
        np.random.seed(popdist.getInstanceIndex() +
                       i * popdist.getNumInstances())
        input_data += [np.random.rand(hidden_size).astype(np.float32)]
        label_data += [np.random.randint(0, hidden_size, size=1)]

    input_data = np.concatenate(input_data)
    label_data = np.concatenate(label_data)

    d0 = builder.addInputTensor(
        popart.TensorInfo("FLOAT", (compute_batch, hidden_size)), "d0")
    l0 = builder.addInputTensor(popart.TensorInfo("UINT32", (compute_batch, )),
                                "l0")

    data = {}

    data[d0] = input_data.reshape((batches_per_step, num_local_replicas,
                                   accumulation_factor, compute_batch, -1))

    w0 = builder.addInitializedInputTensor(weight_data, 'weight0')
    x = builder.aiOnnx.matmul([d0, w0])

    x = builder.aiOnnx.softmax([x])

    data[l0] = label_data.reshape(
        (batches_per_step, num_local_replicas, accumulation_factor,
         compute_batch, -1)).astype(np.uint32)
    loss = builder.aiGraphcore.nllloss([x, l0],
                                       reduction=reduction,
                                       debugContext='loss')

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(
        batches_per_step,
        {av: popart.AnchorReturnType("ALL")
         for av in [x, loss]})

    opts = popart.SessionOptions()
    if accumulation_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accumulation_factor
    opts.explicitRecomputation = True
    opts.enableExplicitMainLoops = True
    opts.useHostCopyOps = True
    # Let popdist handle distributed settings, such as:
    # opts.enableDistributedReplicatedGraphs
    # opts.globalReplicaOffset
    # opts.globalReplicationFactor
    popdist.popart.configureSessionOptions(opts)

    for tensor in ["weight", "optimizerState", "accumulator"]:
        userOption = tensor + "TensorLocationSettings"
        print(
            f"Setting RTS: {userOption}, num_total_replicas: {num_total_replicas} num_local_replicas: {num_local_replicas}"
        )
        locationSetting = getattr(opts, userOption)
        locationSetting.minElementsForOffChip = 0
        locationSetting.minElementsForReplicatedTensorSharding = num_total_replicas
        if tensor in args.tensors:
            locationSetting.location.replicatedTensorSharding = popart.ReplicatedTensorSharding.On
        if num_total_replicas > num_local_replicas:
            locationSetting.location.shardingDomain = popart.CommGroup(
                popart.CommGroupType.Consecutive, num_local_replicas)
        setattr(opts, userOption, locationSetting)

    if args.optim == "Adam":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.01, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "defaultEps": (1e-06, False),
                "defaultWeightDecay": (0.1, False),
                "lossScaling": (10, False),
            },
            weight_decay_mode=popart.WeightDecayMode.Decay,
            mode=popart.AdamMode.LambNoBias)
    if args.optim == "SGD":
        optimizer = popart.ConstSGD(0.01)

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     deviceInfo=deviceInfo,
                                     userOptions=opts,
                                     loss=loss,
                                     optimizer=optimizer)

    session.prepareDevice()

    session.weightsFromHost()

    anchors = session.initAnchorArrays()

    stepio = popart.PyStepIO(data, anchors)

    session.run(stepio)

    tmp_path = Path(args.tmpdir)
    tmp_path.mkdir(parents=True, exist_ok=True)
    file_path = str(tmp_path / args.filename)
    session.modelToHost(file_path)
    post_proto = onnx.load(file_path)
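
To inspect the trained tensors in the saved model, the ONNX initializers can be converted back to numpy arrays, for example to compare runs with and without replicated tensor sharding:

    from onnx import numpy_helper

    for initializer in post_proto.graph.initializer:
        print(initializer.name, numpy_helper.to_array(initializer).shape)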
Example #11
    def ipu_prog(num_replicas, gradient_accumulation):
        import logging
        import sys
        logging.basicConfig(stream=sys.stdout, level=logging.INFO)
        popdist_on = popdist.isPopdistEnvSet()

        num_global_replicas = (popdist.getNumTotalReplicas()
                               if popdist_on else num_replicas)
        num_instances = popdist.getNumInstances() if popdist_on else 1

        dataset_size = global_batch_size = 16
        micro_batch_size = int(global_batch_size / num_global_replicas /
                               gradient_accumulation)

        X = np.arange(1, dataset_size + 1, 1, dtype=float)
        Y = [0] * dataset_size
        ds = tf.data.Dataset.from_tensor_slices((X, Y))
        if popdist_on:
            ds = ds.shard(num_instances, index=popdist.getInstanceIndex())
        ds = ds.batch(micro_batch_size, drop_remainder=True)
        ds = ds.repeat()

        cfg = ipu.config.IPUConfig()
        if popdist_on:
            cfg = popdist.tensorflow.set_ipu_config(
                cfg,
                ipus_per_replica=popdist.getNumIpusPerReplica(),
                configure_device=True)
            hvd.init()
        else:
            cfg.auto_select_ipus = num_global_replicas
        cfg.configure_ipu_system()

        strategy = (popdist_strategy.PopDistStrategy()
                    if popdist_on else ipu.ipu_strategy.IPUStrategy())

        with strategy.scope():

            def get_model():
                input_layer = tf.keras.Input(shape=1)
                kernel_initializer = tf.keras.initializers.Constant(1)
                x = tf.keras.layers.Dense(
                    1, use_bias=False,
                    kernel_initializer=kernel_initializer)(input_layer)
                return tf.keras.Model(input_layer, x)

            model = get_model()
            model.set_gradient_accumulation_options(
                gradient_accumulation_steps_per_replica=gradient_accumulation)
            model.build(input_shape=(micro_batch_size, 1))

            if popdist_on:

                def gradient_normalizer(grads_and_vars):
                    return [(grad / gradient_accumulation, var)
                            for grad, var in grads_and_vars]
            else:

                def gradient_normalizer(grads_and_vars):
                    return [
                        (grad / num_global_replicas / gradient_accumulation,
                         var) for grad, var in grads_and_vars
                    ]

            optimizer = tf.keras.optimizers.SGD(
                learning_rate=1.0, gradient_transformers=[gradient_normalizer])

            loss_class = tf.keras.losses.MeanSquaredError
            loss_outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()
            loss_class = wrap_loss_in_enqueuer(loss_class, loss_outfeed_queue)
            loss = loss_class()

            micro_batches_per_weight_update = num_global_replicas * gradient_accumulation
            steps_per_execution = dataset_size // (
                micro_batch_size * micro_batches_per_weight_update
            ) * micro_batches_per_weight_update

            model.compile(optimizer=optimizer,
                          loss=loss,
                          metrics=[tf.keras.losses.MSE],
                          steps_per_execution=steps_per_execution)

            callbacks = [
                OutFeedQueueCallback(queue=loss_outfeed_queue,
                                     name='average_loss')
            ]
            if num_instances > 1:
                callbacks += [AllReduceMetricsCallback()]
            callbacks += [LoggingCallback(1)]

            model.fit(ds,
                      steps_per_epoch=steps_per_execution,
                      callbacks=callbacks)

            return model.get_weights()[0][0][0]
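
A possible invocation of the program above; under poprun the num_replicas argument is superseded by the popdist values, otherwise it sets the number of auto-selected IPUs directly:

    final_weight = ipu_prog(num_replicas=2, gradient_accumulation=4)
    print(f"Trained weight after one epoch: {final_weight}")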