def set_distribution_defaults(opts):
    """Fill in the distribution-related entries of ``opts`` in place.

    Three mutually exclusive modes are handled:

    * ``opts['distributed']``: the cluster is read through
      ``TFConfigClusterResolver`` and the worker index may be overridden by
      the ``OMPI_COMM_WORLD_RANK`` environment variable.
    * ``opts['use_popdist']``: worker count and index are derived from the
      popdist replica counts.
    * neither: a single worker with index 0 and no cluster.

    In every mode ``distributed_worker_count``, ``distributed_worker_index``
    and ``distributed_cluster`` are written to ``opts``. The two distributed
    modes also append a section to ``opts['summary_str']``; the
    ``{...}`` placeholders in those lines are appended verbatim (they are
    plain strings, not f-strings).

    Args:
        opts: Mutable options mapping. Must contain ``'distributed'`` and
            ``'use_popdist'``; ``'summary_str'`` is required when either of
            them is truthy.

    Raises:
        ValueError: If both ``'distributed'`` and ``'use_popdist'`` are set,
            or if ``OMPI_COMM_WORLD_RANK`` is set to a non-integer value.
    """
    if opts['distributed'] and opts['use_popdist']:
        raise ValueError("Cannot use popdist with --distributed")

    if opts['distributed']:
        # Read the cluster config from the `TF_CONFIG` environment variable
        cluster = tf.distribute.cluster_resolver.TFConfigClusterResolver()

        # Allow `mpirun` to override the task index. The environment value is
        # a string, so convert it to an int to match the type the resolver
        # itself reports; when the variable is absent the resolver's own
        # value is left untouched.
        mpi_rank = os.getenv("OMPI_COMM_WORLD_RANK")
        if mpi_rank is not None:
            cluster.task_id = int(mpi_rank)
        cluster.task_type = "worker"

        opts['distributed_worker_count'] = cluster.cluster_spec().num_tasks(
            "worker")
        opts['distributed_worker_index'] = cluster.task_id
        opts['distributed_cluster'] = cluster.cluster_spec().as_dict()

        opts['summary_str'] += 'Distribution\n'
        opts['summary_str'] += ' Worker count: {distributed_worker_count}\n'
        opts['summary_str'] += ' Worker index: {distributed_worker_index}\n'
        opts['summary_str'] += ' Cluster: {distributed_cluster}\n'
    elif opts['use_popdist']:
        # One "worker" per popdist instance: total replicas divided by the
        # replicas handled locally.
        opts['distributed_worker_count'] = int(popdist.getNumTotalReplicas()
                                               / popdist.getNumLocalReplicas())
        opts['distributed_worker_index'] = int(
            popdist.getReplicaIndexOffset() / popdist.getNumLocalReplicas())
        opts['distributed_cluster'] = None

        opts['summary_str'] += 'Popdist\n'
        opts['summary_str'] += ' Process count: {distributed_worker_count}\n'
        opts['summary_str'] += ' Process index: {distributed_worker_index}\n'
    else:
        opts['distributed_worker_count'] = 1
        opts['distributed_worker_index'] = 0
        opts['distributed_cluster'] = None
def set_popdist_args(args):
    """Configure ``args`` for a popdist launch, or for a single process.

    Without a popdist environment the function records
    ``use_popdist=False``, ``popdist_size=1`` and ``popdist_rank=0`` and
    returns. Otherwise it initialises the PopART horovod extension,
    overwrites ``args.replication_factor`` with the local replica count,
    derives ``popdist_size`` / ``popdist_rank`` from the replica counts,
    appends a per-rank suffix to ``args.checkpoint_dir`` and passes
    ``MPI.COMM_WORLD`` to ``setup_comm``.

    Raises:
        RuntimeError: If a popdist environment is set together with
            ``args.inference``.
        ImportError: If importing or initialising ``horovod.popart`` raises
            ``ImportError``.
    """
    if not popdist.isPopdistEnvSet():
        # Single-process defaults.
        args.popdist_rank = 0
        args.popdist_size = 1
        args.use_popdist = False
        return

    if args.inference:
        raise RuntimeError("Distributed execution is only supported for training")

    try:
        import horovod.popart as hvd
        hvd.init()
    except ImportError:
        raise ImportError("Could not find the PopART horovod extension. "
                          "Please install the horovod .whl provided in the Poplar SDK.")

    args.use_popdist = True

    local_replicas = popdist.getNumLocalReplicas()
    requested = args.replication_factor
    # Only warn when the user asked for an explicit, different factor; the
    # value is overwritten either way.
    if requested > 1 and requested != local_replicas:
        logger.warning(f"Overwriting the local replication factor {args.replication_factor} to {local_replicas}")
    args.replication_factor = local_replicas

    args.popdist_size = popdist.getNumTotalReplicas() // popdist.getNumLocalReplicas()
    args.popdist_rank = popdist.getReplicaIndexOffset() // popdist.getNumLocalReplicas()

    # Each rank gets its own checkpoint directory name.
    rank_suffix = "_rank_" + str(args.popdist_rank)
    args.checkpoint_dir = args.checkpoint_dir + rank_suffix

    from mpi4py import MPI
    setup_comm(MPI.COMM_WORLD)
def init_popdist(args):
    """Initialise horovod and copy the popdist layout onto ``args``.

    Sets ``use_popdist``, ``replication_factor`` (local replica count),
    ``popdist_rank`` (instance index) and ``popdist_size`` (number of
    instances). A message is printed when the total replica count reported
    by popdist differs from the ``replication_factor`` passed in.
    """
    hvd.init()
    args.use_popdist = True

    total_replicas = popdist.getNumTotalReplicas()
    if total_replicas != args.replication_factor:
        # NOTE(review): the message reports the *total* replica count while
        # the value stored below is the *local* count - confirm intended.
        message = (f"The number of replicas is overridden by PopRun. "
                   f"The new value is {popdist.getNumTotalReplicas()}.")
        print(message)

    args.replication_factor = int(popdist.getNumLocalReplicas())
    args.popdist_size = popdist.getNumInstances()
    args.popdist_rank = popdist.getInstanceIndex()
def init_popdist(args):
    """Initialise horovod and copy the popdist layout onto ``args``.

    Sets ``use_popdist``, ``replicas`` (local replica count),
    ``popdist_rank`` (instance index), ``popdist_size`` (number of
    instances) and ``popdist_local_rank`` (horovod local rank). A warning is
    logged when the total replica count reported by popdist differs from the
    ``replicas`` value passed in.
    """
    hvd.init()
    args.use_popdist = True
    if popdist.getNumTotalReplicas() != args.replicas:
        # `logging.warn` is a deprecated alias that no longer exists in
        # recent Python versions; `logging.warning` is the supported call.
        logging.warning(f"The number of replicas is overridden by poprun. The new value is {popdist.getNumTotalReplicas()}.")
    args.replicas = int(popdist.getNumLocalReplicas())
    args.popdist_rank = popdist.getInstanceIndex()
    args.popdist_size = popdist.getNumInstances()
    args.popdist_local_rank = hvd.local_rank()
def set_popdist_args(args):
    """Record the popdist layout on ``args`` and validate it.

    When no popdist environment is set, only an info message is logged and
    ``args`` is left untouched. Otherwise ``use_popdist``,
    ``local_replication_factor``, ``num_instances`` and ``instance_idx`` are
    written to ``args`` and the configuration is checked.

    Raises:
        AssertionError: If the total replica count is not the product of the
            local replication factor and the number of instances.
        RuntimeError: If ``args.replication_factor`` differs from the total
            replica count, or ``args.samples_per_step`` is not divisible by
            the number of instances.
    """
    if not popdist.isPopdistEnvSet():
        logger.info("No PopRun detected. Using single instance training")
        return

    logger.info("PopRun is detected")
    args.use_popdist = True

    total_replicas = popdist.popdist_core.getNumTotalReplicas()
    args.local_replication_factor = popdist.getNumLocalReplicas()
    args.num_instances = popdist.popdist_core.getNumInstances()
    expected_total = args.local_replication_factor * args.num_instances
    assert total_replicas == expected_total
    args.instance_idx = popdist.popdist_core.getInstanceIndex()

    if args.replication_factor != total_replicas:
        raise RuntimeError(f"Replication factor({args.replication_factor}) "
                           f"should match popdist replication factor ({total_replicas})")

    leftover = args.samples_per_step % args.num_instances
    if leftover != 0:
        raise RuntimeError(f"The number of samples per step({args.samples_per_step}) "
                           f"has to be a integer multiple of the number of instances({args.num_instances})")
group = parser.add_argument_group("Pretraining options") # Add pretraining-specific command line options here. return parser if __name__ == '__main__': tf.logging.set_verbosity(tf.logging.ERROR) opts = make_global_options([add_pretraining_options]) opts['shards'] = ipu_utils.next_power_of_two( max(opts["device_mapping"]) + 1) if popdist.isPopdistEnvSet(): opts['use_popdist'] = True opts['replicas'] = popdist.getNumLocalReplicas() opts['total_replicas'] = popdist.getNumTotalReplicas() if opts['compile_only']: opts['select_ipu'] = None else: opts['select_ipu'] = popdist.getDeviceId() else: opts['use_popdist'] = False opts['total_replicas'] = opts['replicas'] opts['select_ipu'] = None set_defaults(opts) set_poplar_engine_options(execution_profile=opts['execution_profile'], memory_profile=opts['memory_profile'], profile_dir=str(opts['profile_dir']),
help="data path for test") parser.add_argument("--init_weight", type=str, default="./ckpt_init/yolov3_coco_converted.fp16.ckpt", help="ckpt init weight") arguments = parser.parse_args() with open(arguments.config) as f: opts = json.load(f) opts['train']['annot_path'] = arguments.train_path opts['train']['initial_weight'] = arguments.init_weight opts['test']['annot_path'] = arguments.test_path if popdist.isPopdistEnvSet(): opts["use_popdist"] = True opts["train"]["replicas"] = popdist.getNumLocalReplicas() opts["train"]["total_replicas"] = popdist.getNumTotalReplicas() opts["select_ipu"] = popdist.getDeviceId( len(opts["train"]["device_mapping"])) opts["distributed_worker_count"] = int(popdist.getNumTotalReplicas() / popdist.getNumLocalReplicas()) opts["distributed_worker_index"] = int( popdist.getReplicaIndexOffset() / popdist.getNumLocalReplicas()) opts["use_popdist"] = True else: opts["use_popdist"] = False opts["train"]["total_replicas"] = opts["train"]["replicas"] opts["select_ipu"] = -1 opts["distributed_worker_count"] = 1 opts["distributed_worker_index"] = 0
def bert_session_options(args, model):
    """Build the ``popart.SessionOptions`` object for a run described by ``args``.

    A fresh ``popart.SessionOptions`` is created and filled in from the
    attributes of ``args``. Engine-level settings are collected in a local
    dict and attached as ``options.engineOptions`` near the end.

    Args:
        args: Object whose attributes drive every setting below (for example
            ``replication_factor``, ``pipeline``, ``engine_cache``,
            ``synthetic_data``).
        model: Not referenced anywhere in this function body.

    Returns:
        The configured ``popart.SessionOptions`` instance.
    """
    # Engine options are gathered here and assigned to the session options
    # in one go further down.
    engine_options = {}
    options = popart.SessionOptions()
    options.virtualGraphMode = popart.VirtualGraphMode.Manual
    options.enableFloatingPointChecks = args.floating_point_exceptions
    options.enableStochasticRounding = args.stochastic_rounding
    options.enablePrefetchDatastreams = not args.minimum_latency_inference

    # These options are necessary to allow poplar to overlap processing of
    # multiple iterations in the host side
    options.defaultPrefetchBufferingDepth = 3
    options.rearrangeAnchorsOnHost = False
    engine_options["exchange.streamBufferOverlap"] = "hostRearrangeOnly"

    options.enableOutlining = not args.no_outlining
    options.subgraphCopyingStrategy = popart.SubgraphCopyingStrategy.JustInTime
    # The same partials type is applied to matmuls and convolutions.
    partials_type = "half" if args.enable_half_partials else "float"
    options.partialsTypeMatMuls = partials_type
    options.convolutionOptions = {'partialsType': partials_type}
    if args.replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = args.replication_factor
        engine_options["target.syncReplicasIndependently"] = "true"
    if args.use_popdist:
        popdist.popart.configureSessionOptions(options)
    # Increasing the outlineThreshold prevents creating subgraphs of cheap Ops
    # such as add or reshapeInplace.
    # Instead only reusing ops with a highSubgraphValue such as matmul or normalisation.
    options.outlineThreshold = 10.0
    if args.pipeline:
        options.enablePipelining = True
        options.autoRecomputation = popart.RecomputationType.Pipeline
        # Applies only when at least one IPU holds more than one layer.
        if args.recompute_checkpoint_every_layer and any(
                map(lambda l: l > 1, args.layers_per_ipu)):
            options.scheduleNonWeightUpdateGradientConsumersEarly = True

    options.optimizerStateTensorLocationSettings = bert_optimizer_location_settings(
        args)

    # RTS to shard optimizer states with multiple IPU Pods
    # popdist is queried here regardless of the value of args.use_popdist.
    num_local_replicas = popdist.getNumLocalReplicas()
    num_total_replicas = popdist.getNumTotalReplicas()

    if num_total_replicas > num_local_replicas and args.replicated_tensor_sharding:
        # Fewer elements would not make sense to shard
        options.optimizerStateTensorLocationSettings.minElementsForReplicatedTensorSharding = num_local_replicas
        sharding_domain = popart.CommGroup(popart.CommGroupType.Consecutive,
                                           num_local_replicas)

        # Ensure all related tensors have the same sharding domain set
        options.weightTensorLocationSettings.location.shardingDomain = sharding_domain
        options.optimizerStateTensorLocationSettings.location.shardingDomain = sharding_domain
        options.accumulatorTensorLocationSettings.location.shardingDomain = sharding_domain

    # Any reduction type containing "Mean" selects mean reduction; the
    # "RunningMean" value additionally switches the strategy from Post to
    # Running.
    if "Mean" in args.gradient_reduction_type:
        options.accumulationAndReplicationReductionType = popart.ReductionType.Mean
        options.meanAccumulationAndReplicationReductionStrategy = popart.MeanReductionStrategy.Post
        if args.gradient_reduction_type == "RunningMean":
            options.meanAccumulationAndReplicationReductionStrategy = popart.MeanReductionStrategy.Running

    if args.gradient_accumulation_factor > 1:
        options.enableGradientAccumulation = True
        options.accumulationFactor = args.gradient_accumulation_factor

        # When not replicated SyncPattern.SinglePipeline will provide better overlap
        # than this option.
        if device_is_replicated(args):
            if args.optimizer_state_offchip:
                options.accumulateOuterFragmentSettings = popart.AccumulateOuterFragmentSettings(
                    popart.AccumulateOuterFragmentSchedule.
                    OverlapMemoryOptimized, [0])
            elif args.replicated_tensor_sharding:
                # With OnChip + RTS this will cluster optimizer steps into
                # schedule bins. Improving outlining and scheduling time.
                options.accumulateOuterFragmentSettings = popart.AccumulateOuterFragmentSettings(
                    popart.AccumulateOuterFragmentSchedule.
                    OverlapMemoryOptimized)

    if args.engine_cache is not None:
        options.enableEngineCaching = True
        options.cachePath = args.engine_cache
    # Profiling switches engine caching off again, even if a cache path was
    # supplied above.
    if args.profile:
        options.enableEngineCaching = False
    options.instrumentWithHardwareCycleCounter = args.report_hw_cycle_count
    options.disableGradAccumulationTensorStreams = not args.save_initializers_externally
    # A value of -1 means no copy merge size limit is passed to the engine.
    if args.max_copy_merge_size == -1:
        logger.debug("No copy merge size limit applied")
    else:
        logger.warning(
            f"Copy merge size limit set to {args.max_copy_merge_size}")
        engine_options["opt.maxCopyMergeSize"] = str(args.max_copy_merge_size)

    # Adding {"fullyConnectedPass", "TRAINING_BWD"} to some matmuls causes large
    # transposes before operations.
    if args.disable_fully_connected_pass:
        if args.task == "SQUAD" and args.sequence_length == 384:
            logger.warning(
                "Fully connected pass has been disabled. This may cause SQuAD 384 12-layer to go OOM."
            )
        options.enableFullyConnectedPass = False

    # Warning only: no option is changed by this check.
    if args.inference and args.engine_cache is not None and not args.variable_weights_inference:
        logger.warning(
            "Using engine cache with constant weights. Checkpoint weights will be ignored. "
            "Use the `--variable-weights-inference` flag if checkpoint weights should be used."
        )

    if args.variable_weights_inference:
        options.constantWeights = False

    if args.group_host_syncs:
        options.groupHostSync = True

    if args.internal_exchange_optimisation_target is not None:
        engine_options["opt.internalExchangeOptimisationTarget"] = str(
            args.internal_exchange_optimisation_target)

    # All engine options collected above are attached here; entries added to
    # the dict after this point would depend on how popart stores the value.
    options.engineOptions = engine_options

    # Set synthetic data mode (if active)
    if args.synthetic_data:
        # Any initializer other than "zeros" falls back to RandomNormal.
        if args.synthetic_data_initializer == "zeros":
            options.syntheticDataMode = popart.SyntheticDataMode.Zeros
        else:
            options.syntheticDataMode = popart.SyntheticDataMode.RandomNormal
        logger.info(
            f"Running with Synthetic Data Type '{options.syntheticDataMode}'")
    return options
def replicated_tensor_sharding_core(): parser = argparse.ArgumentParser(description="Parse launch parameters.") parser.add_argument("--tensors", nargs="*") parser.add_argument("--optim", nargs="?") parser.add_argument("--tmpdir", nargs="?") parser.add_argument("--filename", nargs="?") parser.add_argument("--compute_batch", nargs="?") args = parser.parse_args(sys.argv[2:]) ipus_per_replica = 1 batches_per_step = 10 accumulation_factor = 4 compute_batch = int(args.compute_batch) hidden_size = 4 reduction = popart.ReductionType.Sum deviceInfo = popdist.popart.getDevice(ipus_per_replica) num_local_replicas = popdist.getNumLocalReplicas() num_total_replicas = popdist.getNumTotalReplicas() builder = popart.Builder() np.random.seed(12321) weight_data = np.random.rand(hidden_size, hidden_size).astype(np.float32) input_data = [] label_data = [] for i in range( 0, batches_per_step * num_local_replicas * accumulation_factor * compute_batch): np.random.seed(popdist.getInstanceIndex() + i * popdist.getNumInstances()) input_data += [np.random.rand(hidden_size).astype(np.float32)] label_data += [np.random.randint(0, hidden_size, size=1)] input_data = np.concatenate(input_data) label_data = np.concatenate(label_data) builder = popart.Builder() d0 = builder.addInputTensor( popart.TensorInfo("FLOAT", (compute_batch, hidden_size)), "d0") l0 = builder.addInputTensor(popart.TensorInfo("UINT32", (compute_batch, )), "l0") data = {} data[d0] = input_data.reshape((batches_per_step, num_local_replicas, accumulation_factor, compute_batch, -1)) w0 = builder.addInitializedInputTensor(weight_data, 'weight0') x = builder.aiOnnx.matmul([d0, w0]) x = builder.aiOnnx.softmax([x]) data[l0] = label_data.reshape((batches_per_step, num_local_replicas, accumulation_factor, compute_batch, -1))\ .astype(np.uint32) loss = builder.aiGraphcore.nllloss([x, l0], reduction=reduction, debugContext='loss') proto = builder.getModelProto() dataFlow = popart.DataFlow( batches_per_step, {av: 
popart.AnchorReturnType("ALL") for av in [x, loss]}) opts = popart.SessionOptions() if accumulation_factor > 1: opts.enableGradientAccumulation = True opts.accumulationFactor = accumulation_factor opts.explicitRecomputation = True opts.enableExplicitMainLoops = True opts.useHostCopyOps = True # Let popdist handle distributed settings, such as: # opts.enableDistributedReplicatedGraphs # opts.globalReplicaOffset # opts.globalReplicationFactor popdist.popart.configureSessionOptions(opts) for tensor in ["weight", "optimizerState", "accumulator"]: userOption = tensor + "TensorLocationSettings" print( f"Setting RTS: {userOption}, num_total_replicas: {num_total_replicas} num_local_replicas: {num_local_replicas}" ) locationSetting = getattr(opts, userOption) locationSetting.minElementsForOffChip = 0 locationSetting.minElementsForReplicatedTensorSharding = num_total_replicas if tensor in args.tensors: locationSetting.location.replicatedTensorSharding = popart.ReplicatedTensorSharding.On if num_total_replicas > num_local_replicas: locationSetting.location.shardingDomain = popart.CommGroup( popart.CommGroupType.Consecutive, num_local_replicas) setattr(opts, userOption, locationSetting) if args.optim == "Adam": optimizer = popart.Adam( { "defaultLearningRate": (0.01, False), "defaultBeta1": (0.9, False), "defaultBeta2": (0.999, False), "defaultEps": (1e-06, False), "defaultWeightDecay": (0.1, False), "lossScaling": (10, False), }, weight_decay_mode=popart.WeightDecayMode.Decay, mode=popart.AdamMode.LambNoBias) if args.optim == "SGD": optimizer = popart.ConstSGD(0.01) session = popart.TrainingSession(fnModel=proto, dataFlow=dataFlow, deviceInfo=deviceInfo, userOptions=opts, loss=loss, optimizer=optimizer) session.prepareDevice() session.weightsFromHost() anchors = session.initAnchorArrays() stepio = popart.PyStepIO(data, anchors) session.run(stepio) tmp_path = Path(args.tmpdir) tmp_path.mkdir(parents=True, exist_ok=True) file_path = str(tmp_path / args.filename) 
session.modelToHost(file_path) post_proto = onnx.load(file_path)