def __iter__(self):
    worker_info = torch.utils.data.get_worker_info()
    if worker_info is not None:
        if popdist.isPopdistEnvSet():
            self.worker_id = worker_info.id + worker_info.num_workers * popdist.getInstanceIndex()
            self.shard = (worker_info.id + worker_info.num_workers * popdist.getInstanceIndex(),
                          worker_info.num_workers * popdist.getNumInstances())
        else:
            self.worker_id = worker_info.id
            self.shard = worker_info.id, worker_info.num_workers
    else:
        self.shard = None
    self.reset()
    if self.shuffle:
        np.random.shuffle(self.files)
    return self
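# Hypothetical sketch of how the (index, num_shards) tuple set above could be
# consumed to partition the file list; `shard_files` is illustrative and not
# part of the original dataset class.
def shard_files(files, shard):
    if shard is None:
        return files
    index, num_shards = shard
    # Each (instance, worker) pair takes every num_shards-th file, starting at
    # its own global index, so the shards are disjoint and cover all files.
    return files[index::num_shards]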
def init_popdist(args):
    hvd.init()
    args.use_popdist = True
    if popdist.getNumTotalReplicas() != args.replication_factor:
        print(f"The number of replicas is overridden by PopRun. "
              f"The new value is {popdist.getNumTotalReplicas()}.")
    args.replication_factor = int(popdist.getNumLocalReplicas())
    args.popdist_rank = popdist.getInstanceIndex()
    args.popdist_size = popdist.getNumInstances()
def init_popdist(args):
    hvd.init()
    args.use_popdist = True
    if popdist.getNumTotalReplicas() != args.replicas:
        logging.warning(f"The number of replicas is overridden by poprun. "
                        f"The new value is {popdist.getNumTotalReplicas()}.")
    args.replicas = int(popdist.getNumLocalReplicas())
    args.popdist_rank = popdist.getInstanceIndex()
    args.popdist_size = popdist.getNumInstances()
    args.popdist_local_rank = hvd.local_rank()
def set_distribution_defaults(opts):
    if opts['use_popdist']:
        opts['distributed_worker_count'] = popdist.getNumInstances()
        opts['distributed_worker_index'] = popdist.getInstanceIndex()
    else:
        opts['distributed_worker_count'] = 1
        opts['distributed_worker_index'] = 0
    if opts['distributed_worker_index'] != 0 and not opts['log_all_workers']:
        logger.setLevel(logging.ERROR)
def set_distribution_defaults(opts):
    if opts['use_popdist']:
        opts['distributed_worker_count'] = popdist.getNumInstances()
        opts['distributed_worker_index'] = popdist.getInstanceIndex()
        opts['distributed_cluster'] = None
        opts['summary_str'] += 'Popdist\n'
        opts['summary_str'] += ' Process count: {distributed_worker_count}\n'
        opts['summary_str'] += ' Process index: {distributed_worker_index}\n'
    else:
        opts['distributed_worker_count'] = 1
        opts['distributed_worker_index'] = 0
        opts['distributed_cluster'] = None
def set_distribution_defaults(opts):
    if opts['use_popdist']:
        opts['distributed_worker_count'] = popdist.getNumInstances()
        opts['distributed_worker_index'] = popdist.getInstanceIndex()
        opts['summary_str'] += 'Popdist\n'
        opts['summary_str'] += ' Process count: {distributed_worker_count}\n'
        opts['summary_str'] += ' Process index: {distributed_worker_index}\n'
    else:
        opts['distributed_worker_count'] = 1
        opts['distributed_worker_index'] = 0
    if opts['distributed_worker_index'] != 0 and not opts['log_all_workers']:
        logger.setLevel(logging.ERROR)
def get_dataset_from_directory(ds_path: str, split: str, seed=42):
    if not os.path.exists(ds_path):
        DataGenerator.logger.error(f'{ds_path} does not exist')
        raise NameError(f'Directory {ds_path} does not exist')
    builder = tfds.folder_dataset.ImageFolder(ds_path)
    info_ds = builder.info
    ds = builder.as_dataset(as_supervised=True, split=split)
    if not isinstance(ds, tf.data.Dataset):
        raise UnsupportedFormat(
            f'Type of ds is not the one expected (tf.data.Dataset): {type(ds)}')
    num_examples = DataGenerator.evaluate_size_dataset(ds)
    iterator = iter(ds)
    first_elem = iterator.get_next()
    if len(first_elem[0].shape) != 3:
        raise DimensionError(
            f'Dataset input feature should have at least 3 dimensions (h, w, c) '
            f'but it has {len(first_elem[0].shape)}')
    img_shape = first_elem[0].shape
    num_classes = -1
    if len(info_ds.supervised_keys) == 2:
        label = info_ds.supervised_keys[1]
        num_classes = info_ds.features[label].num_classes
    else:
        raise UnsupportedFormat(
            f'This function only handles datasets like (features, labels), '
            f'not {info_ds.supervised_keys}')
    print(f'img shape {img_shape}, number of examples {num_examples}, '
          f'number of classes {num_classes}')
    # Shard the dataset between instances when running under poprun.
    if popdist.getNumInstances() > 1:
        ds = ds.shard(num_shards=popdist.getNumInstances(),
                      index=popdist.getInstanceIndex())
    return ds, img_shape, num_examples, num_classes
def get_imagenet(path: str, split: str, cycle_length: int = 4, block_length: int = 4):
    # The path points to the dataset in TFRecord format.
    if not os.path.exists(path):
        DataGenerator.logger.error(f'{path} does not exist')
        raise NameError(f'Directory {path} does not exist')
    if split == 'train':
        filenames = glob.glob1(path, 'train*')
        if len(filenames) != 1024:
            DataGenerator.logger.error(
                f'train directory should contain 1024 tf-record files '
                f'but it contains {len(filenames)} instead')
            raise ValueError(
                f'train directory should contain 1024 tf-record files '
                f'but it contains {len(filenames)} instead')
    else:
        filenames = glob.glob1(path, 'validation*')
        if len(filenames) != 128:
            DataGenerator.logger.error(
                f'validation directory should contain 128 tf-record files '
                f'but it contains {len(filenames)} instead')
            raise ValueError(
                f'validation directory should contain 128 tf-record files '
                f'but it contains {len(filenames)} instead')
    num_files = len(filenames)
    filenames = list(map(lambda filename: os.path.join(path, filename), filenames))
    DataGenerator.logger.debug(f'filenames = {filenames}')
    ds = tf.data.Dataset.from_tensor_slices(filenames)
    if split == 'train':
        # Shuffle the input files
        ds = ds.shuffle(buffer_size=num_files)
    # Shard the files between instances when running under poprun.
    if popdist.getNumInstances() > 1:
        ds = ds.shard(num_shards=popdist.getNumInstances(),
                      index=popdist.getInstanceIndex())
    ds = ds.interleave(tf.data.TFRecordDataset,
                       cycle_length=cycle_length,
                       block_length=block_length,
                       num_parallel_calls=cycle_length)
    DataGenerator.logger.info(f'dataset = {ds}')
    num_examples = IMAGENET_DS_SIZE[split]
    DataGenerator.logger.info(f'number of examples {num_examples}')
    iterator = iter(ds)
    first_elem = iterator.get_next()
    feature, _ = imagenet_processing.parse_record(first_elem, True, tf.float32)
    if len(feature.shape) != 3:
        raise DimensionError(
            f'Dataset input feature should have at least 3 dimensions (h, w, c) '
            f'but it has {len(feature.shape)}')
    num_classes = 1000
    ds = ds.cache()
    return ds, feature.shape, num_examples, num_classes
def logger(msg):
    if not popdist.isPopdistEnvSet() or popdist.getInstanceIndex() == 0:
        logging.info(msg)
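# Example: when every instance under poprun calls this helper, only instance 0
# actually writes the message, so logs are not duplicated once per instance.
# `step` is an illustrative variable, not part of the snippet above.
logger(f'finished step {step}')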
logging.warning(
    f'Replication factor given to poprun (=={popdist.getNumTotalReplicas()}) '
    f'does not match the config (=={num_replicas}). Poprun will override the config.')
num_replicas = popdist.getNumTotalReplicas()

max_threads_per_instance = os.cpu_count() // popdist.getNumInstances()
if pipeline_num_parallel > max_threads_per_instance:
    logging.warning(
        f'The number of chosen threads {pipeline_num_parallel} is bigger than the '
        f'total number of physical threads divided by the number of instances. '
        f'Poprun will override the config.')
    # Limit the maximum number of threads to the total number of physical
    # threads divided by the number of instances.
    pipeline_num_parallel = max_threads_per_instance

if popdist.getInstanceIndex() != 0:
    checkpoints = False
    log_to_wandb = False

# When neither option is specified, assume a gradient accumulation count of 1.
if gradient_accumulation_count is None and global_batch_size is None:
    gradient_accumulation_count = 1

if recomputation and not len(pipeline_splits):
    raise ValueError('Recomputation requires a pipelined model. '
                     'Make sure "--pipeline-splits" is defined')

if logs_per_epoch < 0:
    raise ValueError(
        f'--logs-per-epoch should be non-negative (>=0), it is {logs_per_epoch}')
def replicated_tensor_sharding_core():
    parser = argparse.ArgumentParser(description="Parse launch parameters.")
    parser.add_argument("--tensors", nargs="*")
    parser.add_argument("--optim", nargs="?")
    parser.add_argument("--tmpdir", nargs="?")
    parser.add_argument("--filename", nargs="?")
    parser.add_argument("--compute_batch", nargs="?")
    args = parser.parse_args(sys.argv[2:])

    ipus_per_replica = 1
    batches_per_step = 10
    accumulation_factor = 4
    compute_batch = int(args.compute_batch)
    hidden_size = 4
    reduction = popart.ReductionType.Sum

    deviceInfo = popdist.popart.getDevice(ipus_per_replica)
    num_local_replicas = popdist.getNumLocalReplicas()
    num_total_replicas = popdist.getNumTotalReplicas()

    builder = popart.Builder()

    np.random.seed(12321)
    weight_data = np.random.rand(hidden_size, hidden_size).astype(np.float32)

    input_data = []
    label_data = []
    for i in range(0, batches_per_step * num_local_replicas *
                   accumulation_factor * compute_batch):
        np.random.seed(popdist.getInstanceIndex() +
                       i * popdist.getNumInstances())
        input_data += [np.random.rand(hidden_size).astype(np.float32)]
        label_data += [np.random.randint(0, hidden_size, size=1)]
    input_data = np.concatenate(input_data)
    label_data = np.concatenate(label_data)

    d0 = builder.addInputTensor(
        popart.TensorInfo("FLOAT", (compute_batch, hidden_size)), "d0")
    l0 = builder.addInputTensor(
        popart.TensorInfo("UINT32", (compute_batch, )), "l0")

    data = {}
    data[d0] = input_data.reshape(
        (batches_per_step, num_local_replicas, accumulation_factor, compute_batch, -1))

    w0 = builder.addInitializedInputTensor(weight_data, 'weight0')
    x = builder.aiOnnx.matmul([d0, w0])
    x = builder.aiOnnx.softmax([x])

    data[l0] = label_data.reshape(
        (batches_per_step, num_local_replicas, accumulation_factor,
         compute_batch, -1)).astype(np.uint32)
    loss = builder.aiGraphcore.nllloss([x, l0],
                                       reduction=reduction,
                                       debugContext='loss')

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(
        batches_per_step,
        {av: popart.AnchorReturnType("ALL") for av in [x, loss]})

    opts = popart.SessionOptions()
    if accumulation_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accumulation_factor
    opts.explicitRecomputation = True
    opts.enableExplicitMainLoops = True
    opts.useHostCopyOps = True

    # Let popdist handle distributed settings, such as:
    # opts.enableDistributedReplicatedGraphs
    # opts.globalReplicaOffset
    # opts.globalReplicationFactor
    popdist.popart.configureSessionOptions(opts)

    for tensor in ["weight", "optimizerState", "accumulator"]:
        userOption = tensor + "TensorLocationSettings"
        print(f"Setting RTS: {userOption}, "
              f"num_total_replicas: {num_total_replicas} "
              f"num_local_replicas: {num_local_replicas}")
        locationSetting = getattr(opts, userOption)
        locationSetting.minElementsForOffChip = 0
        locationSetting.minElementsForReplicatedTensorSharding = num_total_replicas
        if tensor in args.tensors:
            locationSetting.location.replicatedTensorSharding = popart.ReplicatedTensorSharding.On
            if num_total_replicas > num_local_replicas:
                locationSetting.location.shardingDomain = popart.CommGroup(
                    popart.CommGroupType.Consecutive, num_local_replicas)
        setattr(opts, userOption, locationSetting)

    if args.optim == "Adam":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.01, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "defaultEps": (1e-06, False),
                "defaultWeightDecay": (0.1, False),
                "lossScaling": (10, False),
            },
            weight_decay_mode=popart.WeightDecayMode.Decay,
            mode=popart.AdamMode.LambNoBias)
    if args.optim == "SGD":
        optimizer = popart.ConstSGD(0.01)
session = popart.TrainingSession(fnModel=proto,
                                 dataFlow=dataFlow,
                                 deviceInfo=deviceInfo,
                                 userOptions=opts,
                                 loss=loss,
                                 optimizer=optimizer)
session.prepareDevice()
session.weightsFromHost()
anchors = session.initAnchorArrays()
stepio = popart.PyStepIO(data, anchors)
session.run(stepio)

tmp_path = Path(args.tmpdir)
tmp_path.mkdir(parents=True, exist_ok=True)
file_path = str(tmp_path / args.filename)
session.modelToHost(file_path)
post_proto = onnx.load(file_path)
def ipu_prog(num_replicas, gradient_accumulation):
    import logging
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    popdist_on = popdist.isPopdistEnvSet()

    num_global_replicas = popdist.getNumTotalReplicas() if popdist_on else num_replicas
    num_instances = popdist.getNumInstances() if popdist_on else 1

    dataset_size = global_batch_size = 16
    micro_batch_size = int(global_batch_size / num_global_replicas / gradient_accumulation)

    X = np.arange(1, dataset_size + 1, 1, dtype=float)
    Y = [0] * dataset_size
    ds = tf.data.Dataset.from_tensor_slices((X, Y))
    if popdist_on:
        ds = ds.shard(num_instances, index=popdist.getInstanceIndex())
    ds = ds.batch(micro_batch_size, drop_remainder=True)
    ds = ds.repeat()

    cfg = ipu.config.IPUConfig()
    if popdist_on:
        cfg = popdist.tensorflow.set_ipu_config(
            cfg,
            ipus_per_replica=popdist.getNumIpusPerReplica(),
            configure_device=True)
        hvd.init()
    else:
        cfg.auto_select_ipus = num_global_replicas
    cfg.configure_ipu_system()

    strategy = popdist_strategy.PopDistStrategy() if popdist_on else ipu.ipu_strategy.IPUStrategy()

    with strategy.scope():

        def get_model():
            input_layer = tf.keras.Input(shape=1)
            kernel_initializer = tf.keras.initializers.Constant(1)
            x = tf.keras.layers.Dense(
                1, use_bias=False,
                kernel_initializer=kernel_initializer)(input_layer)
            return tf.keras.Model(input_layer, x)

        model = get_model()
        model.set_gradient_accumulation_options(
            gradient_accumulation_steps_per_replica=gradient_accumulation)
        model.build(input_shape=(micro_batch_size, 1))

        if popdist_on:
            def gradient_normalizer(grads_and_vars):
                return [(grad / gradient_accumulation, var)
                        for grad, var in grads_and_vars]
        else:
            def gradient_normalizer(grads_and_vars):
                return [(grad / num_global_replicas / gradient_accumulation, var)
                        for grad, var in grads_and_vars]

        optimizer = tf.keras.optimizers.SGD(
            learning_rate=1.0, gradient_transformers=[gradient_normalizer])

        loss_class = tf.keras.losses.MeanSquaredError
        loss_outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()
        loss_class = wrap_loss_in_enqueuer(loss_class, loss_outfeed_queue)
        loss = loss_class()

        micro_batches_per_weight_update = num_global_replicas * gradient_accumulation
        steps_per_execution = dataset_size // (
            micro_batch_size * micro_batches_per_weight_update
        ) * micro_batches_per_weight_update

        model.compile(optimizer=optimizer,
                      loss=loss,
                      metrics=[tf.keras.losses.MSE],
                      steps_per_execution=steps_per_execution)

        callbacks = [
            OutFeedQueueCallback(queue=loss_outfeed_queue, name='average_loss')
        ]
        if num_instances > 1:
            callbacks += [AllReduceMetricsCallback()]
        callbacks += [LoggingCallback(1)]

        model.fit(ds, steps_per_epoch=steps_per_execution, callbacks=callbacks)

    return model.get_weights()[0][0][0]
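# A quick check of the batch-size bookkeeping in ipu_prog, with illustrative
# numbers only (4 total replicas, gradient accumulation 2):
global_batch_size = 16
num_global_replicas = 4
gradient_accumulation = 2
micro_batch_size = int(global_batch_size / num_global_replicas / gradient_accumulation)
assert micro_batch_size == 2
# One weight update consumes num_global_replicas * gradient_accumulation = 8
# micro batches, which is the micro_batches_per_weight_update value above.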
    return args


def benchmark_throughput(dataloader, iteration=2):
    for _ in range(iteration):
        total_sample_size = 0
        start_time = time.perf_counter()
        for input_data, _ in tqdm(dataloader, total=len(dataloader)):
            total_sample_size += input_data.size()[0]
        elapsed_time = time.perf_counter() - start_time

        if popdist.isPopdistEnvSet():
            elapsed_time, total_sample_size = utils.synchronize_throughput_values(
                elapsed_time,
                total_sample_size,
            )

        iteration_throughput = total_sample_size / elapsed_time
        print(f"Throughput of the iteration: {iteration_throughput:0.1f} img/sec")


if __name__ == '__main__':
    args = get_args()
    opts = poptorch.Options()
    if popdist.isPopdistEnvSet():
        hvd.init()
        opts.Distributed.configureProcessId(popdist.getInstanceIndex(),
                                            popdist.getNumInstances())
    opts.randomSeed(0)
    dataloader = get_data(args, opts, train=True,
                          async_dataloader=not args.disable_async_loading)
    benchmark_throughput(dataloader)
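# `utils.synchronize_throughput_values` is defined elsewhere in the repo. Below
# is a minimal sketch of what such a helper might do, assuming Horovod's PyTorch
# API (`hvd.allreduce` with `op=hvd.Sum` / `op=hvd.Average`); the name and the
# reduction choices are illustrative, not the repo's actual implementation.
import horovod.torch as hvd
import torch

def synchronize_throughput_values(elapsed_time, total_sample_size):
    # Sum the per-instance sample counts and average the per-instance elapsed
    # times so the printed throughput reflects the whole distributed job.
    total_samples = hvd.allreduce(
        torch.tensor(total_sample_size, dtype=torch.float64), op=hvd.Sum).item()
    mean_elapsed = hvd.allreduce(
        torch.tensor(elapsed_time, dtype=torch.float64), op=hvd.Average).item()
    return mean_elapsed, int(total_samples)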