def __iter__(self):
    worker_info = torch.utils.data.get_worker_info()
    if worker_info is not None:
        if popdist.isPopdistEnvSet():
            self.worker_id = worker_info.id + worker_info.num_workers * popdist.getInstanceIndex()
            self.shard = (worker_info.id + worker_info.num_workers * popdist.getInstanceIndex(),
                          worker_info.num_workers * popdist.getNumInstances())
        else:
            self.worker_id = worker_info.id
            self.shard = worker_info.id, worker_info.num_workers
    else:
        self.shard = None
    self.reset()
    if self.shuffle:
        np.random.shuffle(self.files)
    return self
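# Hypothetical sketch of how the (index, num_shards) tuple set above could be
# consumed to partition the file list; `shard_files` is illustrative and not
# part of the original dataset class.
def shard_files(files, shard):
    if shard is None:
        return files
    index, num_shards = shard
    # Each (instance, worker) pair takes every num_shards-th file, starting at
    # its own global index, so the shards are disjoint and cover all files.
    return files[index::num_shards]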
def init_popdist(args):
    hvd.init()
    args.use_popdist = True
    if popdist.getNumTotalReplicas() != args.replication_factor:
        print(f"The number of replicas is overridden by PopRun. "
              f"The new value is {popdist.getNumTotalReplicas()}.")
    args.replication_factor = int(popdist.getNumLocalReplicas())
    args.popdist_rank = popdist.getInstanceIndex()
    args.popdist_size = popdist.getNumInstances()
def init_popdist(args):
    hvd.init()
    args.use_popdist = True
    if popdist.getNumTotalReplicas() != args.replicas:
        logging.warning(f"The number of replicas is overridden by poprun. "
                        f"The new value is {popdist.getNumTotalReplicas()}.")
    args.replicas = int(popdist.getNumLocalReplicas())
    args.popdist_rank = popdist.getInstanceIndex()
    args.popdist_size = popdist.getNumInstances()
    args.popdist_local_rank = hvd.local_rank()
def set_distribution_defaults(opts):
    if opts['use_popdist']:
        opts['distributed_worker_count'] = popdist.getNumInstances()
        opts['distributed_worker_index'] = popdist.getInstanceIndex()
    else:
        opts['distributed_worker_count'] = 1
        opts['distributed_worker_index'] = 0
    if opts['distributed_worker_index'] != 0 and not opts['log_all_workers']:
        logger.setLevel(logging.ERROR)
def set_distribution_defaults(opts):
    if opts['use_popdist']:
        opts['distributed_worker_count'] = popdist.getNumInstances()
        opts['distributed_worker_index'] = popdist.getInstanceIndex()
        opts['distributed_cluster'] = None
        opts['summary_str'] += 'Popdist\n'
        opts['summary_str'] += ' Process count: {distributed_worker_count}\n'
        opts['summary_str'] += ' Process index: {distributed_worker_index}\n'
    else:
        opts['distributed_worker_count'] = 1
        opts['distributed_worker_index'] = 0
        opts['distributed_cluster'] = None
def set_distribution_defaults(opts):
    if opts['use_popdist']:
        opts['distributed_worker_count'] = popdist.getNumInstances()
        opts['distributed_worker_index'] = popdist.getInstanceIndex()
        opts['summary_str'] += 'Popdist\n'
        opts['summary_str'] += ' Process count: {distributed_worker_count}\n'
        opts['summary_str'] += ' Process index: {distributed_worker_index}\n'
    else:
        opts['distributed_worker_count'] = 1
        opts['distributed_worker_index'] = 0
    if opts['distributed_worker_index'] != 0 and not opts['log_all_workers']:
        logger.setLevel(logging.ERROR)
def get_dataset_from_directory(ds_path: str, split: str, seed=42):
    if not os.path.exists(ds_path):
        DataGenerator.logger.error(f'{ds_path} does not exist')
        raise NameError(f'Directory {ds_path} does not exist')
    builder = tfds.folder_dataset.ImageFolder(ds_path)
    info_ds = builder.info
    ds = builder.as_dataset(as_supervised=True, split=split)
    if not isinstance(ds, tf.data.Dataset):
        raise UnsupportedFormat(
            f'Type of ds is not the one expected (tf.data.Dataset): {type(ds)}')
    num_examples = DataGenerator.evaluate_size_dataset(ds)
    iterator = iter(ds)
    first_elem = iterator.get_next()
    if len(first_elem[0].shape) != 3:
        raise DimensionError(
            f'Dataset input feature should have at least 3 dimensions (h, w, c) '
            f'but it has {len(first_elem[0].shape)}')
    img_shape = first_elem[0].shape
    num_classes = -1
    if len(info_ds.supervised_keys) == 2:
        label = info_ds.supervised_keys[1]
        num_classes = info_ds.features[label].num_classes
    else:
        raise UnsupportedFormat(
            f'This function only handles datasets like (features, labels), '
            f'not {info_ds.supervised_keys}')
    print(f'img shape {img_shape}, number of examples {num_examples}, '
          f'number of classes {num_classes}')
    # Shard the dataset between instances when running under poprun.
    if popdist.getNumInstances() > 1:
        ds = ds.shard(num_shards=popdist.getNumInstances(),
                      index=popdist.getInstanceIndex())
    return ds, img_shape, num_examples, num_classes
def get_imagenet(path: str, split: str, cycle_length: int = 4, block_length: int = 4):
    # The path points to the dataset in TFRecord format.
    if not os.path.exists(path):
        DataGenerator.logger.error(f'{path} does not exist')
        raise NameError(f'Directory {path} does not exist')
    if split == 'train':
        filenames = glob.glob1(path, 'train*')
        if len(filenames) != 1024:
            DataGenerator.logger.error(
                f'train directory should contain 1024 tf-record files '
                f'but it contains {len(filenames)} instead')
            raise ValueError(
                f'train directory should contain 1024 tf-record files '
                f'but it contains {len(filenames)} instead')
    else:
        filenames = glob.glob1(path, 'validation*')
        if len(filenames) != 128:
            DataGenerator.logger.error(
                f'validation directory should contain 128 tf-record files '
                f'but it contains {len(filenames)} instead')
            raise ValueError(
                f'validation directory should contain 128 tf-record files '
                f'but it contains {len(filenames)} instead')
    num_files = len(filenames)
    filenames = list(map(lambda filename: os.path.join(path, filename), filenames))
    DataGenerator.logger.debug(f'filenames = {filenames}')
    ds = tf.data.Dataset.from_tensor_slices(filenames)
    if split == 'train':
        # Shuffle the input files
        ds = ds.shuffle(buffer_size=num_files)
    # Shard the files between instances when running under poprun.
    if popdist.getNumInstances() > 1:
        ds = ds.shard(num_shards=popdist.getNumInstances(),
                      index=popdist.getInstanceIndex())
    ds = ds.interleave(tf.data.TFRecordDataset,
                       cycle_length=cycle_length,
                       block_length=block_length,
                       num_parallel_calls=cycle_length)
    DataGenerator.logger.info(f'dataset = {ds}')
    num_examples = IMAGENET_DS_SIZE[split]
    DataGenerator.logger.info(f'number of examples {num_examples}')
    iterator = iter(ds)
    first_elem = iterator.get_next()
    feature, _ = imagenet_processing.parse_record(first_elem, True, tf.float32)
    if len(feature.shape) != 3:
        raise DimensionError(
            f'Dataset input feature should have at least 3 dimensions (h, w, c) '
            f'but it has {len(feature.shape)}')
    num_classes = 1000
    ds = ds.cache()
    return ds, feature.shape, num_examples, num_classes
def logger(msg):
    if not popdist.isPopdistEnvSet() or popdist.getInstanceIndex() == 0:
        logging.info(msg)
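# Example: when every instance under poprun calls this helper, only instance 0
# actually writes the message, so logs are not duplicated once per instance.
# `step` is an illustrative variable, not part of the snippet above.
logger(f'finished step {step}')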
logging.warning(
    f'Replication factor given to poprun (=={popdist.getNumTotalReplicas()}) '
    f'does not match the config (=={num_replicas}). Poprun will override the config.')
num_replicas = popdist.getNumTotalReplicas()

max_threads_per_instance = os.cpu_count() // popdist.getNumInstances()
if pipeline_num_parallel > max_threads_per_instance:
    logging.warning(
        f'The number of chosen threads {pipeline_num_parallel} is bigger than the '
        f'total number of physical threads divided by the number of instances. '
        f'Poprun will override the config.')
    # Limit the maximum number of threads to the total number of physical
    # threads divided by the number of instances.
    pipeline_num_parallel = max_threads_per_instance

if popdist.getInstanceIndex() != 0:
    checkpoints = False
    log_to_wandb = False

# When neither option is specified, assume a gradient accumulation count of 1.
if gradient_accumulation_count is None and global_batch_size is None:
    gradient_accumulation_count = 1

if recomputation and not len(pipeline_splits):
    raise ValueError('Recomputation requires a pipelined model. '
                     'Make sure "--pipeline-splits" is defined')

if logs_per_epoch < 0:
    raise ValueError(
        f'--logs-per-epoch should be non-negative (>=0), it is {logs_per_epoch}')
def replicated_tensor_sharding_core():
    parser = argparse.ArgumentParser(description="Parse launch parameters.")
    parser.add_argument("--tensors", nargs="*")
    parser.add_argument("--optim", nargs="?")
    parser.add_argument("--tmpdir", nargs="?")
    parser.add_argument("--filename", nargs="?")
    parser.add_argument("--compute_batch", nargs="?")
    args = parser.parse_args(sys.argv[2:])

    ipus_per_replica = 1
    batches_per_step = 10
    accumulation_factor = 4
    compute_batch = int(args.compute_batch)
    hidden_size = 4
    reduction = popart.ReductionType.Sum

    deviceInfo = popdist.popart.getDevice(ipus_per_replica)
    num_local_replicas = popdist.getNumLocalReplicas()
    num_total_replicas = popdist.getNumTotalReplicas()

    builder = popart.Builder()

    np.random.seed(12321)
    weight_data = np.random.rand(hidden_size, hidden_size).astype(np.float32)

    input_data = []
    label_data = []
    for i in range(0, batches_per_step * num_local_replicas *
                   accumulation_factor * compute_batch):
        np.random.seed(popdist.getInstanceIndex() +
                       i * popdist.getNumInstances())
        input_data += [np.random.rand(hidden_size).astype(np.float32)]
        label_data += [np.random.randint(0, hidden_size, size=1)]
    input_data = np.concatenate(input_data)
    label_data = np.concatenate(label_data)

    d0 = builder.addInputTensor(
        popart.TensorInfo("FLOAT", (compute_batch, hidden_size)), "d0")
    l0 = builder.addInputTensor(
        popart.TensorInfo("UINT32", (compute_batch, )), "l0")

    data = {}
    data[d0] = input_data.reshape(
        (batches_per_step, num_local_replicas, accumulation_factor, compute_batch, -1))

    w0 = builder.addInitializedInputTensor(weight_data, 'weight0')
    x = builder.aiOnnx.matmul([d0, w0])
    x = builder.aiOnnx.softmax([x])

    data[l0] = label_data.reshape(
        (batches_per_step, num_local_replicas, accumulation_factor,
         compute_batch, -1)).astype(np.uint32)
    loss = builder.aiGraphcore.nllloss([x, l0],
                                       reduction=reduction,
                                       debugContext='loss')

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(
        batches_per_step,
        {av: popart.AnchorReturnType("ALL") for av in [x, loss]})

    opts = popart.SessionOptions()
    if accumulation_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accumulation_factor
    opts.explicitRecomputation = True
    opts.enableExplicitMainLoops = True
    opts.useHostCopyOps = True

    # Let popdist handle distributed settings, such as:
    # opts.enableDistributedReplicatedGraphs
    # opts.globalReplicaOffset
    # opts.globalReplicationFactor
    popdist.popart.configureSessionOptions(opts)

    for tensor in ["weight", "optimizerState", "accumulator"]:
        userOption = tensor + "TensorLocationSettings"
        print(f"Setting RTS: {userOption}, "
              f"num_total_replicas: {num_total_replicas} "
              f"num_local_replicas: {num_local_replicas}")
        locationSetting = getattr(opts, userOption)
        locationSetting.minElementsForOffChip = 0
        locationSetting.minElementsForReplicatedTensorSharding = num_total_replicas
        if tensor in args.tensors:
            locationSetting.location.replicatedTensorSharding = popart.ReplicatedTensorSharding.On
            if num_total_replicas > num_local_replicas:
                locationSetting.location.shardingDomain = popart.CommGroup(
                    popart.CommGroupType.Consecutive, num_local_replicas)
        setattr(opts, userOption, locationSetting)

    if args.optim == "Adam":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.01, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "defaultEps": (1e-06, False),
                "defaultWeightDecay": (0.1, False),
                "lossScaling": (10, False),
            },
            weight_decay_mode=popart.WeightDecayMode.Decay,
            mode=popart.AdamMode.LambNoBias)
    if args.optim == "SGD":
        optimizer = popart.ConstSGD(0.01)
session = popart.TrainingSession(fnModel=proto,
                                 dataFlow=dataFlow,
                                 deviceInfo=deviceInfo,
                                 userOptions=opts,
                                 loss=loss,
                                 optimizer=optimizer)
session.prepareDevice()
session.weightsFromHost()
anchors = session.initAnchorArrays()
stepio = popart.PyStepIO(data, anchors)
session.run(stepio)

tmp_path = Path(args.tmpdir)
tmp_path.mkdir(parents=True, exist_ok=True)
file_path = str(tmp_path / args.filename)
session.modelToHost(file_path)
post_proto = onnx.load(file_path)
def ipu_prog(num_replicas, gradient_accumulation):
    import logging
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    popdist_on = popdist.isPopdistEnvSet()

    num_global_replicas = popdist.getNumTotalReplicas() if popdist_on else num_replicas
    num_instances = popdist.getNumInstances() if popdist_on else 1

    dataset_size = global_batch_size = 16
    micro_batch_size = int(global_batch_size / num_global_replicas / gradient_accumulation)

    X = np.arange(1, dataset_size + 1, 1, dtype=float)
    Y = [0] * dataset_size
    ds = tf.data.Dataset.from_tensor_slices((X, Y))
    if popdist_on:
        ds = ds.shard(num_instances, index=popdist.getInstanceIndex())
    ds = ds.batch(micro_batch_size, drop_remainder=True)
    ds = ds.repeat()

    cfg = ipu.config.IPUConfig()
    if popdist_on:
        cfg = popdist.tensorflow.set_ipu_config(
            cfg,
            ipus_per_replica=popdist.getNumIpusPerReplica(),
            configure_device=True)
        hvd.init()
    else:
        cfg.auto_select_ipus = num_global_replicas
    cfg.configure_ipu_system()

    strategy = popdist_strategy.PopDistStrategy() if popdist_on else ipu.ipu_strategy.IPUStrategy()

    with strategy.scope():

        def get_model():
            input_layer = tf.keras.Input(shape=1)
            kernel_initializer = tf.keras.initializers.Constant(1)
            x = tf.keras.layers.Dense(
                1, use_bias=False,
                kernel_initializer=kernel_initializer)(input_layer)
            return tf.keras.Model(input_layer, x)

        model = get_model()
        model.set_gradient_accumulation_options(
            gradient_accumulation_steps_per_replica=gradient_accumulation)
        model.build(input_shape=(micro_batch_size, 1))

        if popdist_on:
            def gradient_normalizer(grads_and_vars):
                return [(grad / gradient_accumulation, var)
                        for grad, var in grads_and_vars]
        else:
            def gradient_normalizer(grads_and_vars):
                return [(grad / num_global_replicas / gradient_accumulation, var)
                        for grad, var in grads_and_vars]

        optimizer = tf.keras.optimizers.SGD(
            learning_rate=1.0, gradient_transformers=[gradient_normalizer])

        loss_class = tf.keras.losses.MeanSquaredError
        loss_outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()
        loss_class = wrap_loss_in_enqueuer(loss_class, loss_outfeed_queue)
        loss = loss_class()

        micro_batches_per_weight_update = num_global_replicas * gradient_accumulation
        steps_per_execution = dataset_size // (
            micro_batch_size * micro_batches_per_weight_update
        ) * micro_batches_per_weight_update

        model.compile(optimizer=optimizer,
                      loss=loss,
                      metrics=[tf.keras.losses.MSE],
                      steps_per_execution=steps_per_execution)

        callbacks = [
            OutFeedQueueCallback(queue=loss_outfeed_queue, name='average_loss')
        ]
        if num_instances > 1:
            callbacks += [AllReduceMetricsCallback()]
        callbacks += [LoggingCallback(1)]

        model.fit(ds, steps_per_epoch=steps_per_execution, callbacks=callbacks)

    return model.get_weights()[0][0][0]
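# A quick check of the batch-size bookkeeping in ipu_prog, with illustrative
# numbers only (4 total replicas, gradient accumulation 2):
global_batch_size = 16
num_global_replicas = 4
gradient_accumulation = 2
micro_batch_size = int(global_batch_size / num_global_replicas / gradient_accumulation)
assert micro_batch_size == 2
# One weight update consumes num_global_replicas * gradient_accumulation = 8
# micro batches, which is the micro_batches_per_weight_update value above.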
    return args


def benchmark_throughput(dataloader, iteration=2):
    for _ in range(iteration):
        total_sample_size = 0
        start_time = time.perf_counter()
        for input_data, _ in tqdm(dataloader, total=len(dataloader)):
            total_sample_size += input_data.size()[0]
        elapsed_time = time.perf_counter() - start_time

        if popdist.isPopdistEnvSet():
            elapsed_time, total_sample_size = utils.synchronize_throughput_values(
                elapsed_time,
                total_sample_size,
            )

        iteration_throughput = total_sample_size / elapsed_time
        print(f"Throughput of the iteration: {iteration_throughput:0.1f} img/sec")


if __name__ == '__main__':
    args = get_args()
    opts = poptorch.Options()
    if popdist.isPopdistEnvSet():
        hvd.init()
        opts.Distributed.configureProcessId(popdist.getInstanceIndex(),
                                            popdist.getNumInstances())
    opts.randomSeed(0)
    dataloader = get_data(args, opts, train=True,
                          async_dataloader=not args.disable_async_loading)
    benchmark_throughput(dataloader)
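# `utils.synchronize_throughput_values` is defined elsewhere in the repo. Below
# is a minimal sketch of what such a helper might do, assuming Horovod's PyTorch
# API (`hvd.allreduce` with `op=hvd.Sum` / `op=hvd.Average`); the name and the
# reduction choices are illustrative, not the repo's actual implementation.
import horovod.torch as hvd
import torch

def synchronize_throughput_values(elapsed_time, total_sample_size):
    # Sum the per-instance sample counts and average the per-instance elapsed
    # times so the printed throughput reflects the whole distributed job.
    total_samples = hvd.allreduce(
        torch.tensor(total_sample_size, dtype=torch.float64), op=hvd.Sum).item()
    mean_elapsed = hvd.allreduce(
        torch.tensor(elapsed_time, dtype=torch.float64), op=hvd.Average).item()
    return mean_elapsed, int(total_samples)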