Example #1
def _setup_test_infra(world_rank, world_size):
    """distributed setup just for testing purposes"""
    os.environ['RANK'] = str(world_rank)
    os.environ['WORLD_SIZE'] = str(world_size)
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'

    set_cuda_device_id(world_rank)

    dist.init_process_group(backend='nccl', world_size=world_size, rank=world_rank)
Example #2
def _setup_test_infra(world_rank, world_size):
    """distributed setup just for testing purposes"""
    os.environ["RANK"] = str(world_rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"

    set_cuda_device_id(world_rank)

    dist.init_process_group(backend="nccl",
                            world_size=world_size,
                            rank=world_rank)
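Examples #1 and #2 only create the NCCL process group; the listing does not show how the helper is launched. The sketch below is not taken from the original sources: it assumes torch.multiprocessing.spawn as the launcher and uses a hypothetical _worker entry point plus a trivial all-reduce check, purely to illustrate how a setup helper like _setup_test_infra might be exercised.

# Illustrative driver only (assumed, not part of the example): spawn one process
# per visible GPU and verify the process group with a simple all-reduce.
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def _worker(world_rank, world_size):
    _setup_test_infra(world_rank, world_size)   # helper from Examples #1/#2
    t = torch.ones(1, device="cuda:" + str(world_rank))
    dist.all_reduce(t)                          # sums the tensor across all ranks
    assert t.item() == world_size               # every rank should now hold world_size
    dist.destroy_process_group()

if __name__ == "__main__":
    nprocs = torch.cuda.device_count()
    mp.spawn(_worker, args=(nprocs,), nprocs=nprocs)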
Example #3
    def to_ort_model(self, model, config, args):
        model_desc = self.gpt2_model_description(config.n_head,
                                                 config.vocab_size,
                                                 config.n_embd, config.n_layer,
                                                 config.n_ctx,
                                                 args.per_gpu_train_batch_size)
        learning_rate_description = self.ort_trainer_learning_rate_description()

        def map_optimizer_attributes(name):
            no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"]
            no_decay = False
            for no_decay_key in no_decay_keys:
                if no_decay_key in name:
                    no_decay = True
                    break
            if no_decay:
                return {
                    "alpha": 0.9,
                    "beta": 0.999,
                    "lambda": 0.0,
                    "epsilon": args.adam_epsilon
                }
            else:
                return {
                    "alpha": 0.9,
                    "beta": 0.999,
                    "lambda": args.weight_decay,
                    "epsilon": args.adam_epsilon
                }

        from onnxruntime.capi._pybind_state import set_cuda_device_id, set_arena_extend_strategy, ArenaExtendStrategy
        set_arena_extend_strategy(ArenaExtendStrategy.kSameAsRequested)
        set_cuda_device_id(self.args.local_rank)

        model = ORTTrainer(
            model,
            None,
            model_desc,
            "AdamOptimizer",
            map_optimizer_attributes,
            learning_rate_description,
            args.device,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            world_rank=self.args.world_rank,
            world_size=self.args.world_size,
            use_mixed_precision=self.args.fp16,
            allreduce_post_accumulation=True,
            _opset_version=12)

        logger.info("****************************Model converted to ORT")
        return model
Example #4
def setup_onnxruntime_with_mpi(args):
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    args.local_rank = comm.Get_rank()
    args.world_rank = comm.Get_rank()
    args.world_size = comm.Get_size()
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    args.n_gpu = 1

    from onnxruntime.capi._pybind_state import set_cuda_device_id
    set_cuda_device_id(args.local_rank)

    return device
Example #5
def create_ort_training_session_bind_parameters(model, device, world_rank=-1, world_size=1,
                                                gradient_accumulation_steps=1):
    output_name = model.graph.output[0].name
    ort_parameters = ort.TrainingParameters()
    ort_parameters.loss_output_name = output_name
    ort_parameters.use_mixed_precision = False
    ort_parameters.world_rank = world_rank
    ort_parameters.world_size = world_size
    ort_parameters.gradient_accumulation_steps = gradient_accumulation_steps

    torch_params = {}
    output_types = {}
    for output in model.graph.output:
        output_types[output.name] = output.type.tensor_type

    for initializer in model.graph.initializer:
        torch_tensor = torch.nn.Parameter(torch.as_tensor(numpy_helper.to_array(initializer), device=device))
        delete_input_with_name(model.graph.input, initializer.name)
        model.graph.input.extend(
            [helper.make_tensor_value_info(initializer.name, initializer.data_type, initializer.dims)])

        torch_params[initializer.name] = torch_tensor

    del model.graph.initializer[:]

    ort_parameters.weights_to_train = set(torch_params.keys())

    if device.type == 'cuda' and hasattr(device, "index") and device.index is not None:
        from onnxruntime.capi._pybind_state import set_cuda_device_id
        set_cuda_device_id(device.index)
    session = ort.TrainingSession(model.SerializeToString(), ort_parameters)

    train_io_binding = session.io_binding()
    eval_io_binding = session.io_binding()

    enable_grad_accumulation = gradient_accumulation_steps > 1
    for param in torch_params.keys():
        torch_tensor = torch_params[param]

        train_io_binding.bind_input(param, torch_tensor.device.type, get_device_index(torch_tensor.device),
                                    dtype_torch_to_numpy(torch_params[param].dtype), list(torch_tensor.size()),
                                    torch_tensor.data_ptr())
        eval_io_binding.bind_input(param, torch_tensor.device.type, get_device_index(torch_tensor.device),
                                   dtype_torch_to_numpy(torch_params[param].dtype), list(torch_tensor.size()),
                                   torch_tensor.data_ptr())

        device_index = get_device_index(device)
        create_and_bind_grad_or_grad_accumulate_buffer(train_io_binding, torch_tensor, param, enable_grad_accumulation, device, device_index)

    return session, train_io_binding, eval_io_binding, output_name, torch_params, output_types
Example #6
def setup_onnxruntime_with_mpi(args):
    '''
    from mpi4py import MPI
    comm = MPI.COMM_WORLD

    has_aml = 'AZ_BATCH_MASTER_NODE' in os.environ.keys() or 'AZ_BATCHAI_MPI_MASTER_NODE' in os.environ.keys()
    if not has_aml:
        print('Detected local run')
        args.local_rank = comm.Get_rank() % torch.cuda.device_count()
        args.world_rank = comm.Get_rank()
        args.world_size = comm.Get_size()

        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1

    else:
        print('Detected Azure batch run')
        set_environment_variables_for_nccl_backend(get_local_size() == get_global_size(), IB = args.use_ib)
        args.local_rank = get_local_rank()
        args.local_size = get_local_size()
        args.world_rank = get_world_rank()
        args.world_size = get_global_size()

        print('Local rank: {}'.format(args.local_rank))
        print('Local size: {}'.format(args.local_size))
        print('World rank: {}'.format(args.world_rank))
        print('World size: {}'.format(args.world_size))
        print('CUDA device: {}'.format(args.local_rank))

        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1

        torch.distributed.init_process_group(backend='nccl')
    '''

    #device = torch.device("cuda", get_local_rank())
    device = torch.device("cuda", args.distributed_rank)

    from onnxruntime.capi._pybind_state import set_cuda_device_id
    #set_cuda_device_id(get_local_rank())
    set_cuda_device_id(args.distributed_rank)

    from onnxruntime.capi._pybind_state import set_arena_extend_strategy, ArenaExtendStrategy
    set_arena_extend_strategy(ArenaExtendStrategy.kSameAsRequested)

    return device
Example #7
def test_single_precision_adasum_on_gpu():
    # Common setup
    world_rank = get_mpi_context_world_rank()
    world_size = get_mpi_context_world_size()
    set_cuda_device_id(world_rank)
    device = "cuda:" + str(world_rank)
    opts = orttrainer.ORTTrainerOptions({
        "debug": {
            "deterministic_compute": True
        },
        "device": {
            "id": device,
        },
        "distributed": {
            "world_rank": world_rank,
            "world_size": world_size,
            "enable_adasum": True,
        },
    })
    _run_adasum_tests(opts)
Example #8
def test_single_precision_adasum_on_gpu():
    # Common setup
    world_rank = get_mpi_context_world_rank()
    world_size = get_mpi_context_world_size()
    set_cuda_device_id(world_rank)
    device = 'cuda:' + str(world_rank)
    opts = orttrainer.ORTTrainerOptions({
        'debug': {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
        'distributed': {
            'world_rank': world_rank,
            'world_size': world_size,
            'enable_adasum': True,
        }
    })
    _run_adasum_tests(opts)
Example #9
        return results


if __name__ == "__main__":
    local_rank = get_mpi_context_local_rank()
    world_size = get_mpi_context_world_size()
    if world_size > 1:
        # mpi launch
        logger.warning("mpirun launch, local_rank / world_size: %s : % s",
                       local_rank, world_size)

        # TrainingArguments._setup_devices will call torch.distributed.init_process_group(backend="nccl").
        # PyTorch expects the following environment variables (which would be set automatically if launched with torch.distributed.launch).

        os.environ['RANK'] = str(local_rank)
        os.environ['WORLD_SIZE'] = str(world_size)
        os.environ['MASTER_ADDR'] = '127.0.0.1'
        os.environ['MASTER_PORT'] = '29500'

        from onnxruntime.capi._pybind_state import set_cuda_device_id
        set_cuda_device_id(local_rank)

        test = ORTGlueTest()
        test.setUp()
        test.local_rank = local_rank
        test.world_size = world_size
        test.test_bert_with_mrpc()
    else:
        unittest.main()
Example #10
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')

    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')

    parser.add_argument('--use-ort', action='store_true', default=False,
                        help='to use onnxruntime as training backend')

    parser.add_argument('--use-ort-trainer', action='store_true', default=False,
                        help='to use onnxruntime as training backend')

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    kwargs = {'num_workers': 0, 'pin_memory': True}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
            transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)


    comm = MPI.COMM_WORLD
    args.local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) if ('OMPI_COMM_WORLD_LOCAL_RANK' in os.environ) else 0
    args.world_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) if ('OMPI_COMM_WORLD_RANK' in os.environ) else 0
    args.world_size = comm.Get_size()
    torch.cuda.set_device(args.local_rank)
    if use_cuda:
        device = torch.device("cuda", args.local_rank)
    else:
        device = torch.device("cpu")
    args.n_gpu = 1
    set_cuda_device_id(args.local_rank)

    input_size = 784
    hidden_size = 500
    num_classes = 10
    model = NeuralNet(input_size, hidden_size, num_classes)

    model_desc = mnist_model_description()
    if args.use_ort_trainer:
        # use log_interval as gradient accumulate steps
        trainer = ORTTrainer(model, my_loss, model_desc, "LambOptimizer", None,
                             IODescription('Learning_Rate', [1,], torch.float32),
                             device, 1, None, args.world_rank, args.world_size,
                             use_mixed_precision=False, allreduce_post_accumulation=True)
        print('\nBuild ort model done.')

        for epoch in range(1, args.epochs + 1):
            train_with_trainer(args, trainer, device, train_loader, epoch)
            test_with_trainer(args, trainer, device, test_loader)
    else:
        model = ORTModel(model, my_loss, model_desc, device, None, args.world_rank, args.world_size)
        print('\nBuild ort model done.')

        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

        for epoch in range(1, args.epochs + 1):
            train_with_model(args, model, device, train_loader, optimizer, epoch)
Example #11
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument("--batch-size",
                        type=int,
                        default=64,
                        metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size",
                        type=int,
                        default=1000,
                        metavar="N",
                        help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs",
                        type=int,
                        default=10,
                        metavar="N",
                        help="number of epochs to train (default: 10)")
    parser.add_argument("--lr",
                        type=float,
                        default=0.01,
                        metavar="LR",
                        help="learning rate (default: 0.01)")
    parser.add_argument("--no-cuda",
                        action="store_true",
                        default=False,
                        help="disables CUDA training")
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10,
        metavar="N",
        help="how many batches to wait before logging training status",
    )

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    kwargs = {"num_workers": 0, "pin_memory": True}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        ),
        batch_size=args.batch_size,
        shuffle=True,
        **kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        ),
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs,
    )

    comm = MPI.COMM_WORLD
    args.local_rank = (int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]) if
                       ("OMPI_COMM_WORLD_LOCAL_RANK" in os.environ) else 0)
    args.world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) if (
        "OMPI_COMM_WORLD_RANK" in os.environ) else 0
    args.world_size = comm.Get_size()
    if use_cuda:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        set_cuda_device_id(args.local_rank)
    else:
        device = torch.device("cpu")

    input_size = 784
    hidden_size = 500
    num_classes = 10
    model = NeuralNet(input_size, hidden_size, num_classes)

    model_desc = mnist_model_description()
    # use log_interval as gradient accumulate steps
    trainer = ORTTrainer(
        model,
        my_loss,
        model_desc,
        "SGDOptimizer",
        None,
        IODescription("Learning_Rate", [1, ], torch.float32),
        device,
        1,
        args.world_rank,
        args.world_size,
        use_mixed_precision=False,
        allreduce_post_accumulation=True,
    )
    print("\nBuild ort model done.")

    for epoch in range(1, args.epochs + 1):
        train_with_trainer(args, trainer, device, train_loader, epoch)
        test_with_trainer(args, trainer, device, test_loader)