Example No. 1
def test_cpu_adam_opt(model_size):
    from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam
    device = 'cpu'
    rng_state = torch.get_rng_state()
    param = torch.nn.Parameter(torch.randn(model_size, device=device))
    torch.set_rng_state(rng_state)
    param1 = torch.nn.Parameter(torch.randn(model_size, device=device))
    torch.set_rng_state(rng_state)
    param2_data = torch.randn(model_size, device=device).cuda()
    param2 = torch.nn.Parameter(param2_data)

    optimizer1 = torch.optim.AdamW([param1])
    optimizer2 = FusedAdam([param2])
    optimizer = DeepSpeedCPUAdam([param])

    for i in range(10):
        rng_state = torch.get_rng_state()
        param.grad = torch.randn(model_size, device=device)
        torch.set_rng_state(rng_state)
        param1.grad = torch.randn(model_size, device=device)
        torch.set_rng_state(rng_state)
        param2.grad = torch.randn(model_size, device=device).cuda()

        optimizer.step()
        optimizer2.step()
        optimizer1.step()

    check_equal(param, param1, atol=1e-2, verbose=True)
    check_equal(param, param2.cpu(), atol=1e-2, verbose=True)
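
The tests in these excerpts call a check_equal helper that is defined elsewhere in the test module. A minimal sketch of what it presumably does, namely an element-wise comparison within an absolute tolerance, is shown here; the real helper in the DeepSpeed test suite may differ in detail.

import numpy as np

def check_equal(first, second, atol=1e-2, verbose=False):
    # Presumed behaviour: element-wise comparison within an absolute tolerance.
    x = first.detach().float().cpu().numpy()
    y = second.detach().float().cpu().numpy()
    if verbose:
        print("max abs diff:", np.max(np.abs(x - y)))
    np.testing.assert_allclose(x, y, rtol=0.0, atol=atol)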
Example No. 2
def test_cpu_adam_gpu_error():
    model_size = 64
    from deepspeed.ops.adam import DeepSpeedCPUAdam
    device = 'cuda:0'
    param = torch.nn.Parameter(torch.randn(model_size, device=device))
    optimizer = DeepSpeedCPUAdam([param])

    param.grad = torch.randn(model_size, device=device)
    with pytest.raises(AssertionError):
        optimizer.step()
Example No. 3
    def configure_optimizers(self):
        no_decay = ["bias", "LayerNorm.weight"]
        params_decay = [
            p for n, p in self.named_parameters()
            if not any(nd in n for nd in no_decay)
        ]
        params_nodecay = [
            p for n, p in self.named_parameters()
            if any(nd in n for nd in no_decay)
        ]
        optim_groups = [
            {
                "params": params_decay,
                "weight_decay": self.hparams.weight_decay
            },
            {
                "params": params_nodecay,
                "weight_decay": 0.0
            },
        ]
        # TODO: enable DeepSpeed CPU Adam only when offloading is active

        if self.deepspeed_offload:
            return DeepSpeedCPUAdam(optim_groups,
                                    lr=self.hparams.learning_rate,
                                    betas=self.hparams.betas)
        return FusedAdam(optim_groups,
                         lr=self.hparams.learning_rate,
                         betas=self.hparams.betas)
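
DeepSpeedCPUAdam targets the case where ZeRO offloads optimizer states to the host, while FusedAdam is the GPU-resident counterpart, which is what the TODO above is getting at. A minimal sketch of a DeepSpeed config that enables optimizer offloading follows; the field names match the standard DeepSpeed JSON schema, and the batch size is a placeholder.

# Sketch of a config in which DeepSpeedCPUAdam is the appropriate choice:
# optimizer states live on the CPU, so the update must run there as well.
ds_config = {
    "train_batch_size": 32,  # placeholder
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 2,  # optimizer offload requires ZeRO stage 2 or 3
        "offload_optimizer": {"device": "cpu"},
    },
}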
Example No. 4
def test_cpu_adam_opt(model_size):
    from deepspeed.ops.adam import DeepSpeedCPUAdam
    device = 'cpu'
    rng_state = torch.get_rng_state()
    param = torch.nn.Parameter(torch.randn(model_size, device=device))
    torch.set_rng_state(rng_state)
    param1 = torch.nn.Parameter(torch.randn(model_size, device=device))

    optimizer1 = torch.optim.Adam([param1])
    optimizer = DeepSpeedCPUAdam([param])

    for i in range(10):
        rng_state = torch.get_rng_state()
        param.grad = torch.randn(model_size, device=device)
        torch.set_rng_state(rng_state)
        param1.grad = torch.randn(model_size, device=device)

        optimizer.step()
        optimizer1.step()

    check_equal(param, param1, atol=1e-2, verbose=True)
Example No. 5
def test_cpu_adam_opt(dtype, model_size):
    if ("amd" in pytest.cpu_vendor) and (dtype == torch.half):
        pytest.skip("cpu-adam with half precision not supported on AMD CPUs")

    from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam
    device = 'cpu'
    rng_state = torch.get_rng_state()
    param = torch.nn.Parameter(
        torch.randn(model_size, device=device).to(dtype))
    torch.set_rng_state(rng_state)
    param1_data = torch.randn(model_size, device=device)
    param1 = torch.nn.Parameter(param1_data)
    torch.set_rng_state(rng_state)
    param2_data = torch.randn(model_size, device=device).to(dtype).cuda()
    param2 = torch.nn.Parameter(param2_data)

    optimizer1 = torch.optim.AdamW([param1])
    optimizer2 = FusedAdam([param2])
    optimizer = DeepSpeedCPUAdam([param])

    for i in range(10):
        rng_state = torch.get_rng_state()
        param.grad = torch.randn(model_size, device=device).to(dtype)
        torch.set_rng_state(rng_state)
        param1.grad = torch.randn(model_size, device=device)
        torch.set_rng_state(rng_state)
        param2.grad = torch.randn(model_size, device=device).to(dtype).cuda()

        optimizer.step()
        optimizer2.step()
        optimizer1.step()
    tolerance = param1.float().norm().detach().numpy() * 1e-2
    check_equal(param.float().norm(),
                param1.float().norm(),
                atol=tolerance,
                verbose=True)
    check_equal(param.float().norm(),
                param2.float().cpu().norm(),
                atol=tolerance,
                verbose=True)
Example No. 6
def get_model_tokenizer_optimizer(args):
    model, tokenizer, _ = build_model(args)

    model.half()
    model.cuda(args.local_rank)

    # XXX: all changes to model parameters
    #      (e.g. add_special_tokens)
    #      must happen before wrapping in DDP!
    model = DDP(model,
                device_ids=[args.local_rank],
                output_device=args.local_rank)

    model_obj = model.module

    if args.freeze_body:
        model_obj.transformer.requires_grad_(False)

        model_obj.transformer.wpe.requires_grad_(True)
        model_obj.transformer.emb_norm.requires_grad_(True)
        model_obj.lm_head.requires_grad_(True)
        params = [
            dict(params=v) for v in [
                # wte is tied to lm_head, so there is no need to call requires_grad_.
                # Don't put wte in the optimizer: params can't be duplicated, and
                # autodiff would compute grads twice on the params shared with lm_head.
                # model.module.transformer.wte.parameters(),
                model_obj.transformer.wpe.parameters(),
                model_obj.transformer.emb_norm.parameters(),
                model_obj.lm_head.parameters()
            ]
        ]
    else:
        model.requires_grad_(True)
        params = model_obj.parameters()

    optimizer = DeepSpeedCPUAdam(params, lr=args.lr, weight_decay=0.01)

    return model, tokenizer, optimizer
Example No. 7
def main(arch="bert-base-uncased", config="gpu.json"):
    # Reference:
    #
    #     * https://github.com/huggingface/nlp/blob/master/notebooks/Overview.ipynb
    with open(config) as fin:
        config_params = json.load(fin)

    dataset = nlp.load_dataset('glue', "sst2")
    print(set([x['label'] for x in dataset["train"]]))

    tokenizer = BertTokenizerFast.from_pretrained(arch)

    # Format our dataset to output torch.Tensor so we can train a PyTorch model
    columns = ['input_ids', 'token_type_ids', 'attention_mask', "label"]
    for subset in ("train", "validation"):
        dataset[subset] = dataset[subset].map(partial(convert_to_features,
                                                      tokenizer),
                                              batched=True)
        dataset[subset].set_format(type='torch', columns=columns)

    print(tokenizer.decode(dataset['train'][6]["input_ids"].numpy()))
    print(dataset['train'][0]["attention_mask"])

    valid_idx, test_idx = train_test_split(list(
        range(len(dataset["validation"]))),
                                           test_size=0.5,
                                           random_state=42)

    train_dict = {
        "input_ids": dataset['train']["input_ids"],
        "attention_mask": dataset['train']["attention_mask"],
        "token_type_ids": dataset['train']["token_type_ids"],
        "label": dataset['train']["label"]
    }
    valid_dict = {
        "input_ids": dataset['validation']["input_ids"][valid_idx],
        "attention_mask": dataset['validation']["attention_mask"][valid_idx],
        "token_type_ids": dataset['validation']["token_type_ids"][valid_idx],
        "label": dataset['validation']["label"][valid_idx]
    }
    test_dict = {
        "input_ids": dataset['validation']["input_ids"][test_idx],
        "attention_mask": dataset['validation']["attention_mask"][test_idx],
        "token_type_ids": dataset['validation']["token_type_ids"][test_idx],
        "label": dataset['validation']["label"][test_idx]
    }

    # Instantiate a PyTorch Dataloader around our dataset
    train_loader = torch.utils.data.DataLoader(
        SST2Dataset(train_dict),
        batch_size=config_params["train_batch_size"],
        shuffle=True)
    valid_loader = torch.utils.data.DataLoader(
        SST2Dataset(valid_dict),
        batch_size=config_params["train_batch_size"],
        drop_last=False)
    test_loader = torch.utils.data.DataLoader(
        SST2Dataset(test_dict),
        batch_size=config_params["train_batch_size"],
        drop_last=False)

    model = BertForSequenceClassification.from_pretrained(arch)
    # torch.nn.init.kaiming_normal_(model.classifier.weight)
    # torch.nn.init.constant_(model.classifier.bias, 0)
    # torch.nn.init.kaiming_normal_(model.bert.pooler.dense.weight)
    # torch.nn.init.constant_(model.bert.pooler.dense.bias, 0);

    args = Object()
    setattr(args, "local_rank", 0)
    setattr(args, "deepspeed_config", config)
    if config[:3] == "cpu":
        if "optimizer" in config_params:
            model, optimizer, _, _ = deepspeed.initialize(
                args=args, model=model, model_parameters=model.parameters())
        else:
            from deepspeed.ops.adam import DeepSpeedCPUAdam
            optimizer = DeepSpeedCPUAdam(model.parameters(), lr=2e-5)
            model, optimizer, _, _ = deepspeed.initialize(
                args=args,
                model=model,
                model_parameters=model.parameters(),
                optimizer=optimizer)
    else:
        model, optimizer, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters()
            # optimizer=optimizer
        )

    total_steps = len(train_loader) * 3

    # checkpoints = CheckpointCallback(
    #     keep_n_checkpoints=1,
    #     checkpoint_dir=CACHE_DIR / "model_cache/",
    #     monitor_metric="accuracy"
    # )
    lr_durations = [int(total_steps * 0.2), int(np.ceil(total_steps * 0.8))]
    break_points = [0] + list(np.cumsum(lr_durations))[:-1]
    callbacks = [
        MovingAverageStatsTrackerCallback(avg_window=len(train_loader) // 8,
                                          log_interval=len(train_loader) //
                                          10),
        LearningRateSchedulerCallback(
            MultiStageScheduler([
                LinearLR(optimizer, 0.01, lr_durations[0]),
                CosineAnnealingScheduler(optimizer, lr_durations[1])
            ],
                                start_at_epochs=break_points)),
        # checkpoints
    ]

    bot = SST2Bot(
        model=model,
        train_loader=train_loader,
        valid_loader=valid_loader,
        clip_grad=10.,
        optimizer=optimizer,
        echo=True,
        criterion=torch.nn.CrossEntropyLoss(),
        callbacks=callbacks,
        pbar=False,
        use_tensorboard=False,
        # use_amp=APEX_AVAILABLE,
        metrics=(Top1Accuracy(), ))

    print(total_steps)
    bot.train(total_steps=total_steps,
              checkpoint_interval=len(train_loader) // 2)
    # bot.load_model(checkpoints.best_performers[0][1])
    # checkpoints.remove_checkpoints(keep=0)

    # TARGET_DIR = CACHE_DIR / "sst2_bert_uncased"
    # TARGET_DIR.mkdir(exist_ok=True)
    # bot.model.save_pretrained(TARGET_DIR)
    bot.eval(valid_loader)

    bot.eval(test_loader)
Example No. 8
    def configure_optimizers(self):
        return DeepSpeedCPUAdam(self.parameters())
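
This one-liner only makes sense when training always runs with optimizer-state offloading; otherwise FusedAdam, as in Example No. 3, is the usual choice. A hedged sketch of how such a module might be driven with PyTorch Lightning's DeepSpeed strategy follows; MyLightningModule is a placeholder, and the exact strategy arguments depend on the installed Lightning version.

# Sketch only: MyLightningModule is a placeholder class, and the
# DeepSpeedStrategy arguments are assumptions that vary by Lightning version.
import pytorch_lightning as pl
from pytorch_lightning.strategies import DeepSpeedStrategy

model = MyLightningModule()
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    precision=16,
    strategy=DeepSpeedStrategy(stage=2, offload_optimizer=True),
)
trainer.fit(model)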
Example No. 9
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam
import time

device = 'cpu'
model_size = 1 * 1024**3
group_size = [model_size, 274432]

param = [torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size]
optimizer = DeepSpeedCPUAdam(param)
#torch.set_num_threads(128)
for i, p in enumerate(param):
    p.grad = torch.ones(group_size[i], device=device)
#param.grad = torch.ones(model_size, device=device)
avg = 0
for i in range(100):
    start = time.time()
    optimizer.step()
    stop = time.time()
    avg += (stop - start)
    for i, p in enumerate(param):
        p.grad = torch.ones(group_size[i], device=device) * 2
    #param.grad = torch.ones(model_size, device=device) * 2
print("Elapsed Time is ", avg / 100)
Example No. 10
    def configure_optimizers(self):
        base_parameters = []
        lm_decay_parameters = []
        lm_no_decay_parameters = []

        if self.hparams.optim_params.optimizer == "radam":
            for parameter_name, parameter in self.named_parameters():
                if "transformer" not in parameter_name:
                    base_parameters.append(parameter)
                elif not any(v in parameter_name for v in ["bias", "LayerNorm.weight"]):
                    lm_decay_parameters.append(parameter)
                else:
                    lm_no_decay_parameters.append(parameter)

            optimizer_params = [
                {
                    "params": base_parameters,
                    "weight_decay": self.hparams.optim_params.weight_decay,
                },
                {
                    "params": lm_decay_parameters,
                    "lr": self.hparams.optim_params.lm_lr,
                    "weight_decay": self.hparams.optim_params.lm_weight_decay,
                },
                {
                    "params": lm_no_decay_parameters,
                    "lr": self.hparams.optim_params.lm_lr,
                    "weight_decay": 0.0,
                },
            ]

            optimizer = RAdam(optimizer_params, lr=self.hparams.optim_params.lr)
        elif self.hparams.optim_params.optimizer == "fuseadam":
            try:
                from deepspeed.ops.adam import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install DeepSpeed (`pip install deepspeed`) to use FuseAdam optimizer."
                )

            optimizer = FusedAdam(self.parameters())
        elif self.hparams.optim_params.optimizer == "deepspeedcpuadam":
            try:
                from deepspeed.ops.adam import DeepSpeedCPUAdam
            except ImportError:
                raise ImportError(
                    "Please install DeepSpeed (`pip install deepspeed`) to use DeepSpeedCPUAdam optimizer."
                )

            optimizer = DeepSpeedCPUAdam(self.parameters())
        elif self.hparams.optim_params.optimizer == "adafactor":
            optimizer = Adafactor(
                self.parameters(),
                scale_parameter=False,
                relative_step=False,
                warmup_init=False,
                lr=self.hparams.optim_params.lm_lr,
            )
        else:
            raise ValueError(f"Unknown optimizer {self.hparams.optim_params.optimizer}")
        return optimizer
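
The attribute-style access on self.hparams.optim_params suggests an OmegaConf (or similar) config object. A hypothetical shape of that config, with placeholder values, might look like the following; the keys simply mirror the attributes read above.

from omegaconf import OmegaConf

# Hypothetical optim_params structure assumed by configure_optimizers above.
optim_params = OmegaConf.create({
    "optimizer": "deepspeedcpuadam",  # radam | fuseadam | deepspeedcpuadam | adafactor
    "lr": 1e-4,             # base learning rate
    "lm_lr": 5e-5,          # learning rate for the transformer (language model) part
    "weight_decay": 0.01,
    "lm_weight_decay": 0.01,
})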
Example No. 11
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam
import time

device = 'cpu'
model_size = 1 * 1024**3
param = torch.nn.Parameter(torch.ones(model_size, device=device))
param_fp16 = torch.nn.Parameter(
    torch.ones(model_size, dtype=torch.half, device='cuda:0'))

optimizer = DeepSpeedCPUAdam([param])
#torch.set_num_threads(128)
param.grad = torch.ones(model_size, device=device)
avg = 0
for i in range(100):
    start = time.time()
    optimizer.step(fp16_param_groups=[param_fp16])
    stop = time.time()
    avg += (stop - start)
    param.grad = torch.ones(model_size, device=device) * 2
print("Elapsed Time is ", avg / 100)
Example No. 12
    def __init__(self, cfg: DictConfig):
        self.shard_id = cfg.local_rank if cfg.local_rank != -1 else 0
        self.distributed_factor = cfg.distributed_world_size or 1

        logger.info("***** Initializing components for training *****")

        # if model file is specified, encoder parameters from saved state should be used for initialization
        model_file = get_model_file(cfg, cfg.checkpoint_file_name)
        saved_state = None
        if model_file:
            saved_state = load_states_from_checkpoint(model_file)
            set_cfg_params_from_state(saved_state.encoder_params, cfg)

        tensorizer, model, optimizer = init_biencoder_components(
            cfg.encoder.encoder_model_type, cfg
        )

        if cfg.deepspeed:
            model.half()

            # XXX
            # no_decay = ["bias", "LayerNorm.weight"]
            #
            # optimizer_grouped_parameters = [
            #     {
            #         "params": [
            #             p
            #             for n, p in model.named_parameters()
            #             if not any(nd in n for nd in no_decay)
            #         ],
            #         "weight_decay": cfg.train.weight_decay,
            #     },
            #     {
            #         "params": [
            #             p
            #             for n, p in model.named_parameters()
            #             if any(nd in n for nd in no_decay)
            #         ],
            #         "weight_decay": 0.0,
            #     },
            # ]

            optimizer = DeepSpeedCPUAdam(
                optimizer.param_groups,
                lr=cfg.train.learning_rate,
                weight_decay=cfg.train.weight_decay,
            )

        model, optimizer = setup_for_distributed_mode(
            model,
            optimizer,
            cfg.device,
            cfg.n_gpu,
            cfg.local_rank,
            cfg.fp16,
            cfg.fp16_opt_level,
        )

        self.biencoder = model
        self.optimizer = optimizer
        self.tensorizer = tensorizer
        self.start_epoch = 0
        self.start_batch = 0
        self.scheduler_state = None
        self.best_validation_result = None
        self.best_cp_name = None
        self.cfg = cfg
        self.ds_cfg = BiencoderDatasetsCfg(cfg)

        if saved_state:
            self._load_saved_state(saved_state)

        self.dev_iterator = None