import logging
import os
from pathlib import Path

import transformers
from datasets import load_from_disk
from transformers import (AlbertConfig, AlbertForPreTraining, AlbertTokenizerFast,
                          DataCollatorForLanguageModeling, HfArgumentParser,
                          get_linear_schedule_with_warmup, set_seed)
from transformers.trainer_utils import is_main_process

# FusedLAMB requires NVIDIA Apex.
from apex.optimizers import FusedLAMB

# AlbertTrainingArguments, DatasetArguments, CollaborationArguments and
# CollaborativeTrainer are assumed to be defined in project-local modules
# and are not shown in this snippet.

logger = logging.getLogger(__name__)


def main():
    parser = HfArgumentParser((AlbertTrainingArguments, DatasetArguments, CollaborationArguments))
    training_args, dataset_args, collaboration_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bit training: {training_args.fp16}"
    )
    # Set the Transformers logger verbosity to INFO (main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    config = AlbertConfig.from_pretrained(dataset_args.config_path, cache_dir=dataset_args.cache_dir)

    tokenizer = AlbertTokenizerFast.from_pretrained(dataset_args.tokenizer_path, cache_dir=dataset_args.cache_dir)

    # find latest checkpoint in output_dir
    output_dir = Path(training_args.output_dir)
    logger.info(f'Checkpoint dir {output_dir}, contents {list(output_dir.glob("checkpoint*"))}')
    latest_checkpoint_dir = max(output_dir.glob('checkpoint*'), default=None, key=os.path.getctime)

    if latest_checkpoint_dir is not None:
        logger.info(f'Loading model from {latest_checkpoint_dir}')
        model = AlbertForPreTraining.from_pretrained(latest_checkpoint_dir)
    else:
        logger.info('Training from scratch')
        model = AlbertForPreTraining(config)
        model.resize_token_embeddings(len(tokenizer))

    tokenized_dataset_path = Path(dataset_args.dataset_path)

    tokenized_datasets = load_from_disk(tokenized_dataset_path)

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

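    # Group parameters so that biases and LayerNorm weights are excluded from weight decay;
    # all other parameters use training_args.weight_decay.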
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": training_args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    optimizer = FusedLAMB(
        optimizer_grouped_parameters,
        lr=training_args.learning_rate,
        betas=(training_args.adam_beta1, training_args.adam_beta2),
        eps=training_args.adam_epsilon,
    )

    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=training_args.max_steps
    )

    trainer = CollaborativeTrainer(
        model=model, args=training_args, collaboration_args=collaboration_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        optimizers=(optimizer, lr_scheduler)
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=latest_checkpoint_dir)
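

# The argument dataclasses parsed above (AlbertTrainingArguments, DatasetArguments,
# CollaborationArguments) and CollaborativeTrainer are project-specific and not shown here.
# As an illustration only, a minimal sketch of what DatasetArguments could look like,
# assuming just the fields referenced in main() (defaults are placeholders):
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class DatasetArguments:
    dataset_path: str = field(default="./tokenized_dataset",
                              metadata={"help": "Dataset saved with datasets.save_to_disk"})
    tokenizer_path: str = field(default="./tokenizer",
                                metadata={"help": "Path or model id for AlbertTokenizerFast"})
    config_path: str = field(default="albert-large-v2",
                             metadata={"help": "Path or model id for AlbertConfig"})
    cache_dir: Optional[str] = field(default=None,
                                     metadata={"help": "Cache directory for downloaded files"})


if __name__ == "__main__":
    main()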
Example #2
import logging
import subprocess
import time
from dataclasses import asdict

import hivemind
import torch
from transformers import AlbertConfig, AlbertForPreTraining

# Lamb is assumed to be the implementation from the `torch_optimizer` package (it accepts
# clamp_value/debias). The argument dataclasses (CoordinatorArguments,
# CollaborativeOptimizerArguments, AveragerArguments) and `experiment_prefix` are assumed
# to be defined elsewhere in the project.
from torch_optimizer import Lamb

logger = logging.getLogger(__name__)


class CheckpointHandler:
    def __init__(self, coordinator_args: CoordinatorArguments,
                 collab_optimizer_args: CollaborativeOptimizerArguments,
                 averager_args: AveragerArguments, dht: hivemind.DHT):
        self.save_checkpoint_step_interval = coordinator_args.save_checkpoint_step_interval
        self.repo_path = coordinator_args.repo_path
        self.upload_interval = coordinator_args.upload_interval
        self.previous_step = -1

        config = AlbertConfig.from_pretrained(
            coordinator_args.model_config_path)
        self.model = AlbertForPreTraining(config)

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.01,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

        opt = Lamb(
            optimizer_grouped_parameters,
            lr=0.00176,
            weight_decay=0.01,
            clamp_value=10000.0,
            debias=True,
        )

        adjusted_target_batch_size = collab_optimizer_args.target_batch_size - collab_optimizer_args.batch_size_lead

        self.collaborative_optimizer = hivemind.CollaborativeOptimizer(
            opt=opt,
            dht=dht,
            prefix=experiment_prefix,
            compression_type=hivemind.utils.CompressionType.Value(
                collab_optimizer_args.compression),
            throughput=collab_optimizer_args.bandwidth,
            target_batch_size=adjusted_target_batch_size,
            client_mode=collab_optimizer_args.client_mode,
            verbose=True,
            start=True,
            **asdict(averager_args))
        self.previous_timestamp = time.time()

    def is_time_to_save_state(self, cur_step):
        if self.save_checkpoint_step_interval is None:
            return False
        elif cur_step - self.previous_step >= self.save_checkpoint_step_interval:
            return True
        else:
            return False

    def save_state(self, cur_step):
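        # Pull the latest averaged parameters and optimizer state from other peers so the
        # local model reflects collective progress before it is checkpointed/uploaded.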
        self.collaborative_optimizer.load_state_from_peers()
        self.previous_step = cur_step

    def is_time_to_upload(self):
        if self.repo_path is None:
            return False
        elif time.time() - self.previous_timestamp >= self.upload_interval:
            return True
        else:
            return False

    def upload_checkpoint(self, current_loss):
        self.model.save_pretrained(self.repo_path)
        torch.save(self.collaborative_optimizer.opt.state_dict(),
                   f"{self.repo_path}/optimizer_state.pt")
        self.previous_timestamp = time.time()
        try:
            subprocess.run("git add --all",
                           shell=True,
                           check=True,
                           cwd=self.repo_path)
            current_step = self.collaborative_optimizer.collaboration_state.optimizer_step
            subprocess.run(
                f"git commit -m 'Step {current_step}, loss {current_loss:.3f}'",
                shell=True,
                check=True,
                cwd=self.repo_path)
            subprocess.run("git push",
                           shell=True,
                           check=True,
                           cwd=self.repo_path)
        except subprocess.CalledProcessError as e:
            logger.warning("Error while uploading model: %s", e.output)
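

# Hypothetical usage sketch (not part of the original snippet): a coordinator loop that
# periodically pulls state from peers and uploads checkpoints. How the argument objects
# are built and how the current loss is obtained (e.g. from metrics published to the DHT)
# is left as a placeholder here.
def run_coordinator(coordinator_args, collab_optimizer_args, averager_args):
    dht = hivemind.DHT(start=True)
    handler = CheckpointHandler(coordinator_args, collab_optimizer_args, averager_args, dht)

    while True:
        cur_step = handler.collaborative_optimizer.collaboration_state.optimizer_step
        if handler.is_time_to_save_state(cur_step):
            handler.save_state(cur_step)
            if handler.is_time_to_upload():
                current_loss = float("nan")  # placeholder: read the real loss from DHT metrics
                handler.upload_checkpoint(current_loss)
        time.sleep(60)  # polling interval, arbitrary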
Example #3
import random

import numpy as np
import torch
import torch.nn.functional as F
import wandb
from torch.optim import AdamW
from tqdm import tqdm
from transformers import (AlbertConfig, AlbertForPreTraining,
                          get_cosine_schedule_with_warmup,
                          get_linear_schedule_with_warmup)

# EXPCONF (the experiment config), get_loader, evaldev and savemodel are project-local and
# not shown in this snippet; AdamW may just as well come from transformers in the original
# code. Illustrative sketches of accuracy() and get_lr_from_optim() appear after main().


def main():
    # my dice shows 777 only. period.
    random.seed(EXPCONF.seed)
    np.random.seed(EXPCONF.seed)
    torch.manual_seed(EXPCONF.seed)
    torch.cuda.manual_seed_all(EXPCONF.seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    trainloader, vocab, _trainds = get_loader(EXPCONF, getdev=False)
    devloader, _, _devds = get_loader(EXPCONF, getdev=True)

    assert len(trainloader) > 0, "trainloader is empty!"
    assert len(devloader) > 0, "devloader is empty!"

    # Not pretty, but just override the relevant config fields explicitly below.
    albertconf = AlbertConfig.from_pretrained(
        f'albert-{EXPCONF.albert_scale}-v2')
    if EXPCONF.smaller:  # ALBERT normally uses a 4*H FFN; use 1*H here to reduce memory
        albertconf.hidden_size = EXPCONF.hidden_size
        albertconf.num_hidden_layers = EXPCONF.num_hidden_layers
        albertconf.num_attention_heads = EXPCONF.num_attention_heads

        albertconf.intermediate_size = albertconf.hidden_size

    albertconf.vocab_size = len(vocab.itos)
    albertconf.bos_token_id = vocab.stoi['BOS']
    albertconf.eos_token_id = vocab.stoi['EOS']
    albertconf.pad_token_id = vocab.stoi['PAD']
    albertconf.max_position_embeddings = 40

    model = AlbertForPreTraining(albertconf).to(device)

    # The Hugging Face language-modeling example groups parameters the same way:
    # https://github.com/huggingface/transformers/blob/v2.6.0/examples/run_language_modeling.py
    no_decay = ["bias", "LayerNorm.weight"]
    grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": EXPCONF.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    optimizer = AdamW(grouped_parameters, lr=EXPCONF.lr)  # other hyperparameters left at their defaults
    getsch = get_cosine_schedule_with_warmup if EXPCONF.scheduler == 'cosine' else get_linear_schedule_with_warmup
    scheduler = getsch(optimizer, EXPCONF.warmups,
                       EXPCONF.numep * len(trainloader))

    global_step = 0
    L = len(trainloader)
    bsz = len(trainloader[0])  # assumes the loader is indexable; only used for the optional wandb step scaling below

    for ep in tqdm(range(1, EXPCONF.numep + 1), desc="epoch progress"):
        lossep_mlm = 0
        lossep_pp = 0
        accep_pp = 0
        model.train()
        for i, (b, l, datasetids) in enumerate(
                tqdm(trainloader, desc="iterations progress"), 1):
            '''
            b.input_ids / b.token_type_ids / b.attention_mask have shape (bsz, seqmaxlen);
            l has shape (bsz,).

            ## BERT-family models, when training MLM together with NSP (or similar
            ## sentence-level tasks), encode the sentence representation from the masked
            ## input as well, not from the unmasked one. It can be seen as a kind of
            ## dropout, although it looked irregular at first.

            ## --> see transformers/examples/run_language_modeling.py (v2.1.0)
            ## --> and modeling_albert.py (AlbertModel.forward())
            '''

            outputs = model(**b, sentence_order_label=l, return_dict=True)
            global_step += 1

            vsz = outputs.prediction_logits.shape[-1]

            lossmlm = F.cross_entropy(
                outputs.prediction_logits.view(-1, vsz).contiguous(),
                b['labels'].view(-1))
            losspp = F.cross_entropy(outputs.sop_logits, l)
            lossppval = losspp.item()
            acc = accuracy(outputs.sop_logits.clone().detach(), l)

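            # Either backprop the model's combined loss directly, or re-weight the SOP
            # ("pp") loss by alpha_pp, optionally warmed up over the first EXPCONF.warmups steps.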
            if EXPCONF.alpha_pp == 1 and not EXPCONF.alpha_warmup:
                outputs.loss.backward()
            else:
                del outputs.loss
                torch.cuda.empty_cache()

                losspp *= EXPCONF.alpha_pp

                if EXPCONF.alpha_warmup:
                    grow = min(global_step / EXPCONF.warmups, 1.0)
                    losspp *= grow

                loss = lossmlm + losspp
                loss.backward()

            wandb.log({
                'step': (i + ep * L) * bsz if EXPCONF.see_bsz_effect else global_step,
                'train_step/learning_rate': get_lr_from_optim(optimizer),
                'train_step/alpha_pp': EXPCONF.alpha_pp * (grow if EXPCONF.alpha_warmup else 1),
                'train_step/mlm_loss': lossmlm.item(),
                'train_step/pp_loss': lossppval,
                'train_step/pp_acc': acc,
            })

            optimizer.step()
            scheduler.step()
            model.zero_grad()

            lossep_mlm += lossmlm.item()
            lossep_pp += lossppval
            accep_pp += acc

        lossep_mlm /= L
        lossep_pp /= L
        accep_pp /= L

        wandb.log({
            'step': ep,
            'train_ep/mlm_loss': lossep_mlm,
            'train_ep/pp_loss': lossep_pp,
            'train_ep/pp_acc': accep_pp,
        })
        print(f"ep:{ep}: losspp = {lossep_pp}, lossmlm={lossep_mlm}")
        devmlm_loss, devpp_loss, devpp_acc = evaldev(EXPCONF, model, devloader,
                                                     ep)
        if devpp_acc > EXPCONF.savethld:
            savemodel(EXPCONF,
                      model,
                      vocab,
                      ep,
                      mlm=devmlm_loss,
                      pp=devpp_loss,
                      acc=devpp_acc)
    return None
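

# The helpers accuracy() and get_lr_from_optim() are project-local; the sketches below are
# illustrative only, consistent with how they are called above (sop_logits of shape
# (bsz, 2), integer labels):
def accuracy(logits, labels):
    # Fraction of examples whose argmax prediction matches the label.
    return (logits.argmax(dim=-1) == labels).float().mean().item()


def get_lr_from_optim(optimizer):
    # Learning rate of the first parameter group (all groups share the same schedule here).
    return optimizer.param_groups[0]["lr"]


if __name__ == "__main__":
    main()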