import socket

import deepspeed
import torch
import wandb
from wandb import UsageError  # import location may differ across wandb versions

# model, wrapper and helper functions are assumed to be exposed by this repo's gpt_neox package
from gpt_neox import (GPTNeoX, AutoregressiveWrapper, get_args, get_params,
                      get_tokenizer, is_main, prepare_data)

# arguments
train_args = get_args()
params = get_params(train_args.model)

# tokenizer
tokenizer = get_tokenizer(tokenizer_type=params["tokenizer"].get("type", None),
                          from_pretrained=params["tokenizer"].get("from_pretrained", True),
                          add_padding_token=params["tokenizer"].get("add_padding_token", False))
vocab_size = len(tokenizer) if params["vocab_size"] is None else params["vocab_size"]

# instantiate GPT-like decoder model
params["seq_len"] = 2049  # hard-coded override of the configured sequence length
model = GPTNeoX(num_tokens=vocab_size,
                dim=params["hidden_dim"],
                seq_len=params["seq_len"],
                depth=params["n_layers"],
                heads=params["n_heads"],
                dim_head=params["dim_head"])
model = AutoregressiveWrapper(model)

# prepare data
dset_params = params["dataset"]
assert dset_params is not None

deepspeed.init_distributed(dist_backend='nccl')
torch.distributed.barrier()  # barrier will force processes to stop until *all* processes have reached the barrier
# if is_main(train_args):
#     prepare_data(dset_params["name"])
#     torch.distributed.barrier()  # barrier will force processes to stop until *all* processes have reached the barrier
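# For reference, these are the fields the script reads out of the model config
# returned by get_params(); the values below are purely illustrative, not
# settings shipped with the repo:
#
# {
#     "hidden_dim": 512,
#     "n_layers": 6,
#     "n_heads": 8,
#     "dim_head": 64,
#     "seq_len": 1024,
#     "vocab_size": null,   # null -> fall back to len(tokenizer)
#     "tokenizer": {"type": null, "from_pretrained": true, "add_padding_token": false},
#     "dataset": {"name": "enwik8"}
# }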

train_args = get_args()
params = get_params(train_args.model)

# tokenizer
tokenizer = get_tokenizer(tokenizer_type=params["tokenizer"].get("type", None),
                          from_pretrained=params["tokenizer"].get("from_pretrained", True),
                          add_padding_token=params["tokenizer"].get("add_padding_token", False))
vocab_size = len(tokenizer) if params["vocab_size"] is None else params["vocab_size"]

# instantiate GPT-like decoder model
model = GPTNeoX(num_tokens=vocab_size,
                dim=params["hidden_dim"],
                seq_len=params["seq_len"],
                depth=params["n_layers"],
                heads=params["n_heads"],
                dim_head=params["dim_head"])
model = AutoregressiveWrapper(model)

# prepare data
dset_params = params["dataset"]
assert dset_params is not None

deepspeed.init_distributed(dist_backend='nccl')
torch.distributed.barrier()  # barrier will force processes to stop until *all* processes have reached the barrier
if is_main(train_args):
    prepare_data(dset_params["name"])
    torch.distributed.barrier()  # main process releases the other ranks once the data is ready
else:
    torch.distributed.barrier()  # non-main ranks wait here while the main process prepares the data
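# The distributed setup above (init_distributed, the barriers, train_args.local_rank)
# assumes the script is started through the DeepSpeed launcher, along the lines of
#
#     deepspeed train.py --model <config-name> --group_name <wandb-group>
#
# The launcher injects --local_rank for every worker; the script filename and the
# exact flags here are illustrative, since they depend on how get_args() is defined.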

if __name__ == '__main__':
    # arguments
    IS_MAIN = is_main(train_args)

    deepspeed.init_distributed(dist_backend='nccl')

    # only display system stats from one worker per machine
    wandb_settings = wandb.Settings() if is_main(train_args) else wandb.Settings(_disable_stats=True)
    name = f'{socket.gethostname()}-{train_args.local_rank}' if train_args.group_name else None

    if train_args.mode == 'no_pipeline':
        model = GPTNeoX(num_tokens=vocab_size,
                        dim=params["hidden_dim"],
                        seq_len=params["seq_len"],
                        depth=params["n_layers"],
                        heads=params["n_heads"],
                        dim_head=params["dim_head"])

    use_wandb = True
    try:
        wandb.init(project="neox_train_enwik8",
                   group=train_args.group_name,
                   name=name,
                   save_code=True,
                   force=False,
                   entity=params.get('wandb', {}).get('team'),
                   settings=wandb_settings)
    except UsageError as e:
        use_wandb = False
        print(e)
        print('Skipping wandb. Execute `wandb login` on local machine to enable.')

    model = AutoregressiveWrapper(model)

    # prepare data
    dset_params = params["dataset"]
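    # ------------------------------------------------------------------------
    # A minimal sketch of one plausible continuation: build a token dataset
    # from dset_params, hand the wrapped model to deepspeed.initialize, and run
    # the training loop. `train_dataset` is an assumption made for illustration;
    # optimizer, batch size and fp16/ZeRO settings would come from the DeepSpeed
    # JSON config rather than from this script.
    # ------------------------------------------------------------------------
    # model_engine, _, train_loader, _ = deepspeed.initialize(args=train_args,
    #                                                         model=model,
    #                                                         model_parameters=model.parameters(),
    #                                                         training_data=train_dataset)
    # for step, batch in enumerate(train_loader):
    #     batch = batch.to(model_engine.local_rank)
    #     loss = model_engine(batch)  # AutoregressiveWrapper's forward returns the LM loss
    #     model_engine.backward(loss)
    #     model_engine.step()
    #     if use_wandb and IS_MAIN:
    #         wandb.log({"loss": loss.item()}, step=step)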