def get_model(args): """Build the model.""" print_rank_0('building GPT2 model ...') model = GPT2Model(num_layers=args.num_layers, vocab_size=args.vocab_size, hidden_size=args.hidden_size, num_attention_heads=args.num_attention_heads, embedding_dropout_prob=args.hidden_dropout, attention_dropout_prob=args.attention_dropout, output_dropout_prob=args.hidden_dropout, max_sequence_length=args.max_position_embeddings, checkpoint_activations=args.checkpoint_activations, checkpoint_num_layers=args.checkpoint_num_layers, parallel_output=False) if mpu.get_data_parallel_rank() == 0: print(' > number of parameters on model parallel rank {}: {}'.format( mpu.get_model_parallel_rank(), sum([p.nelement() for p in model.parameters()])), flush=True) # GPU allocation. model.cuda(torch.cuda.current_device()) # Fp16 conversion. if args.fp16: model = FP16_Module(model) # Wrap model for distributed training. model = DDP(model) return model
def get_model(deepspeed_config_path):
    num_local_heads = 16
    sparse_mode = 'alternating'
    deepspeed_sparsity_config = get_sparse_attention_config(
        deepspeed_config_path, num_local_heads)
    if deepspeed_sparsity_config is not None:
        logger.info(f"Use sparse attention with mode {sparse_mode}")
    else:
        logger.info("Use dense attention")

    model = GPT2Model(num_layers=24,
                      vocab_size=50264,
                      hidden_size=2048,
                      num_attention_heads=num_local_heads,
                      embedding_dropout_prob=0.1,
                      attention_dropout_prob=0.1,
                      output_dropout_prob=0.1,
                      max_sequence_length=2048,
                      checkpoint_activations=False,
                      checkpoint_num_layers=1,
                      parallel_output=False,
                      deepspeed_sparsity_config=deepspeed_sparsity_config,
                      sparse_mode=sparse_mode)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    model = FP16_Module(model)

    return model

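# Hypothetical usage of the hard-coded variant above; the config path is an
# assumption, not from the original. get_sparse_attention_config() reads the
# DeepSpeed JSON config and returns a sparsity config when sparse attention is
# enabled there, otherwise None, in which case dense attention is used.
model = get_model('deepspeed_config.json')
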
def get_model(args): """Build the model.""" print_rank_0('building GPT2 model ...') model = GPT2Model(num_layers=args.num_layers, vocab_size=args.vocab_size, hidden_size=args.hidden_size, num_attention_heads=args.num_attention_heads, embedding_dropout_prob=args.hidden_dropout, attention_dropout_prob=args.attention_dropout, output_dropout_prob=args.hidden_dropout, max_sequence_length=args.max_position_embeddings, max_memory_length=args.mem_length, checkpoint_activations=args.checkpoint_activations, checkpoint_num_layers=args.checkpoint_num_layers, parallel_output=True, relative_encoding=args.transformer_xl) if mpu.get_data_parallel_rank() == 0: print(' > number of parameters on model parallel rank {}: {}'.format( mpu.get_model_parallel_rank(), sum([p.nelement() for p in model.parameters()])), flush=True) # To prevent OOM for model sizes that cannot fit in GPU memory in full precision if hasattr(args, "deepspeed") and args.deepspeed and args.fp16: model.half() # GPU allocation. model.cuda(torch.cuda.current_device()) # Fp16 conversion. if args.fp16: model = FP16_Module(model) # Wrap model for distributed training. if not args.deepspeed: if USE_TORCH_DDP: i = torch.cuda.current_device() model = DDP(model, device_ids=[i], output_device=i, process_group=mpu.get_data_parallel_group()) else: model = DDP(model) return model
def get_model(args): """Build the model.""" print_rank_0('building GPT2 model ...') model = GPT2Model(num_layers=args.num_layers, vocab_size=args.vocab_size, hidden_size=args.hidden_size, num_attention_heads=args.num_attention_heads, embedding_dropout_prob=args.hidden_dropout, attention_dropout_prob=args.attention_dropout, output_dropout_prob=args.hidden_dropout, max_sequence_length=args.max_position_embeddings, checkpoint_activations=args.checkpoint_activations, checkpoint_num_layers=args.checkpoint_num_layers, parallel_output=True) if mpu.get_data_parallel_rank() == 0: print(' > number of parameters on model parallel rank {}: {}'.format( mpu.get_model_parallel_rank(), sum([p.nelement() for p in model.parameters()])), flush=True) # GPU allocation. model.cuda(torch.cuda.current_device()) # Fp16 conversion. if args.fp16: model = FP16_Module(model) # Wrap model for distributed training. if args.DDP_impl == 'torch': i = torch.cuda.current_device() args.DDP_type = torch.nn.parallel.distributed.DistributedDataParallel model = args.DDP_type(model, device_ids=[i], output_device=i, process_group=mpu.get_data_parallel_group()) elif args.DDP_impl == 'local': args.DDP_type = LocalDDP model = args.DDP_type(model) else: print_rank_0('Unknown DDP implementation specified: {}. ' 'Exiting.'.format(args.DDP_impl)) exit() return model
def get_model(args, config, do_fp16=False):
    """Build the model."""

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(**config,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision.
    if args.deepspeed and do_fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if do_fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model

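# Hypothetical call of the config-driven variant above; the dictionary keys
# mirror the GPT2Model constructor arguments used in the other variants, while
# the values and the parsed `args` object are assumptions for illustration only.
gpt2_config = {'num_layers': 24,
               'vocab_size': 50304,
               'hidden_size': 1024,
               'num_attention_heads': 16,
               'embedding_dropout_prob': 0.1,
               'attention_dropout_prob': 0.1,
               'output_dropout_prob': 0.1,
               'max_sequence_length': 1024}
model = get_model(args, gpt2_config, do_fp16=True)
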
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--nsteps', help='Number of Steps for Training',
                    default=100)
parser.add_argument('--print_every', help='Print progress every N steps',
                    default=10)
parser.add_argument('--sample_every', help='Generate a sample every N steps',
                    default=200)
parser.add_argument('--save_every', help='Save a checkpoint every N steps',
                    default=500)
parser.add_argument('--model_type', help="Which model to use for finetuning",
                    default='124M')
args = parser.parse_args()

gpt2_model = GPT2Model(model_type=args.model_type)

# Start finetuning on the first chunk.
gpt2_model.fit(input_path='train_1.txt',
               print_every=int(args.print_every),
               sample_every=int(args.sample_every),
               save_every=int(args.save_every),
               num_steps=int(args.nsteps))

# Load the tuned model and finetune on the remaining chunks.
for i in range(2, 10):
    gpt2_model.fit(input_path='train_' + str(i) + '.txt',
                   overwrite=True,
                   restore_from='latest',
                   print_every=int(args.print_every),
                   sample_every=int(args.sample_every),
                   save_every=int(args.save_every),
                   num_steps=int(args.nsteps))