def run_mp_worker(args, available_workers):
    """Build a ``Pipe.MultiProcess`` pipeline over the benchmark model and run it.

    Args:
        args: Benchmark options. This function reads ``chunks``,
            ``pipelined_backward``, ``checkpoint`` and ``all_at_once`` from it
            and forwards the whole object to ``make_model_and_data`` and to
            the training / benchmark routine.
        available_workers: Number of workers available for pipeline stages.
            The layer balance is generated for ``min(available_workers, 8)``
            devices.
    """
    # Hard-coded switch: True selects the ``train`` path below.
    new_data = True

    blob = make_model_and_data(args, None, new_data=new_data)
    model = blob["model"]

    balance = generate_balance(min(available_workers, 8), len(model))
    p = pipe.Pipe(
        model,
        balance,
        style=Pipe.MultiProcess,
        chunks=args.chunks,
        worker_map=get_worker_map(),
        input_device=torch.cuda.current_device(),
        pipelined_backward=args.pipelined_backward,
        checkpoint=args.checkpoint,
    ).cuda()

    if args.all_at_once and p.pipeline:
        print("running all at once")
        p.pipeline.all_at_once = True

    if new_data:
        train(blob["data"], p, blob["criterion"], blob["optimizer"], blob["vocab_size"], args)
    else:
        ntokens, train_data, val_data, test_data = blob["data"]
        # ``criterion`` / ``optimizer`` were previously referenced as bare
        # names that are never bound in this function; read them from the
        # blob like the training branch does.
        # NOTE(review): assumes make_model_and_data also returns these keys
        # when new_data is False — confirm against its implementation.
        benchmark_language_model(
            train_data,
            val_data,
            test_data,
            p,
            blob["criterion"],
            blob["optimizer"],
            ntokens,
            args,
        )
def bench_single_process(args):
    """Benchmark the model in a single process, pipelined across CUDA devices.

    Args:
        args: Benchmark options. This function reads ``chunks``,
            ``pipelined_backward`` and ``checkpoint`` from it and forwards
            the whole object to ``make_model_and_data`` and to the training /
            benchmark routine.

    Raises:
        AssertionError: If ``torch.cuda.device_count()`` reports no devices.
    """
    num_devices = torch.cuda.device_count()
    assert num_devices > 0
    init_random_seed(0)

    # Hard-coded switch: True selects the ``train`` path below.
    new_data = True

    blob = make_model_and_data(args, None, new_data=new_data)
    model = blob["model"]

    # The layer balance is generated for at most 8 devices.
    balance = generate_balance(min(num_devices, 8), len(model))
    p = pipe.Pipe(
        model,
        balance,
        chunks=args.chunks,
        pipelined_backward=args.pipelined_backward,
        checkpoint=args.checkpoint,
    )
    # Remove this function's own handles on the un-piped model (the local
    # name and the blob entry) before running.
    del model
    del blob["model"]

    if new_data:
        train(blob["data"], p, blob["criterion"], blob["optimizer"], blob["vocab_size"], args)
    else:
        ntokens, train_data, val_data, test_data = blob["data"]
        # ``criterion`` / ``optimizer`` were previously referenced as bare
        # names that are never bound in this function; read them from the
        # blob like the training branch does.
        # NOTE(review): assumes make_model_and_data also returns these keys
        # when new_data is False — confirm against its implementation.
        benchmark_language_model(
            train_data,
            val_data,
            test_data,
            p,
            blob["criterion"],
            blob["optimizer"],
            ntokens,
            args,
        )
def run_mp_worker(args, available_workers):
    """Build a ``Pipe.AsyncSchedule`` pipeline over the benchmark model and run it.

    Args:
        args: Benchmark options. This function reads ``chunks``,
            ``pipelined_backward``, ``checkpoint`` and ``all_at_once`` from it
            and forwards the whole object to ``make_model_and_data`` and to
            the training / benchmark routine.
        available_workers: Not referenced by this function; the number of
            stages comes from ``get_pipeline_parallel_group().size()``.
    """
    # Hard-coded switch: True selects the ``train`` path below.
    new_data = True

    blob = make_model_and_data(args, None, new_data=new_data)
    model = blob["model"]

    balance = generate_balance_weighted(get_pipeline_parallel_group().size(), len(model), 0.8)
    p = pipe.Pipe(
        model,
        balance,
        style=Pipe.AsyncSchedule,
        chunks=args.chunks,
        worker_map=get_worker_map(),
        input_device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
        pipelined_backward=args.pipelined_backward,
        checkpoint=args.checkpoint,
        # loss_fn=blob["criterion"],
    )
    # Only move the pipeline to the GPU when one is present.
    if torch.cuda.is_available():
        p = p.cuda()

    if args.all_at_once and p.pipeline:
        print("running all at once")
        p.pipeline.all_at_once = True

    if new_data:
        train(blob["data"], p, blob["criterion"], blob["optimizer"], blob["vocab_size"], args)
    else:
        ntokens, train_data, val_data, test_data = blob["data"]
        # ``criterion`` / ``optimizer`` were previously referenced as bare
        # names that are never bound in this function; read them from the
        # blob like the training branch does.
        # NOTE(review): assumes make_model_and_data also returns these keys
        # when new_data is False — confirm against its implementation.
        benchmark_language_model(
            train_data,
            val_data,
            test_data,
            p,
            blob["criterion"],
            blob["optimizer"],
            ntokens,
            args,
        )
def run_mp_worker(args, available_workers):
    """Run the ``args.model_name`` benchmark through a ``Pipe.AsyncSchedule`` pipeline.

    Args:
        args: Benchmark options. This function reads ``model_name``,
            ``chunks``, ``pipelined_backward``, ``checkpoint``,
            ``all_at_once`` and ``use_synthetic_data`` from it and forwards
            the whole object to the config factories and the selected runner.
        available_workers: Not referenced by this function; the number of
            stages comes from ``get_pipeline_parallel_group().size()``.
    """
    benchmark_config = create_benchmark_config(args.model_name)
    model_config = create_model_config(args, config=benchmark_config)
    layers = model_config["model"]

    stage_count = get_pipeline_parallel_group().size()
    layer_split = generate_balance_weighted(stage_count, len(layers), 0.8)

    worker_map = get_worker_map()
    if torch.cuda.is_available():
        input_device = torch.device("cuda")
    else:
        input_device = torch.device("cpu")

    pipeline_module = pipe.Pipe(
        layers,
        layer_split,
        style=Pipe.AsyncSchedule,
        chunks=args.chunks,
        worker_map=worker_map,
        input_device=input_device,
        pipelined_backward=args.pipelined_backward,
        checkpoint=args.checkpoint,
        # TODO(anj-s): Do we need to comment this out? loss_fn=benchmark_config["criterion"],
    )
    # Only move the pipeline to the GPU when one is present.
    if torch.cuda.is_available():
        pipeline_module = pipeline_module.cuda()

    if args.all_at_once:
        if pipeline_module.pipeline:
            print(f"running all at once")
            pipeline_module.pipeline.all_at_once = True

    # Both runners take the same four arguments, so pick one and call it once.
    runner = train if args.use_synthetic_data else benchmark_language_model
    runner(model_config, pipeline_module, benchmark_config, args)
def benchmark_single_process(args):
    """Benchmark a given model using a single process and multiple devices.

    Args:
        args: Benchmark options. This function reads ``model_name``,
            ``chunks``, ``pipelined_backward``, ``checkpoint`` and
            ``dry_run`` from it and forwards the whole object to the config
            factories and the selected runner.

    Raises:
        AssertionError: If the resolved device count is not positive.
    """
    # Without CUDA the model is balanced as if for a single device.
    if torch.cuda.is_available():
        device_count = torch.cuda.device_count()
    else:
        device_count = 1
    assert device_count > 0

    init_random_seed(0)

    benchmark_config = create_benchmark_config(args.model_name)
    model_config = create_model_config(args, config=benchmark_config)

    layers = model_config["model"]
    # The layer balance is generated for at most 4 devices.
    layer_split = generate_balance(min(device_count, 4), len(layers))
    pipelined = pipe.Pipe(
        layers,
        layer_split,
        chunks=args.chunks,
        pipelined_backward=args.pipelined_backward,
        checkpoint=args.checkpoint,
    )
    # Remove this function's own handles on the un-piped model (the local
    # name and the config entry) before running.
    del layers
    del model_config["model"]

    # Both runners take the same four arguments, so pick one and call it once.
    runner = train if args.dry_run else benchmark_language_model
    runner(model_config, pipelined, benchmark_config, args)
# NOTE(review): this statement is the tail of a definition that starts above
# this excerpt; it is kept verbatim.
print("No regression detected")


def generate_balance(num_devices, num_layers):
    """Split ``num_layers`` layers across ``num_devices`` devices as evenly as possible.

    Devices are filled in order; each one receives the ceiling of
    (layers still unassigned) / (devices still unfilled), so any surplus
    layers land on the earlier devices.

    Args:
        num_devices: Number of entries to produce.
        num_layers: Total number of layers to distribute.

    Returns:
        A list of ``num_devices`` ints, one layer count per device. The list
        is empty when ``num_devices`` is 0; trailing entries are 0 when there
        are fewer layers than devices.
    """
    balance = []
    layers_assigned = 0
    for i in range(num_devices):
        remaining_layers = num_layers - layers_assigned
        remaining_devices = num_devices - i
        # Integer ceiling division: exact for any size, and keeps the running
        # total an int (float division loses precision for large counts).
        share = -(-remaining_layers // remaining_devices)
        balance.append(share)
        layers_assigned += share
    return balance


if __name__ == "__main__":
    num_devices = torch.cuda.device_count()
    assert num_devices > 0

    torch.manual_seed(0)
    device = torch.device("cuda")
    ntokens, train_data, val_data, test_data = get_data(device)
    model, criterion, optimizer = make_model(device, ntokens)
    # The layer balance is generated for at most 4 devices.
    balance = generate_balance(min(num_devices, 4), len(model))
    p = pipe.Pipe(model, balance)
    benchmark_language_model(train_data, val_data, test_data, p, criterion, optimizer, ntokens)
    del p