import paddle.distributed as dist


def launch(main_func, num_gpus_per_machine, num_machines=1, machine_rank=0,
           ips=None, args=()):
    """
    Launch multi-GPU or distributed training.
    This function must be called on all machines involved in the training.
    It will spawn child processes (defined by ``num_gpus_per_machine``) on each machine.

    Args:
        main_func: a function that will be called by ``main_func(*args)``
        num_gpus_per_machine (int): number of GPUs per machine
        num_machines (int): the total number of machines
        machine_rank (int): the rank of this machine
        ips (str): comma-separated IP addresses of the cluster nodes for
            distributed jobs, e.g. "192.168.0.1,192.168.0.2". Defaults to
            the local machine.
        args (tuple): arguments passed to ``main_func``
    """
    world_size = num_machines * num_gpus_per_machine
    if world_size > 1:
        options = {}
        if ips is not None:
            options.update({'ips': ips})
        dist.spawn(main_func, nprocs=num_gpus_per_machine, args=args, **options)
    else:
        main_func(*args)
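# A minimal driver for the `launch` helper above. Everything in this sketch
# (`demo` and its message argument) is hypothetical and not part of the
# original snippet; it only illustrates the calling convention and reuses
# `launch` and `dist` from the definition above.
def demo(message):
    # each spawned worker joins the parallel environment and reports its rank
    dist.init_parallel_env()
    print('rank {}: {}'.format(dist.get_rank(), message))

if __name__ == '__main__':
    # two workers on the local machine; with world_size == 1, `launch`
    # falls back to a plain function call instead of spawning
    launch(demo, num_gpus_per_machine=2, args=('hello',))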
def test_spawn(self):
    # each spawned worker's return value is collected through the
    # context's per-process return queues
    context = dist.spawn(train, backend='cncl', nprocs=4)
    rank_list = []
    for i in range(4):
        rank_list.append(context.return_queues[i].get())
    rank_list.sort()
    self.assertEqual(rank_list, list(range(4)))
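# A `train` callable consistent with the test above would simply return its
# rank; paddle's spawn puts each worker's return value on the matching entry
# of `context.return_queues`. This stub is an assumption, not the original
# test fixture.
def train():
    dist.init_parallel_env()
    return dist.get_rank()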
def main():
    args = parser.parse_args()
    os.makedirs(args.save, exist_ok=True)

    # save the configurations
    t = time.localtime()
    timestamp = time.strftime('%b-%d-%Y_%H%M', t)
    with open(os.path.join(args.save, 'args-{}.txt'.format(timestamp)), 'w') as fh:
        json.dump(args.__dict__, fh, indent=2)
    print('Start at: {}'.format(timestamp))

    # show non-default args
    default_args = parser.parse_args([args.data, args.save])
    for key in args.__dict__:
        if args.__dict__[key] != default_args.__dict__[key]:
            print('{}: {} | default ({})'.format(
                key, args.__dict__[key], default_args.__dict__[key]))

    if args.seed is not None:
        random.seed(args.seed)
        paddle.seed(args.seed)
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    # paddle keeps one CUDA RNG state per visible device, so the length of
    # this list equals the number of GPUs on the node
    ngpus_per_node = len(paddle.get_cuda_rng_state())
    print('ngpus per node is {}'.format(ngpus_per_node))

    if args.distributed:
        dist.spawn(main_worker, nprocs=ngpus_per_node,
                   args=(args.gpu, ngpus_per_node, args), started_port=6671)
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
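# `main_worker` is defined elsewhere in the original program; a hypothetical
# skeleton consistent with the call signature used above:
def main_worker(gpu, ngpus_per_node, args):
    # each process joins the parallel environment, then builds and trains
    # its model replica (details omitted)
    dist.init_parallel_env()
    ...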
def main(config, args):
    if args.nprocs > 1 and args.device == "gpu":
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)
def train(): # 1. initialize parallel environment (cpu & gpu) dist.init_parallel_env() # 2. set cpu place paddle.set_device('cpu') # 3. create data parallel layer & optimizer layer = LinearNet() dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters()) # 4. run layer inputs = paddle.randn([10, 10], 'float32') outputs = dp_layer(inputs) labels = paddle.randn([10, 1], 'float32') loss = loss_fn(outputs, labels) loss.backward() adam.step() adam.clear_grad() if __name__ == '__main__': dist.spawn(train, nprocs=2)
def train():
    dist.init_parallel_env()

    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.CrossEntropyLoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # create data loader
    dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
    loader = paddle.io.DataLoader(dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=1)

    # train
    for epoch_id in range(EPOCH_NUM):
        for batch_id, (image, label) in enumerate(loader()):
            # run the forward pass through the DataParallel wrapper, not the
            # bare layer, so gradients are synchronized across processes
            out = dp_layer(image)
            loss = loss_fn(out, label)

            loss.backward()
            adam.step()
            adam.clear_grad()

            if dist.get_rank() == 0:
                print("Epoch {} batch {}: loss = {}".format(
                    epoch_id, batch_id, np.mean(loss.numpy())))


if __name__ == '__main__':
    # `selected_gpus` is the pre-2.0 spelling of this option; recent Paddle
    # releases call it `gpus`
    dist.spawn(train, nprocs=2, selected_gpus='2,3')
def train():
    # 1. enable dynamic mode
    paddle.disable_static()

    # 2. initialize parallel environment
    dist.init_parallel_env()

    # 3. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 4. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)

    # scale_loss / apply_collective_grads are the older (pre-2.0) dygraph
    # data-parallel idiom; later releases handle both steps automatically
    loss = dp_layer.scale_loss(loss)
    loss.backward()
    dp_layer.apply_collective_grads()

    adam.step()
    adam.clear_grad()


if __name__ == '__main__':
    dist.spawn(train)
            train_step += 1

        # also save a checkpoint every fixed number of steps
        if batch_id % 2000 == 0 and batch_id != 0 and dist.get_rank() == 0:
            # save the model
            save_model(args=args, epoch=epoch, model=model, optimizer=optimizer)

    # in multi-GPU training, only one process runs evaluation and saves the model
    if dist.get_rank() == 0:
        # run evaluation
        model.eval()
        cer = evaluate(model, test_loader, greedy_decoder)
        print('[%s] Test epoch %d, cer: %f' % (datetime.now(), epoch, cer))
        writer.add_scalar('Test cer', cer, test_step)
        test_step += 1
        model.train()

        # log the learning rate
        writer.add_scalar('Learning rate', scheduler.last_lr, epoch)

        # save the model
        save_model(args=args, epoch=epoch, model=model, optimizer=optimizer)
    scheduler.step()


if __name__ == '__main__':
    print_arguments(args)
    dist.spawn(train, args=(args, ))
def train():
    # 1. initialize parallel environment
    dist.init_parallel_env()

    # 2. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 3. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)

    loss.backward()
    adam.step()
    adam.clear_grad()

    if dist.get_rank() == 0:
        print("loss:", loss.numpy())
    # paddle.jit.save(dp_layer, "spawn_model/linear",
    #                 input_spec=[InputSpec(shape=[None, 10], dtype='float32')])


if __name__ == '__main__':
    dist.spawn(train, nprocs=4)
            # in multi-GPU training, only one process prints
            if batch_id % 100 == 0 and dist.get_rank() == 0:
                eta_sec = ((time.time() - start) * 1000) * (
                    sum_batch - (epoch - last_epoch) * len(train_loader) - batch_id)
                eta_str = str(timedelta(seconds=int(eta_sec / 1000)))
                print('[%s] Train epoch %d, batch: %d/%d, loss: %f, accuracy: %f, eta: %s' % (
                    datetime.now(), epoch, batch_id, len(train_loader),
                    sum(loss_sum) / len(loss_sum), sum(accuracies) / len(accuracies), eta_str))
                writer.add_scalar('Train loss', loss, train_step)
                train_step += 1
                loss_sum = []

        # in multi-GPU training, only one process runs evaluation and saves the model
        if dist.get_rank() == 0:
            acc = test(model, metric_fc, test_loader)
            print('=' * 70)
            print('[%s] Test %d, accuracy: %f' % (datetime.now(), epoch, acc))
            print('=' * 70)
            writer.add_scalar('Test acc', acc, test_step)
            # log the learning rate
            writer.add_scalar('Learning rate', scheduler.last_lr, epoch)
            test_step += 1
            save_model(args, epoch, model, metric_fc, optimizer)
        scheduler.step()


if __name__ == '__main__':
    print_arguments(args)
    if len(args.gpus.split(',')) > 1:
        dist.spawn(train, args=(args,), gpus=args.gpus,
                   nprocs=len(args.gpus.split(',')))
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
        train(args)
            # in multi-GPU training, only one process prints
            if batch_id % 100 == 0 and dist.get_rank() == 0:
                eta_sec = ((time.time() - start) * 1000) * (
                    sum_batch - (epoch - last_epoch) * len(train_loader) - batch_id)
                eta_str = str(timedelta(seconds=int(eta_sec / 1000)))
                print('[%s] Train epoch %d, batch: %d/%d, loss: %f, accuracy: %f, eta: %s' % (
                    datetime.now(), epoch, batch_id, len(train_loader),
                    sum(loss_sum) / len(loss_sum), sum(accuracies) / len(accuracies), eta_str))
                writer.add_scalar('Train loss', loss, train_step)
                train_step += 1
                loss_sum = []

        # in multi-GPU training, only one process runs evaluation and saves the model
        if dist.get_rank() == 0:
            print('=' * 70)
            acc = test(model)
            print('[%s] Test %d, accuracy: %f' % (datetime.now(), epoch, acc))
            print('=' * 70)
            writer.add_scalar('Test acc', acc, test_step)
            # log the learning rate
            writer.add_scalar('Learning rate', scheduler.last_lr, epoch)
            test_step += 1
            save_model(args, epoch, model, metric_fc, optimizer)
        scheduler.step()

    # save the final model after the last epoch
    save_model(args, args.num_epoch, model, metric_fc, optimizer)


if __name__ == '__main__':
    print_arguments(args)
    if len(args.gpus.split(',')) > 1:
        dist.spawn(train, args=(args,), gpus=args.gpus)
    else:
        train(args)
def train_from_folder(
    data='./data',
    results_dir='./results',
    models_dir='./models',
    name='default',
    new=False,
    load_from=-1,
    image_size=128,
    network_capacity=16,
    transparent=False,
    batch_size=5,
    gradient_accumulate_every=6,
    num_train_steps=150000,
    learning_rate=2e-4,
    lr_mlp=0.1,
    ttur_mult=1.5,
    rel_disc_loss=False,
    num_workers=None,
    save_every=1000,
    generate=False,
    generate_interpolation=False,
    interpolation_num_steps=100,
    save_frames=False,
    num_image_tiles=8,
    trunc_psi=0.75,
    mixed_prob=0.9,
    fp16=False,
    cl_reg=False,
    fq_layers=[],
    fq_dict_size=256,
    attn_layers=[],
    no_const=False,
    aug_prob=0.,
    aug_types=['translation', 'cutout'],
    top_k_training=False,
    generator_top_k_gamma=0.99,
    generator_top_k_frac=0.5,
    dataset_aug_prob=0.,
    multi_gpus=False,
    calculate_fid_every=None,
    seed=42,
    use_shared_memory=True,  # set to False if /dev/shm is limited
):
    model_args = dict(
        name=name,
        results_dir=results_dir,
        models_dir=models_dir,
        batch_size=batch_size,
        gradient_accumulate_every=gradient_accumulate_every,
        image_size=image_size,
        network_capacity=network_capacity,
        transparent=transparent,
        lr=learning_rate,
        lr_mlp=lr_mlp,
        ttur_mult=ttur_mult,
        rel_disc_loss=rel_disc_loss,
        num_workers=num_workers,
        save_every=save_every,
        trunc_psi=trunc_psi,
        fp16=fp16,
        cl_reg=cl_reg,
        fq_layers=fq_layers,
        fq_dict_size=fq_dict_size,
        attn_layers=attn_layers,
        no_const=no_const,
        aug_prob=aug_prob,
        aug_types=cast_list(aug_types),
        top_k_training=top_k_training,
        generator_top_k_gamma=generator_top_k_gamma,
        generator_top_k_frac=generator_top_k_frac,
        dataset_aug_prob=dataset_aug_prob,
        calculate_fid_every=calculate_fid_every,
        mixed_prob=mixed_prob)

    if generate:
        model = Trainer(**model_args)
        model.load(load_from)
        samples_name = timestamped_filename()
        model.evaluate(samples_name, num_image_tiles)
        print(f'sample images generated at {results_dir}/{name}/{samples_name}')
        return

    if generate_interpolation:
        model = Trainer(**model_args)
        model.load(load_from)
        samples_name = timestamped_filename()
        model.generate_interpolation(samples_name, num_image_tiles,
                                     num_steps=interpolation_num_steps,
                                     save_frames=save_frames)
        print(f'interpolation generated at {results_dir}/{name}/{samples_name}')
        return

    world_size = dist.get_world_size()
    if world_size == 1 or not multi_gpus:
        run_training(0, 1, model_args, data, load_from, new, num_train_steps,
                     name, seed, use_shared_memory)
        return

    dist.spawn(run_training,
               args=(world_size, model_args, data, load_from, new,
                     num_train_steps, name, seed, use_shared_memory),
               nprocs=world_size,
               join=True)
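# Note: paddle.distributed.spawn passes `args` to the target verbatim;
# unlike torch.multiprocessing.spawn it does not prepend the worker's rank.
# In the spawned path above, `run_training` (defined elsewhere) therefore
# has to recover its rank via dist.get_rank() rather than from its first
# positional argument, which is only supplied in the single-process call.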
# module-level sink for per-step losses; assumed here, the original test
# defines it elsewhere
train_data_list1 = []


def train(print_result=False):
    # 1. initialize parallel environment
    dist.init_parallel_env()

    # 2. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 3. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)
    assert len(loss) == 1

    if print_result is True:
        train_data_list1.append(loss.numpy())
        assert len(train_data_list1)

    loss.backward()
    adam.step()
    adam.clear_grad()


if __name__ == '__main__':
    dist.spawn(train, args=(True, ))
    dist.spawn(train)
    dist.spawn(train, args=(True, ), nprocs=2, gpus='0,1')
    dist.spawn(train, args=(True, ), nprocs=2)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)

    if print_result is True:
        print("loss:", loss.numpy())

    loss.backward()
    adam.step()
    adam.clear_grad()


# Usage 1: only pass function.
# If your training method does not need any argument, and
# uses all visible devices for parallel training.
if __name__ == '__main__':
    dist.spawn(train)

# Usage 2: pass function and arguments.
# If your training method needs some arguments, and
# uses all visible devices for parallel training.
if __name__ == '__main__':
    dist.spawn(train, args=(True,))

# Usage 3: pass function, arguments and nprocs.
# If your training method needs some arguments, and
# only uses part of the visible devices for parallel training.
# If your machine holds 8 cards {0,1,2,3,4,5,6,7},
# this case will use cards {0,1}; if you set
# CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use
# cards {4,5}.
if __name__ == '__main__':
    dist.spawn(train, args=(True,), nprocs=2)
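# A fourth pattern, sketched here by analogy with the `gpus` option used in
# other snippets in this section (not part of the original excerpt): pass
# function, arguments, nprocs and gpus to pin training to specific cards,
# e.g. cards {4,5} on an 8-GPU machine, without touching CUDA_VISIBLE_DEVICES.
if __name__ == '__main__':
    dist.spawn(train, args=(True,), nprocs=2, gpus='4,5')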
def _set_proc_name(config, tag_base):
    """
    set process name on local machine
    """
    if config.general.is_cloud:
        return
    if tag_base.startswith('train'):
        tag_base = 'train'
    import setproctitle
    setproctitle.setproctitle(tag_base + '_' +
                              config.data.output.rstrip('/').split('/')[-1])


if __name__ == "__main__":
    config = global_config.gen_config()
    init_env(config)
    run_mode = config.general.mode

    if run_mode == 'preproc':
        preprocess(config)
        sys.exit(0)

    _set_proc_name(config, run_mode)

    if run_mode == 'test':
        evaluate(config)
    elif run_mode == 'infer':
        inference(config)
    elif run_mode.startswith('train'):
        if config.train.use_data_parallel:
            dist.spawn(train, args=(config, ))
        else:
            train(config)
    if args.do_train:
        # If do_eval=True, use the best model to evaluate the test data.
        # Otherwise, use the final model to evaluate the test data.
        if args.do_eval:
            args.init_from_ckpt = os.path.join(args.output_dir, 'best')
            load_ckpt(args, model)
    else:
        if not args.init_from_ckpt:
            raise ValueError('"init_from_ckpt" should be set.')
        load_ckpt(args, model)

    print('\nTest begin...')
    evaluation(args, model, test_data_loader, metric)


def print_args(args):
    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


if __name__ == '__main__':
    args = parse_args()
    set_default_args(args)
    print_args(args)

    if args.n_gpu > 1:
        dist.spawn(main, args=(args, ), nprocs=args.n_gpu)
    else:
        main(args)
def train(world_size=2):
    if world_size > 1:
        dist.spawn(do_train, nprocs=world_size, args=())
    else:
        do_train()
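# A `do_train` body consistent with this launcher (hypothetical, for
# illustration only):
def do_train():
    # each worker joins the parallel environment and trains its shard
    dist.init_parallel_env()
    print('worker {} of {}'.format(dist.get_rank(), dist.get_world_size()))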