def run(args):
    if args.do_train:
        train_dataloader, processor = load_example(args, 'train')
        num_label = len(processor.get_labels())
        model = RemBertForSequenceClassification.from_pretrained(
            args.model_type, num_classes=num_label)
        if nranks > 1:
            dist.init_parallel_env()
            model = paddle.DataParallel(model)
        num_train_steps_per_epoch = len(
            train_dataloader) // args.gradient_accumulation_steps
        num_train_steps = int(num_train_steps_per_epoch *
                              args.num_train_epochs)
        trainer = Trainer(args,
                          model=model,
                          dataloader=train_dataloader,
                          num_train_steps=num_train_steps,
                          step_callback=evaluate)
        trainer.train()
    if args.do_eval:
        model = RemBertForSequenceClassification.from_pretrained(
            args.output_dir)
        evaluate(model, args)

def train(print_result=True):
    """train"""
    # 1. initialize parallel environment
    train_data_list1 = []
    train_data_list2 = []
    dist.init_parallel_env()

    # 2. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)
    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 3. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)
    assert len(loss) == 1
    if print_result is True:
        train_data_list1.append(loss.numpy())
        assert len(train_data_list1)

    loss.backward()
    adam.step()
    adam.clear_grad()

def run_trainer_func(self, args):
    if fluid.core.is_compiled_with_cuda():
        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        place = fluid.CUDAPlace(device_id)
    else:
        raise AssertionError("Only support CUDAPlace for now.")

    with fluid.dygraph.guard(place):
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        np.random.seed(seed)
        random.seed(seed)
        model, train_reader, opt = self.get_model()

        if args.update_method == "nccl2":
            dist.init_parallel_env()
            print_to_err(
                type(self).__name__,
                "begin to prepare context in dygraph with nccl2")
            model = paddle.DataParallel(
                model, find_unused_parameters=args.find_unused_parameters)
            print_to_err(type(self).__name__, "model built in dygraph")

        out_losses = self.model_train(args, model, opt, train_reader)
        print_to_out(out_losses)
        return out_losses

def train():
    # init env
    dist.init_parallel_env()

    # create network
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)
    loss_fn = nn.CrossEntropyLoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # create data loader
    dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
    loader = paddle.io.DataLoader(dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=1)

    # train
    for epoch_id in range(EPOCH_NUM):
        for batch_id, (image, label) in enumerate(loader()):
            # forward through the DataParallel wrapper so gradients are
            # synchronized across cards
            out = dp_layer(image)
            loss = loss_fn(out, label)

            loss.backward()
            adam.step()
            adam.clear_grad()

            if dist.get_rank() == 0:
                print("Epoch {} batch {}: loss = {}".format(
                    epoch_id, batch_id, np.mean(loss.numpy())))

def train():
    # initialize the multi-GPU environment
    dist.init_parallel_env()
    train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                   transform=transform)
    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=32,
                                                      shuffle=True)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_sampler=batch_sampler)
    model = paddle.vision.mobilenet_v2(num_classes=10)
    # wrap the model for multi-GPU training
    model = paddle.DataParallel(model)
    # set up the optimizer
    optimizer = paddle.optimizer.SGD(parameters=model.parameters(),
                                     learning_rate=0.1,
                                     weight_decay=5e-4)
    # define the loss function
    loss = paddle.nn.CrossEntropyLoss()
    # start training
    for epoch in range(10):
        for batch_id, (img, label) in enumerate(train_loader()):
            output = model(img)
            # compute the loss
            los = loss(output, label)
            los.backward()
            if dist.get_rank() == 0:
                print("Epoch {}: batch_id {}, loss {}".format(
                    epoch, batch_id, los))
            optimizer.step()
            optimizer.clear_grad()

def train(print_result=False):
    # 1. enable dynamic mode
    paddle.disable_static()

    # 2. initialize parallel environment
    dist.init_parallel_env()

    # 3. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)
    loss_fn = nn.MSELoss()
    adam = opt.Adam(
        learning_rate=0.001, parameters=dp_layer.parameters())

    # 4. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)

    if print_result is True:
        print("loss:", loss.numpy())

    loss.backward()
    adam.step()
    adam.clear_grad()

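# A hedged launch sketch (an addition, not part of the original snippets):
# a train() entry point like the one above is normally started with one
# worker process per device. paddle.distributed.spawn is one way to do that
# from Python; the process count of 2 below is an assumption made for
# illustration only.
def launch_train_with_spawn():
    import paddle.distributed as dist
    # starts 2 processes, each of which calls train(print_result=True)
    dist.spawn(train, args=(True,), nprocs=2)
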
def train(print_result=False):
    # 1. initialize parallel environment
    dist.init_parallel_env()

    # 2. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)
    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 3. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)

    if print_result is True:
        print("Rank:", int(os.getenv("PADDLE_TRAINER_ID")))

    loss.backward()
    adam.step()
    adam.clear_grad()

    return int(os.getenv("PADDLE_TRAINER_ID"))

def train():
    # 1. enable dynamic mode
    paddle.disable_static()

    # 2. initialize parallel environment
    dist.init_parallel_env()

    # 3. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)
    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 4. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)

    loss = dp_layer.scale_loss(loss)
    loss.backward()
    dp_layer.apply_collective_grads()

    adam.step()
    adam.clear_grad()

def train(print_result=True):
    # 1. enable dynamic mode
    # device = paddle.set_device('gpu')
    # paddle.disable_static(device)

    # 2. initialize parallel environment
    dist.init_parallel_env()

    # 3. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)
    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    dataset = FakeDataset()
    # loader = paddle.io.DataLoader(dataset, batch_size=2, places=device, num_workers=2)
    loader = paddle.io.DataLoader(dataset, batch_size=2, num_workers=2)

    # 4. run layer
    for inputs, labels in loader:
        # inputs = paddle.randn([10, 10], 'float32')
        outputs = dp_layer(inputs)
        # labels = paddle.randn([10, 1], 'float32')
        loss = loss_fn(outputs, labels)

        if print_result is True:
            print("loss:", loss.numpy())

        # loss = dp_layer.scale_loss(loss)
        loss.backward()
        # dp_layer.apply_collective_grads()

        adam.step()
        adam.clear_grad()

def test_check_env_failed(self):
    os.environ['FLAGS_selected_gpus'] = '0'
    os.environ['PADDLE_TRAINER_ID'] = '0'
    os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170'
    os.environ['PADDLE_TRAINERS_NUM'] = '2'
    with self.assertRaises(ValueError):
        dist.init_parallel_env()

def test_multiple_gpus(self):
    self.trainer_id = dist.get_rank()
    dist.init_parallel_env()

    model_a = SimpleNet(self.trainer_id, 0)
    model_b = SimpleNet(self.trainer_id, 1)

    state_dict = model_a.state_dict()
    model_b.set_state_dict(state_dict)

    model_a = paddle.DataParallel(model_a)
    model_b = paddle.DataParallel(model_b)

    for step in range(10):
        x_data = np.random.randn(batch, in_dim).astype(np.float32)
        x = paddle.to_tensor(x_data)
        x.stop_gradient = False

        with model_a.no_sync():
            y_pred_a = model_a(x)
            loss_a = y_pred_a.mean()
            loss_a.backward()
        fused_allreduce_gradients(list(model_a.parameters()), None)

        y_pred_b = model_b(x)
        loss_b = y_pred_b.mean()
        loss_b.backward()

        self.check_gradient(model_a.parameters())
        self.check_gradient(model_b.parameters())

        self.check_acc(model_a._layers.w.grad, model_b._layers.w.grad)

        model_a.clear_gradients()
        model_b.clear_gradients()

def finetune(args):
    paddle.set_device(args.device)
    if dist.get_world_size() > 1:
        dist.init_parallel_env()

    pos_file = os.path.join(args.data_dir, 'rt-polarity.pos')
    neg_file = os.path.join(args.data_dir, 'rt-polarity.neg')
    x_text, y = load_data_and_labels(pos_file, neg_file)
    x_train, x_test, y_train, y_test = train_test_split(
        x_text, y, test_size=0.1, random_state=args.seed)

    if not args.init_from_ckpt:
        raise ValueError('`init_from_ckpt` should be set.')
    model = ELMoBowTextClassification(args.init_from_ckpt, args.batch_size,
                                      args.sent_embedding_dim, args.dropout,
                                      args.num_classes)
    if dist.get_world_size() > 1:
        model = paddle.DataParallel(model)
    model.train()

    adam = paddle.optimizer.Adam(parameters=model.parameters(),
                                 learning_rate=args.lr,
                                 weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss()

    vocab = load_vocab()

    train_dataset = SentencePolarityDatasetV1(x_train, y_train, vocab,
                                              args.max_seq_len)
    test_dataset = SentencePolarityDatasetV1(x_test, y_test, vocab,
                                             args.max_seq_len)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              return_list=True,
                              shuffle=True,
                              collate_fn=lambda batch: generate_batch(batch))
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             return_list=True,
                             shuffle=False,
                             collate_fn=lambda batch: generate_batch(batch))

    for epoch in range(args.epochs):
        print('Epoch {}/{}'.format(epoch + 1, args.epochs))
        for step, batch_data in enumerate(train_loader, start=1):
            ids, ids_reverse, label = batch_data

            output = model((ids, ids_reverse))
            loss = criterion(output, label)
            loss.backward()
            adam.step()
            adam.clear_grad()

            if step % args.logging_step == 0:
                print('step {}, loss {}'.format(step, loss.numpy()[0]))

        acc = test(model, test_loader)
        print('\ntest acc {}\n'.format(acc))

def train():
    # enable dygraph mode
    paddle.disable_static()
    dist.init_parallel_env()

    # create network
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)
    loss_fn = nn.CrossEntropyLoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())
    # print(core._get_device_properties(dist.ParallelEnv().device_id))

    # create data loader
    # loader = paddle.io.DataLoader.from_generator(capacity=5, use_multiprocess=True)
    loader = paddle.io.DataLoader.from_generator(capacity=5)
    loader.set_batch_generator(random_batch_reader())

    for epoch_id in range(EPOCH_NUM):
        for batch_id, (image, label) in enumerate(loader()):
            out = layer(image)
            loss = loss_fn(out, label)

            loss = dp_layer.scale_loss(loss)
            loss.backward()
            dp_layer.apply_collective_grads()

            adam.step()
            adam.clear_grad()
            print("Epoch {} batch {}: loss = {}".format(
                epoch_id, batch_id, np.mean(loss.numpy())))

def main(config, device, logger, vdl_writer):
    # init dist environment
    if config['Global']['distributed']:
        dist.init_parallel_env()

    global_config = config['Global']

    # build dataloader
    train_dataloader = build_dataloader(config, 'Train', device, logger)
    if len(train_dataloader) == 0:
        logger.error(
            "No images in train dataset, please ensure\n" +
            "\t1. The number of images in the train label_file_list is greater than or equal to the batch size.\n"
            + "\t2. The annotation file and path in the configuration file are set correctly."
        )
        return

    if config['Eval']:
        valid_dataloader = build_dataloader(config, 'Eval', device, logger)
    else:
        valid_dataloader = None

    # build post process
    post_process_class = build_post_process(config['PostProcess'],
                                            global_config)

    # build model
    # for rec algorithm
    if hasattr(post_process_class, 'character'):
        char_num = len(getattr(post_process_class, 'character'))
        config['Architecture']["Head"]['out_channels'] = char_num
    model = build_model(config['Architecture'])
    if config['Global']['distributed']:
        model = paddle.DataParallel(model)

    # build loss
    loss_class = build_loss(config['Loss'])

    # build optim
    optimizer, lr_scheduler = build_optimizer(
        config['Optimizer'],
        epochs=config['Global']['epoch_num'],
        step_each_epoch=len(train_dataloader),
        parameters=model.parameters())

    # build metric
    eval_class = build_metric(config['Metric'])

    # load pretrain model
    pre_best_model_dict = init_model(config, model, logger, optimizer)

    logger.info('train dataloader has {} iters'.format(len(train_dataloader)))
    if valid_dataloader is not None:
        logger.info('valid dataloader has {} iters'.format(
            len(valid_dataloader)))

    # start train
    program.train(config, train_dataloader, valid_dataloader, device, model,
                  loss_class, optimizer, lr_scheduler, post_process_class,
                  eval_class, pre_best_model_dict, logger, vdl_writer)

def train():
    # 1. initialize parallel environment
    dist.init_parallel_env()

    # 2. get current ParallelEnv
    parallel_env = dist.ParallelEnv()
    print("rank: ", parallel_env.rank)
    print("world_size: ", parallel_env.world_size)

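# A small companion sketch (an addition, not part of the original snippet):
# after dist.init_parallel_env(), the same rank/world-size information can
# also be read through the module-level helpers paddle.distributed.get_rank()
# and paddle.distributed.get_world_size(), without constructing a ParallelEnv
# object. The function name below is chosen for illustration only.
def show_rank_and_world_size():
    dist.init_parallel_env()
    print("rank: ", dist.get_rank())
    print("world_size: ", dist.get_world_size())
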
def test_init_parallel_env_break(self):
    os.environ['FLAGS_selected_gpus'] = '0'
    os.environ['PADDLE_TRAINER_ID'] = '0'
    os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170'
    os.environ['PADDLE_TRAINERS_NUM'] = '1'
    os.environ['PADDLE_TRAINER_ENDPOINTS'] = '127.0.0.1:6170'
    # cover the success branch: with a single trainer, init_parallel_env
    # returns early without creating a parallel context
    dist.init_parallel_env()
    self.assertFalse(parallel_helper._is_parallel_ctx_initialized())

def train():
    """parallelenv"""
    # 1. initialize parallel env
    dist.init_parallel_env()

    # 2. get current ParallelEnv
    parallel_env = dist.ParallelEnv()
    assert parallel_env.rank == 0
    assert parallel_env.world_size == 2
    print("test_ParallelEnv ... ok")

def do_train():
    dist.init_parallel_env()
    net = paddle.nn.Linear(2, 2)
    net = paddle.DataParallel(net)
    x = paddle.to_tensor(np.random.random(size=(2, 2)).astype('float32'))
    y = net(x)
    # gather every rank's output into a list of tensors
    gathered = []
    dist.all_gather(gathered, y)
    print(gathered)

def main(config, device, logger, vdl_writer):
    # init dist environment
    if config['Global']['distributed']:
        dist.init_parallel_env()

    global_config = config['Global']

    # build dataloader
    train_dataloader = build_dataloader(config, 'Train', device, logger)
    if config['Eval']:
        valid_dataloader = build_dataloader(config, 'Eval', device, logger)
    else:
        valid_dataloader = None

    # build post process
    post_process_class = build_post_process(config['PostProcess'],
                                            global_config)

    # build model
    # for rec algorithm
    if hasattr(post_process_class, 'character'):
        char_num = len(getattr(post_process_class, 'character'))
        config['Architecture']["Head"]['out_channels'] = char_num
    model = build_model(config['Architecture'])
    if config['Global']['distributed']:
        model = paddle.DataParallel(model)

    # build loss
    loss_class = build_loss(config['Loss'])

    # build optim
    optimizer, lr_scheduler = build_optimizer(
        config['Optimizer'],
        epochs=config['Global']['epoch_num'],
        step_each_epoch=len(train_dataloader),
        parameters=model.parameters())

    # build metric
    eval_class = build_metric(config['Metric'])

    # load pretrain model
    pre_best_model_dict = init_model(config, model, logger, optimizer)

    logger.info(
        'train dataloader has {} iters, valid dataloader has {} iters'.format(
            len(train_dataloader), len(valid_dataloader)))

    quanter = QAT(config=quant_config, act_preprocess=PACT)
    quanter.quantize(model)

    # start train
    program.train(config, train_dataloader, valid_dataloader, device, model,
                  loss_class, optimizer, lr_scheduler, post_process_class,
                  eval_class, pre_best_model_dict, logger, vdl_writer)

def train():
    """begin train"""
    arr1 = []
    arr2 = []
    dist.init_parallel_env()
    set_seed(2021)

    layer = LinearNet()
    if dist.get_world_size() > 1:
        dp_layer = paddle.DataParallel(layer)
    else:
        dp_layer = layer

    layer2 = LinearNet()
    if dist.get_world_size() > 1:
        dp_layer2 = paddle.DataParallel(layer2)
    else:
        dp_layer2 = layer2
    dp_layer2.set_state_dict(dp_layer.state_dict())

    loss_fn = nn.MSELoss()
    adam = opt.Adam(
        learning_rate=0.001, parameters=dp_layer.parameters())
    adam2 = opt.Adam(
        learning_rate=0.001, parameters=dp_layer2.parameters())

    for i in range(2):
        batch_size = 10
        shard = int(batch_size / dist.get_world_size())
        start_no = shard * dist.get_rank()
        end_no = start_no + shard

        inputs = paddle.randn([10, 10], 'float32')[start_no:end_no]
        outputs = dp_layer(inputs)
        labels = paddle.randn([10, 1], 'float32')[start_no:end_no]
        loss = loss_fn(outputs, labels)
        if dist.get_rank() == 0:
            arr1.append(loss.numpy()[0])
        loss.backward()
        adam.step()
        adam.clear_grad()

        outputs = dp_layer2(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        if dist.get_rank() == 0:
            arr2.append(loss.numpy()[0])
        adam2.step()
        adam2.clear_grad()

    check_data(arr1, arr2)

def run_trainer(self, args):
    if fluid.core.is_compiled_with_cuda():
        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        place = fluid.CUDAPlace(device_id)
    else:
        raise AssertionError("Only support CUDAPlace for now.")

    with fluid.dygraph.guard(place):
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        np.random.seed(seed)
        random.seed(seed)
        model, train_reader, opt = self.get_model()

        if args.update_method == "nccl2":
            dist.init_parallel_env()
            print_to_err(
                type(self).__name__,
                "begin to prepare context in dygraph with nccl2")
            if not args.find_unused_parameters:
                model = paddle.DataParallel(model,
                                            find_unused_parameters=False)
            else:
                model = paddle.DataParallel(model,
                                            find_unused_parameters=True)
            print_to_err(type(self).__name__, "model built in dygraph")

        out_losses = []
        print_to_err(type(self).__name__, "begin to run dygraph training")
        for step_id, data in enumerate(train_reader()):
            data = self._get_data(data, args)
            if step_id == RUN_STEP:
                break
            if step_id % 3 != 0:
                if args.update_method == "nccl2":
                    with model.no_sync():
                        loss = self.run_one_loop(model, opt, data)
                        loss.backward()
                else:
                    loss = self.run_one_loop(model, opt, data)
                    loss.backward()
            else:
                loss = self.run_one_loop(model, opt, data)
                loss.backward()
                opt.minimize(loss)
                print_to_err(
                    type(self).__name__,
                    "loss at step %d: %f" % (step_id, loss.numpy()))
                out_losses.append(loss.numpy())
                if not args.accumulate_gradient:
                    model.clear_gradients()
        print_to_out(out_losses)

def train():
    # 1. initialize parallel environment
    dist.init_parallel_env()
    set_seed(2021)

    # 2. create data parallel layer & optimizer
    layer = LinearNet()
    if dist.get_world_size() > 1:
        dp_layer = paddle.DataParallel(layer)
    else:
        dp_layer = layer

    layer2 = LinearNet()
    if dist.get_world_size() > 1:
        dp_layer2 = paddle.DataParallel(layer2)
    else:
        dp_layer2 = layer2
    dp_layer2.set_state_dict(dp_layer.state_dict())

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())
    adam2 = opt.Adam(learning_rate=0.001, parameters=dp_layer2.parameters())

    # 3. run layer
    print("Start")
    for i in range(10):
        batch_size = 10
        shard = int(batch_size / dist.get_world_size())
        start_no = shard * dist.get_rank()
        end_no = start_no + shard

        inputs = paddle.randn([10, 10], 'float32')[start_no:end_no]
        outputs = dp_layer(inputs)
        labels = paddle.randn([10, 1], 'float32')[start_no:end_no]
        loss = loss_fn(outputs, labels)
        if dist.get_rank() == 0:
            print("Loss1", loss.numpy()[0])
            print(dp_layer.parameters())
        loss.backward()
        adam.step()
        adam.clear_grad()

        outputs = dp_layer2(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        if dist.get_rank() == 0:
            print("Loss2", loss.numpy()[0])
            print(dp_layer2.parameters())
        adam2.step()
        adam2.clear_grad()

def test_send1():
    """test_send1"""
    init_parallel_env()
    if paddle.distributed.ParallelEnv().rank == 0:
        data = paddle.to_tensor([7, 8, 9])
        paddle.distributed.send(data, dst=1)
    else:
        data = paddle.to_tensor([1, 2, 3])
        paddle.distributed.recv(data, src=0)
    out = data.numpy()
    assert out[0] == 7
    assert out[1] == 8
    assert out[2] == 9

def test_send3():
    """test_send3"""
    init_parallel_env()
    if paddle.distributed.ParallelEnv().rank == 0:
        data = paddle.to_tensor([7, 8, 9])
        paddle.distributed.send(data, dst=1, group=None, use_calc_stream=False)
    else:
        data = paddle.to_tensor([1, 2, 3])
        paddle.distributed.recv(data, src=0)
    out = data.numpy()
    assert out[0] == 7
    assert out[1] == 8
    assert out[2] == 9

def train(config):
    logging.info('training arguments: %s', config)

    if config.train.use_data_parallel:
        logging.info("parallel mode. init env...")
        dist.init_parallel_env()

    dataset_config = {
        'db_file': config.data.db,
        'input_encoder': g_input_encoder,
        'label_encoder': g_label_encoder,
        'is_cached': True
    }
    train_set = DatasetClass(
        name='train', data_file=config.data.train_set, **dataset_config)
    dev_set = DatasetClass(
        name='dev', data_file=config.data.dev_set, **dataset_config)

    shuf_train = not config.general.is_debug
    train_reader = DataLoaderClass(
        config, train_set, batch_size=config.general.batch_size,
        shuffle=shuf_train)
    # dev_reader = dataproc.DataLoader(config, dev_set, batch_size=config.general.batch_size, shuffle=False)
    dev_reader = DataLoaderClass(config, dev_set, batch_size=1, shuffle=False)

    max_train_steps = config.train.epochs * (
        len(train_set) // config.general.batch_size //
        config.train.trainer_num)

    model = ModelClass(config.model, g_label_encoder)
    if config.model.init_model_params is not None:
        logging.info("loading model param from %s",
                     config.model.init_model_params)
        model.set_state_dict(paddle.load(config.model.init_model_params))
    if config.train.use_data_parallel:
        logging.info("parallel mode. init model...")
        model = paddle.DataParallel(model)

    optimizer = text2sql.optim.init_optimizer(model, config.train,
                                              max_train_steps)
    if config.model.init_model_optim is not None:
        logging.info("loading model optim from %s",
                     config.model.init_model_optim)
        optimizer.set_state_dict(paddle.load(config.model.init_model_optim))

    logging.info("start of training...")
    launch.trainer.train(config, model, optimizer, config.train.epochs,
                         train_reader, dev_reader)
    logging.info("end of training...")

def init_process_group(strategy=None):
    nranks = ParallelEnv().nranks
    rank = ParallelEnv().local_rank
    is_master = rank == 0
    pg_group = dist.init_parallel_env()
    return pg_group.process_group

def test_multiple_gpus(self):
    dist.init_parallel_env()
    self.trainer_id = dist.get_rank()

    model_a = SimpleNet(self.trainer_id)
    model_b = SimpleNet(self.trainer_id)

    state_dict = model_a.state_dict()
    model_b.set_state_dict(state_dict)

    model_a = paddle.DataParallel(model_a, find_unused_parameters=True)
    model_b = paddle.DataParallel(model_b, find_unused_parameters=True)

    ones_input = paddle.ones(shape=(batch, in_dim))
    ones_input.stop_gradient = True

    w1_grad_sum = np.zeros((in_dim, out_dim), dtype='float32')
    w2_grad_sum = np.zeros((in_dim, out_dim), dtype='float32')

    for step_id in range(5):
        random_input = paddle.rand(shape=(batch, in_dim))
        random_input.stop_gradient = True

        if step_id % 2 == 0:
            out_a = model_a(random_input)
            out_b = model_b(random_input)
        else:
            out_a = model_a(ones_input)
            out_b = model_b(ones_input)

        out_a.sum().backward()
        out_b.sum().backward()

        self.check_gradient(model_a.parameters())
        self.check_gradient(model_b.parameters())

        # test acc gradient
        w1_grad_sum = self.check_acc(model_a._layers.w1.grad, w1_grad_sum,
                                     model_b._layers.w1.grad)
        w2_grad_sum = self.check_acc(model_a._layers.w2.grad, w2_grad_sum,
                                     model_b._layers.w2.grad)

        model_a.clear_gradients()

def run_trainer_with_spawn(self, args):
    paddle.disable_static()
    fluid.default_startup_program().random_seed = seed
    fluid.default_main_program().random_seed = seed
    np.random.seed(seed)
    random.seed(seed)
    args.trainer_id = dist.get_rank()

    if args.update_method == "nccl2":
        dist.init_parallel_env()
    model, train_reader, opt = self.get_model()
    if args.update_method == "nccl2":
        if args.find_unused_parameters:
            model = paddle.DataParallel(model, find_unused_parameters=True)
        else:
            model = paddle.DataParallel(model, find_unused_parameters=False)

    out_losses = []
    for step_id, data in enumerate(train_reader()):
        data = self._get_data(data, args)
        if step_id == RUN_STEP:
            break
        if step_id % 3 != 0:
            if args.update_method == "nccl2":
                with model.no_sync():
                    loss = self.run_one_loop(model, opt, data)
                    loss.backward()
            else:
                loss = self.run_one_loop(model, opt, data)
                loss.backward()
        else:
            loss = self.run_one_loop(model, opt, data)
            loss.backward()
            opt.minimize(loss)
            print_to_err(
                type(self).__name__,
                "loss at step %d: %f" % (step_id, loss.numpy()))
            out_losses.append(loss.numpy())
            model.clear_gradients()
    print_to_out(out_losses)
    return out_losses

def main(config, device, logger, vdl_writer):
    # init dist environment
    if config['Global']['distributed']:
        dist.init_parallel_env()

    global_config = config['Global']

    # build dataloader
    config['Train']['loader']['num_workers'] = 0
    train_dataloader = build_dataloader(config, 'Train', device, logger)
    if config['Eval']:
        config['Eval']['loader']['num_workers'] = 0
        valid_dataloader = build_dataloader(config, 'Eval', device, logger)
    else:
        valid_dataloader = None

    paddle.enable_static()
    place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)

    if 'inference_model' in global_config.keys():
        inference_model_dir = global_config['inference_model']
    else:
        inference_model_dir = os.path.dirname(
            global_config['pretrained_model'])
    if not (os.path.exists(
            os.path.join(inference_model_dir, "inference.pdmodel")) and
            os.path.exists(
                os.path.join(inference_model_dir, "inference.pdiparams"))):
        raise ValueError(
            "Please set the inference model dir in Global.inference_model or "
            "Global.pretrained_model for post-training quantization.")

    paddleslim.quant.quant_post_static(
        executor=exe,
        model_dir=inference_model_dir,
        model_filename='inference.pdmodel',
        params_filename='inference.pdiparams',
        quantize_model_path=global_config['save_inference_dir'],
        sample_generator=sample_generator(train_dataloader),
        save_model_filename='inference.pdmodel',
        save_params_filename='inference.pdiparams',
        batch_size=1,
        batch_nums=None)

def test_scatter():
    """scatter"""
    paddle.set_device('gpu:%d' % paddle.distributed.ParallelEnv().dev_id)
    init_parallel_env()
    for t in types:
        if paddle.distributed.ParallelEnv().local_rank == 0:
            np_data1 = np.array([7, 8, 9]).astype(t)
            np_data2 = np.array([10, 11, 12]).astype(t)
        else:
            np_data1 = np.array([1, 2, 3]).astype(t)
            np_data2 = np.array([4, 5, 6]).astype(t)
        data1 = paddle.to_tensor(np_data1)
        data2 = paddle.to_tensor(np_data2)
        if paddle.distributed.ParallelEnv().local_rank == 0:
            paddle.distributed.scatter(data1, src=1)
        else:
            paddle.distributed.scatter(data1,
                                       tensor_list=[data1, data2],
                                       src=1)
        out = data1.numpy()
        assert len(out) == 3
        print("test_scatter %s ... ok" % t)