def test_ofa(self):
    ofa_model = OFA(self.model, self.run_config,
                    distill_config=self.distill_config)

    start_epoch = 0
    for idx in range(len(self.run_config.n_epochs)):
        cur_phases = self.run_config.n_epochs[idx]
        for ph_idx in range(len(cur_phases)):
            cur_lr = self.run_config.init_learning_rate[idx][ph_idx]
            # Rebuild the optimizer for each phase so it picks up the phase
            # learning rate; netAs_param holds the parameters of the
            # distillation adaptation layers.
            adam = fluid.optimizer.Adam(
                learning_rate=cur_lr,
                parameter_list=(ofa_model.parameters() +
                                ofa_model.netAs_param))
            for epoch_id in range(start_epoch,
                                  self.run_config.n_epochs[idx][ph_idx]):
                # Sample dynamic_batch_size subnetworks on the same data.
                for model_no in range(
                        self.run_config.dynamic_batch_size[idx]):
                    output, _ = ofa_model(self.data)
                    loss = fluid.layers.reduce_mean(output)
                    if self.distill_config.mapping_layers is not None:
                        dis_loss = ofa_model.calc_distill_loss()
                        loss += dis_loss
                        dis_loss = dis_loss.numpy()[0]
                    else:
                        dis_loss = 0
                    print('epoch: {}, loss: {}, distill loss: {}'.format(
                        epoch_id, loss.numpy()[0], dis_loss))
                    loss.backward()
                    adam.minimize(loss)
                    adam.clear_gradients()
            start_epoch = self.run_config.n_epochs[idx][ph_idx]
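
# A minimal sketch, not the repo's exact fixture, of the unittest setUp that
# the method above assumes: it must define self.model, self.run_config,
# self.distill_config, and self.data. The config dicts mirror the standalone
# test below; Model (the student/teacher network), the (1, 1, 28, 28) input
# shape, and the paddleslim import path are assumptions.
import unittest

import numpy as np
import paddle.fluid as fluid
from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig


class TestOFA(unittest.TestCase):
    def setUp(self):
        fluid.enable_dygraph()
        self.model = Model()          # student supernet, defined elsewhere
        teacher_model = Model()       # teacher used for distillation
        # One fixed random MNIST-shaped batch stands in for real data.
        self.data = fluid.dygraph.to_variable(
            np.random.random((1, 1, 28, 28)).astype('float32'))
        self.run_config = RunConfig(
            train_batch_size=256,
            n_epochs=[[1], [2, 3], [4, 5]],
            init_learning_rate=[[0.001], [0.003, 0.001], [0.003, 0.001]],
            dynamic_batch_size=[1, 1, 1],
            total_images=50000,
            elastic_depth=(2, 5, 8))
        self.distill_config = DistillConfig(
            lambda_distill=0.01,
            teacher_model=teacher_model,
            mapping_layers=['models.0.fn'])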
import paddle
from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig


def test_ofa():
    model = Model()          # student supernet, defined elsewhere in this file
    teacher_model = Model()  # teacher used for distillation

    default_run_config = {
        'train_batch_size': 256,
        'n_epochs': [[1], [2, 3], [4, 5]],
        'init_learning_rate': [[0.001], [0.003, 0.001], [0.003, 0.001]],
        'dynamic_batch_size': [1, 1, 1],
        'total_images': 50000,  # 1281167 for the full ImageNet train set
        'elastic_depth': (2, 5, 8)
    }
    run_config = RunConfig(**default_run_config)

    default_distill_config = {
        'lambda_distill': 0.01,
        'teacher_model': teacher_model,
        'mapping_layers': ['models.0.fn']
    }
    distill_config = DistillConfig(**default_distill_config)

    ofa_model = OFA(model, run_config, distill_config=distill_config)

    # backend='cv2' makes the dataset yield numpy arrays, which the default
    # collate function stacks into batched tensors.
    train_dataset = paddle.vision.datasets.MNIST(mode='train', backend='cv2')
    train_loader = paddle.io.DataLoader(
        train_dataset, drop_last=True, batch_size=64)

    start_epoch = 0
    for idx in range(len(run_config.n_epochs)):
        cur_phases = run_config.n_epochs[idx]
        for ph_idx in range(len(cur_phases)):
            cur_lr = run_config.init_learning_rate[idx][ph_idx]
            # Rebuild the optimizer for each phase; netAs_param holds the
            # parameters of the distillation adaptation layers.
            adam = paddle.optimizer.Adam(
                learning_rate=cur_lr,
                parameters=ofa_model.parameters() + ofa_model.netAs_param)
            for epoch_id in range(start_epoch,
                                  run_config.n_epochs[idx][ph_idx]):
                for batch_id, data in enumerate(train_loader()):
                    img, label = data
                    img = paddle.reshape(img,
                                         [-1, 1, 28, 28]).astype('float32')
                    label = paddle.reshape(label.astype('int64'), [-1, 1])
                    label.stop_gradient = True
                    # Sample dynamic_batch_size subnetworks on the same batch.
                    for model_no in range(run_config.dynamic_batch_size[idx]):
                        output, _ = ofa_model(img, label)
                        loss = paddle.mean(output)
                        dis_loss = ofa_model.calc_distill_loss()
                        loss += dis_loss
                        loss.backward()
                        if batch_id % 10 == 0:
                            print(
                                'epoch: {}, batch: {}, loss: {}, distill loss: {}'.
                                format(epoch_id, batch_id,
                                       loss.numpy()[0], dis_loss.numpy()[0]))
                        ### Accumulate gradients from the dynamic_batch_size
                        ### subnetworks sampled for the same batch of data.
                        ### NOTE: gradient accumulation still needs a fix in
                        ### PaddlePaddle.
                        adam.minimize(loss)
                        adam.clear_grad()
            start_epoch = run_config.n_epochs[idx][ph_idx]
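
# A minimal entry point for the standalone test above: a sketch assuming a
# PaddlePaddle 2.x install (dygraph mode is the default there) and that the
# Model class used above is defined earlier in this file.
if __name__ == '__main__':
    test_ofa()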