def get_lr_scheduler(args, train_loader):
    if args.optim_phase == 'Factor':
        every_lr_decay_step = args.every_lr_decay_step
        lr_scheduler = FactorScheduler(step=every_lr_decay_step, factor=0.1)
    elif args.optim_phase == 'MultiFactor':
        lr_decay_steps = [len(train_loader) * ep for ep in args.lr_decay_epochs]
        lr_scheduler = MultiFactorScheduler(step=lr_decay_steps, factor=0.1)
    elif args.optim_phase == 'Poly':
        max_update_step = args.epochs
        lr_scheduler = PolyScheduler(max_update=max_update_step)
    elif args.optim_phase == 'Cosine':
        max_update_step = args.epochs
        lr_scheduler = CosineScheduler(max_update=max_update_step)
    else:
        raise ValueError('Invalid phase {}'.format(args.optim_phase))
    return lr_scheduler
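# Hedged usage sketch (not part of the original script): wiring the returned
# scheduler into an MXNet optimizer. `args` and `train_loader` are assumed to
# come from the surrounding training code; the hyperparameters are illustrative.
import mxnet as mx

lr_scheduler = get_lr_scheduler(args, train_loader)
optimizer = mx.optimizer.create('sgd',
                                learning_rate=0.1,
                                lr_scheduler=lr_scheduler,
                                momentum=0.9,
                                wd=1e-4)
updater = mx.optimizer.get_updater(optimizer)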
def test_lognormal():
    var = mx.symbol.Variable('var')
    data = mx.symbol.Variable('data')
    net_mean = mx.symbol.FullyConnected(data=data, name='fc_mean_1', num_hidden=20)
    net_mean = mx.symbol.Activation(data=net_mean, name='fc_mean_relu_1', act_type='relu')
    net_mean = mx.symbol.FullyConnected(data=net_mean, name='fc_mean_2', num_hidden=20)
    net_mean = mx.symbol.Activation(data=net_mean, name='fc_mean_relu_2', act_type='relu')
    net_mean = mx.symbol.FullyConnected(data=net_mean, name='fc_mean_3', num_hidden=10)
    net_var = mx.symbol.FullyConnected(data=data, name='fc_var_1', num_hidden=10)
    net_var = mx.symbol.Activation(data=net_var, name='fc_var_softplus_1', act_type='softrelu')
    net = mx.symbol.Custom(mean=net_mean, var=net_var, name='policy',
                           deterministic=False, entropy_regularization=0.01,
                           op_type='LogNormalPolicy')
    ctx = mx.gpu()
    minibatch_size = 100
    data_shapes = {'data': (minibatch_size, 10),
                   'policy_score': (minibatch_size,)}  # , 'var': (minibatch_size,)}
    qnet = Base(data_shapes=data_shapes, sym_gen=net, name='PolicyNet',
                initializer=mx.initializer.Xavier(factor_type="in", magnitude=1.0),
                ctx=ctx)
    print(qnet.internal_sym_names)

    lr = 0.01
    lr_scheduler = FactorScheduler(1000, 1.0 / 1.5)
    optimizer = mx.optimizer.create(name='sgd',
                                    learning_rate=lr,
                                    # momentum=0.9,
                                    clip_gradient=None,
                                    lr_scheduler=lr_scheduler,
                                    rescale_grad=1.0, wd=0.)
    updater = mx.optimizer.get_updater(optimizer)
    total_iter = 1000000
    stats = numpy.zeros((total_iter, 3), dtype=numpy.float32)
    plt.ion()
    fig, ax = plt.subplots()
    lines, = ax.plot([], [])
    ax.set_autoscaley_on(True)
    baseline = 0
    for i in range(total_iter):
        # for k, v in qnet.params.items():
        #     print(k, v.asnumpy())
        data = numpy.random.randn(minibatch_size, 10)
        means = qnet.compute_internal(sym_name="fc_mean_3_output", data=data).asnumpy()
        vars = qnet.compute_internal(sym_name="fc_var_softplus_1_output", data=data).asnumpy()
        outputs = qnet.forward(is_train=True, data=data)  # , var=0.5*numpy.ones((minibatch_size,)))
        action = outputs[0].asnumpy()
        score = simple_game_multimodal(data, action, 1)
        # exponential moving average of the score as a variance-reduction baseline
        baseline = baseline - 0.01 * (baseline - score.mean())
        print('score=', score.mean(),
              'err=', numpy.square(means - data * data).mean(),
              'var=', vars.mean(), 'baseline=', baseline)
        stats[i] = [score.mean(),
                    numpy.square(means - data * data).mean(),
                    vars.mean()]
        qnet.backward(policy_score=score - baseline)
        norm_clipping(qnet.params_grad, 10)
        qnet.update(updater)
        if i % 10 == 0:
            update_line(lines, fig, ax, i, score.mean())  # numpy.square(means - data*data).mean())
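# Hedged sketch of the `update_line` live-plot helper called above; its
# definition is not shown in this excerpt, so this is an assumed matplotlib
# implementation with the same signature.
def update_line(line, fig, ax, x, y):
    line.set_xdata(numpy.append(line.get_xdata(), x))
    line.set_ydata(numpy.append(line.get_ydata(), y))
    ax.relim()                  # recompute the data limits
    ax.autoscale_view()         # rescale the axes to the new data
    fig.canvas.draw()
    fig.canvas.flush_events()   # let the interactive window refresh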
def train_model(self, action):
    # action belongs to stage4: Training stage
    if action[0] == 1:  # LF1
        loss = mx.gluon.loss.L2Loss()
    else:
        loss = mx.gluon.loss.HuberLoss()
    # must set batch_size before init model
    batch_size = self.batch_size_option[action[1] - 1]
    self.config['batch_size'] = batch_size
    model = Model(self.action_trajectory, self.config, self.ctx, self.adj_SIPM)
    model.initialize(ctx=self.ctx)
    lr_option = [1e-3, 7e-4, 1e-4]
    opt_option = ['rmsprop', 'adam', 'adam']
    lr = lr_option[action[2] - 1]
    if action[3] == 1:
        # FactorScheduler requires step >= 1
        step = max(1, self.epochs // 10)
        lr_scheduler = FactorScheduler(step, factor=0.7, base_lr=lr)
        opt = mx.gluon.Trainer(model.collect_params(), opt_option[action[3] - 1],
                               {'lr_scheduler': lr_scheduler})
    elif action[3] == 2:
        opt = mx.gluon.Trainer(model.collect_params(), opt_option[action[3] - 1],
                               {'learning_rate': lr})
    else:
        global_train_steps = self.training_samples // batch_size + 1
        max_update_factor = 1
        lr_sch = mx.lr_scheduler.PolyScheduler(
            max_update=global_train_steps * self.epochs * max_update_factor,
            base_lr=lr, pwr=2, warmup_steps=global_train_steps)
        opt = mx.gluon.Trainer(model.collect_params(), opt_option[action[3] - 1],
                               {'lr_scheduler': lr_sch})
    try:
        # train
        train_time = 0.
        train_loader, val_loader, test_loader = self.data[batch_size]
        for epoch in range(self.config['epochs']):
            loss_value = 0
            mae = 0
            rmse = 0
            mape = 0
            train_batch_num = 0
            for X in train_loader:
                y = X.label[0]
                X = X.data[0]
                train_batch_num += 1
                X, y = X.as_in_context(self.ctx), y.as_in_context(self.ctx)
                with autograd.record():
                    y = y.astype('float32')
                    start_time = time()
                    output = model(X)
                    train_time += time() - start_time
                    l = loss(output, y)
                    if self.test:  # smoke-test mode: stop after one forward pass
                        return
                l.backward()
                opt.step(batch_size)
                loss_value += l.mean().asscalar()
                mae += MAE(y, output)
                rmse += RMSE(y, output)
                mape += masked_mape_np(y, output)
            train_loader.reset()
            loss_value /= train_batch_num
            mae /= train_batch_num
            rmse /= train_batch_num
            mape /= train_batch_num
            self.logger(train=[epoch, loss_value, mae, mape, rmse, train_time])
            print(f" epoch:{epoch}, loss:{loss_value}")
        model_structure = deepcopy(self.action_trajectory)
        model_structure.append(action)

        # eval
        eval_loss_value = 0
        eval_batch_num = 0
        mae = 0
        rmse = 0
        mape = 0
        val_time = 0.
        for X in val_loader:
            y = X.label[0]
            X = X.data[0]
            eval_batch_num += 1
            X, y = X.as_in_context(self.ctx), y.as_in_context(self.ctx)
            y = y.astype('float32')
            start_time = time()
            output = model(X)
            val_time += time() - start_time
            eval_loss_value += loss(output, y).mean().asscalar()
            mae += MAE(y, output)
            rmse += RMSE(y, output)
            mape += masked_mape_np(y, output)
        eval_loss_value /= eval_batch_num
        mae /= eval_batch_num
        rmse /= eval_batch_num
        mape /= eval_batch_num
        print(f" eval_result: loss:{eval_loss_value}, MAE:{mae}, MAPE:{mape}, RMSE:{rmse}, time:{val_time}")
        val_loader.reset()

        # get reward
        if self.time_max <= val_time:
            return -1, True
        reward = -(mae - np.power(np.e, -19) * np.log2(self.time_max - val_time))
        if reward < -1e2:
            return -1, True
        reward /= 100
        self.logger(eval=[eval_loss_value, mae, mape, rmse, val_time])
        self.logger.save_GNN(model, model_structure,
                             reward / len(self.action_trajectory) + 1)
        return reward, False
    except Exception as e:
        self.logger.append_log_file(e.args[0])
        self.logger(train=None, eval=None, test=None)
        traceback.print_exc()
        return -1, True
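# Minimal sketch (an observation, not code from the source): FactorScheduler
# counts optimizer updates, not epochs, so `step=self.epochs // 10` above decays
# the rate after that many *batches*. To decay every N epochs instead, scale the
# step by the number of batches per epoch; the numbers below are hypothetical.
import mxnet as mx

batches_per_epoch = 100
decay_every_n_epochs = 5
sched = mx.lr_scheduler.FactorScheduler(step=batches_per_epoch * decay_every_n_epochs,
                                        factor=0.7, base_lr=1e-3)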
def main(args):
    filehandler = logging.FileHandler(args['log_dir'] + '/train.log')
    streamhandler = logging.StreamHandler()
    logger = logging.getLogger('')
    logger.setLevel(logging.INFO)
    logger.addHandler(filehandler)
    logger.addHandler(streamhandler)

    batch_size = args['batch_size']
    classes = 1000
    num_gpus = args['num_gpus']
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = args['num_workers']

    model_name = 'efficientnet-' + args['model']
    lr_decay = args['lr_decay']
    lr_decay_period = args['lr_decay_period']
    warmup_steps = args['warmup_epochs']
    warmup_begin_lr = args['warmup_lr']
    assert lr_decay_period != 0
    lr_scheduler = FactorScheduler(lr_decay_period, lr_decay,
                                   warmup_steps=warmup_steps,
                                   warmup_begin_lr=warmup_begin_lr)
    lr_scheduler.base_lr = args['lr']

    optimizer = 'rmsprop'
    optimizer_params = {'wd': args['wd'], 'gamma1': args['momentum'],
                        'learning_rate': args['lr']}
    if args['dtype'] != 'float32':
        optimizer_params['multi_precision'] = True

    net, input_size = get_efficientnet(model_name)
    net.cast(args['dtype'])
    if args['resume_params'] != '':
        net.load_parameters(args['resume_params'], ctx=context)

    # Two functions for reading data from record file or raw images
    def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx,
                     batch_size, num_workers):
        rec_train = os.path.expanduser(rec_train)
        rec_train_idx = os.path.expanduser(rec_train_idx)
        rec_val = os.path.expanduser(rec_val)
        rec_val_idx = os.path.expanduser(rec_val_idx)
        jitter_param = 0.4
        lighting_param = 0.1
        crop_ratio = args['crop_ratio'] if args['crop_ratio'] > 0 else 0.875
        resize = int(math.ceil(input_size / crop_ratio))
        mean_rgb = [123.68, 116.779, 103.939]
        std_rgb = [58.393, 57.12, 57.375]

        def batch_fn(batch, ctx):
            data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
            return data, label

        train_data = mx.io.ImageRecordIter(
            path_imgrec=rec_train,
            path_imgidx=rec_train_idx,
            preprocess_threads=num_workers,
            shuffle=True,
            batch_size=batch_size,
            data_shape=(3, input_size, input_size),
            mean_r=mean_rgb[0],
            mean_g=mean_rgb[1],
            mean_b=mean_rgb[2],
            std_r=std_rgb[0],
            std_g=std_rgb[1],
            std_b=std_rgb[2],
            rand_mirror=True,
            random_resized_crop=True,
            max_aspect_ratio=4. / 3.,
            min_aspect_ratio=3. / 4.,
            max_random_area=1,
            min_random_area=0.08,
            brightness=jitter_param,
            saturation=jitter_param,
            contrast=jitter_param,
            pca_noise=lighting_param,
        )
        val_data = mx.io.ImageRecordIter(
            path_imgrec=rec_val,
            path_imgidx=rec_val_idx,
            preprocess_threads=num_workers,
            shuffle=False,
            batch_size=batch_size,
            resize=resize,
            data_shape=(3, input_size, input_size),
            mean_r=mean_rgb[0],
            mean_g=mean_rgb[1],
            mean_b=mean_rgb[2],
            std_r=std_rgb[0],
            std_g=std_rgb[1],
            std_b=std_rgb[2],
        )
        return train_data, val_data, batch_fn

    def get_data_loader(data_dir, batch_size, num_workers):
        normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                         [0.229, 0.224, 0.225])
        jitter_param = 0.4
        lighting_param = 0.1
        crop_ratio = args['crop_ratio'] if args['crop_ratio'] > 0 else 0.875
        resize = int(math.ceil(input_size / crop_ratio))

        def batch_fn(batch, ctx):
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
            return data, label

        transform_train = transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomFlipLeftRight(),
            transforms.RandomColorJitter(brightness=jitter_param,
                                         contrast=jitter_param,
                                         saturation=jitter_param),
            transforms.RandomLighting(lighting_param),
            transforms.ToTensor(),
            normalize
        ])
        transform_test = transforms.Compose([
            transforms.Resize(resize, keep_ratio=True),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            normalize
        ])

        train_data = gluon.data.DataLoader(
            imagenet.classification.ImageNet(data_dir, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard',
            num_workers=num_workers)
        val_data = gluon.data.DataLoader(
            imagenet.classification.ImageNet(data_dir, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)
        return train_data, val_data, batch_fn

    if args['use_rec']:
        train_data, val_data, batch_fn = get_data_rec(args['rec_train'],
                                                      args['rec_train_idx'],
                                                      args['rec_val'],
                                                      args['rec_val_idx'],
                                                      batch_size, num_workers)
    else:
        train_data, val_data, batch_fn = get_data_loader(args['data_dir'],
                                                         batch_size, num_workers)

    if args['mixup']:
        train_metric = mx.metric.RMSE()
    else:
        train_metric = mx.metric.Accuracy()
    acc_top1 = mx.metric.Accuracy()
    acc_top5 = mx.metric.TopKAccuracy(5)

    save_frequency = args['save_frequency']
    if args['save_model'] and save_frequency:
        save_dir = args['log_dir']
    else:
        save_dir = ''
        save_frequency = 0

    def mixup_transform(label, classes, lam=1, eta=0.0):
        if isinstance(label, nd.NDArray):
            label = [label]
        res = []
        for l in label:
            y1 = l.one_hot(classes, on_value=1 - eta + eta / classes,
                           off_value=eta / classes)
            y2 = l[::-1].one_hot(classes, on_value=1 - eta + eta / classes,
                                 off_value=eta / classes)
            res.append(lam * y1 + (1 - lam) * y2)
        return res

    def smooth(label, classes, eta=0.1):
        if isinstance(label, nd.NDArray):
            label = [label]
        smoothed = []
        for l in label:
            res = l.one_hot(classes, on_value=1 - eta + eta / classes,
                            off_value=eta / classes)
            smoothed.append(res)
        return smoothed

    def test(ctx, val_data):
        if args['use_rec']:
            val_data.reset()
        acc_top1.reset()
        acc_top5.reset()
        for i, batch in enumerate(val_data):
            data, label = batch_fn(batch, ctx)
            outputs = [net(X.astype(args['dtype'], copy=False), ag.is_training())
                       for X in data]
            acc_top1.update(label, outputs)
            acc_top5.update(label, outputs)
        _, top1 = acc_top1.get()
        _, top5 = acc_top5.get()
        return (1 - top1, 1 - top5)

    def train(ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        if args['resume_params'] == '':
            net.initialize(mx.init.MSRAPrelu(), ctx=ctx)
        if args['no_wd']:
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
        if args['resume_states'] != '':
            trainer.load_states(args['resume_states'])

        if args['label_smoothing'] or args['mixup']:
            sparse_label_loss = False
        else:
            sparse_label_loss = True
        L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)

        best_val_score = 1
        for epoch in range(args['resume_epoch'], args['num_epochs']):
            tic = time.time()
            if args['use_rec']:
                train_data.reset()
            train_metric.reset()
            btic = time.time()
            lr = lr_scheduler(epoch + 1)
            trainer.set_learning_rate(lr)

            for i, batch in enumerate(train_data):
                data, label = batch_fn(batch, ctx)
                if args['mixup']:
                    lam = np.random.beta(args['mixup_alpha'], args['mixup_alpha'])
                    if epoch >= args['num_epochs'] - args['mixup_off_epoch']:
                        lam = 1
                    data = [lam * X + (1 - lam) * X[::-1] for X in data]
                    if args['label_smoothing']:
                        eta = 0.1
                    else:
                        eta = 0.0
                    label = mixup_transform(label, classes, lam, eta)
                elif args['label_smoothing']:
                    hard_label = label
                    label = smooth(label, classes)

                with ag.record():
                    outputs = [net(X.astype(args['dtype'], copy=False), ag.is_training())
                               for X in data]
                    loss = [L(yhat, y.astype(args['dtype'], copy=False))
                            for yhat, y in zip(outputs, label)]
                for l in loss:
                    l.backward()
                trainer.step(batch_size)

                if args['mixup']:
                    output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False))
                                      for out in outputs]
                    train_metric.update(label, output_softmax)
                else:
                    if args['label_smoothing']:
                        train_metric.update(hard_label, outputs)
                    else:
                        train_metric.update(label, outputs)

                if args['log_interval'] and not (i + 1) % args['log_interval']:
                    train_metric_name, train_metric_score = train_metric.get()
                    logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f' % (
                        epoch, i, batch_size * args['log_interval'] / (time.time() - btic),
                        train_metric_name, train_metric_score, trainer.learning_rate))
                    btic = time.time()

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * (i + 1) / (time.time() - tic))

            err_top1_val, err_top5_val = test(ctx, val_data)

            logger.info('[Epoch %d] training: %s=%f' %
                        (epoch, train_metric_name, train_metric_score))
            logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f' %
                        (epoch, throughput, time.time() - tic))
            logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f' %
                        (epoch, err_top1_val, err_top5_val))

            if err_top1_val < best_val_score:
                best_val_score = err_top1_val
                net.save_parameters('%s/%.4f-imagenet-%s-%d-best.params' %
                                    (save_dir, best_val_score, model_name, epoch))
                trainer.save_states('%s/%.4f-imagenet-%s-%d-best.states' %
                                    (save_dir, best_val_score, model_name, epoch))

            if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
                net.save_parameters('%s/imagenet-%s-%d.params' %
                                    (save_dir, model_name, epoch))
                trainer.save_states('%s/imagenet-%s-%d.states' %
                                    (save_dir, model_name, epoch))

        if save_frequency and save_dir:
            net.save_parameters('%s/imagenet-%s-%d.params' %
                                (save_dir, model_name, args['num_epochs'] - 1))
            trainer.save_states('%s/imagenet-%s-%d.states' %
                                (save_dir, model_name, args['num_epochs'] - 1))

    if args['mode'] == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
    train(context)
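# Hedged invocation sketch: a minimal args dict for main(). The key names are
# taken from the lookups above; every value here is illustrative only.
if __name__ == '__main__':
    main({
        'log_dir': './logs', 'batch_size': 128, 'num_gpus': 1,
        'num_workers': 4, 'model': 'b0', 'lr': 0.016, 'lr_decay': 0.97,
        'lr_decay_period': 2, 'warmup_epochs': 5, 'warmup_lr': 0.0,
        'wd': 1e-5, 'momentum': 0.9, 'dtype': 'float16',
        'resume_params': '', 'resume_states': '', 'resume_epoch': 0,
        'num_epochs': 350, 'use_rec': False,
        'data_dir': '~/.mxnet/datasets/imagenet',
        'rec_train': '', 'rec_train_idx': '', 'rec_val': '', 'rec_val_idx': '',
        'crop_ratio': 0.875, 'mixup': False, 'mixup_alpha': 0.2,
        'mixup_off_epoch': 0, 'label_smoothing': True, 'no_wd': True,
        'save_frequency': 10, 'save_model': True, 'log_interval': 50,
        'mode': 'hybrid',
    })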
def train_model(self, actions: list):
    # drop the [-1, -1, -1, -1] padding entries (popping while iterating over
    # range(len(actions)) would skip items and raise IndexError)
    actions = [a for a in actions if a != [-1, -1, -1, -1]]
    # fetch training_stage_action and remove it from the model structure actions
    action = actions.pop(0)
    self.action_trajectory = actions

    # action belongs to stage4: Training stage
    if action[0] == 1:  # LF1
        loss = mx.gluon.loss.L2Loss()
    else:
        loss = mx.gluon.loss.HuberLoss()
    # must set batch_size before init model
    batch_size = self.batch_size_option[action[1] - 1]
    # transformer = self.transformer[batch_size]
    self.config['batch_size'] = batch_size
    model = Model(self.action_trajectory, self.config, self.ctx, self.adj_SIPM)
    model.initialize(ctx=self.ctx)
    lr_option = [1e-3, 7e-4, 1e-4]
    opt_option = ['rmsprop', 'adam', 'adam']
    lr = lr_option[action[2] - 1]
    if action[3] == 1:
        step = max(1, self.epochs // 10)
        lr_scheduler = FactorScheduler(step, factor=0.7, base_lr=lr)
        opt = mx.gluon.Trainer(model.collect_params(), opt_option[action[3] - 1],
                               {'lr_scheduler': lr_scheduler})
    elif action[3] == 2:
        opt = mx.gluon.Trainer(model.collect_params(), opt_option[action[3] - 1],
                               {'learning_rate': lr})
    else:
        global_train_steps = self.training_samples // batch_size + 1
        max_update_factor = 1
        lr_sch = mx.lr_scheduler.PolyScheduler(
            max_update=global_train_steps * self.epochs * max_update_factor,
            base_lr=lr, pwr=2, warmup_steps=global_train_steps)
        opt = mx.gluon.Trainer(model.collect_params(), opt_option[action[3] - 1],
                               {'lr_scheduler': lr_sch})

    # train
    train_loader, val_loader, test_loader = self.data[batch_size]
    model_structure = deepcopy(self.action_trajectory)
    model_structure.append(action)
    best_mae = float('inf')
    best_epoch = 0
    for epoch in range(self.config['epochs']):
        self.logger.set_episode(epoch)
        loss_value = 0
        mae = 0
        rmse = 0
        mape = 0
        train_batch_num = 0
        train_time = 0.
        for X in train_loader:
            y = X.label[0]
            X = X.data[0]
            train_batch_num += 1
            X, y = X.as_in_context(self.ctx), y.as_in_context(self.ctx)
            with autograd.record():
                y = y.astype('float32')
                start_time = time()
                output = model(X)
                train_time += time() - start_time
                l = loss(output, y)
                if self.test:  # smoke-test mode: stop after one forward pass
                    return
            l.backward()
            opt.step(batch_size)
            # loss_value_raw += l.mean().asscalar()
            loss_value += l.mean().asscalar()
            mae += MAE(y, output)
            rmse += RMSE(y, output)
            mape += masked_mape_np(y, output)
        train_loader.reset()
        # loss_value_raw /= train_batch_num
        loss_value /= train_batch_num
        mae /= train_batch_num
        rmse /= train_batch_num
        mape /= train_batch_num
        train_time /= self.train_set_sample_num  # average seconds per sample
        self.logger(train=[epoch, loss_value, mae, mape, rmse, train_time])
        print(f" epoch:{epoch}, loss:{loss_value}, MAE:{mae}, MAPE:{mape}, RMSE:{rmse}, time:{train_time}")

        # eval
        eval_loss_value = 0
        eval_batch_num = 0
        mae = 0
        rmse = 0
        mape = 0
        val_time = 0.
        for X in val_loader:
            y = X.label[0]
            X = X.data[0]
            eval_batch_num += 1
            X, y = X.as_in_context(self.ctx), y.as_in_context(self.ctx)
            y = y.astype('float32')
            start_time = time()
            output = model(X)
            val_time += time() - start_time
            # eval_loss_value_raw += loss(output, y).mean().asscalar()
            eval_loss_value += loss(output, y).mean().asscalar()
            mae += MAE(y, output)
            rmse += RMSE(y, output)
            mape += masked_mape_np(y, output)
        eval_loss_value /= eval_batch_num
        mae /= eval_batch_num
        rmse /= eval_batch_num
        mape /= eval_batch_num
        print(f" eval_result: loss:{eval_loss_value}, MAE:{mae}, MAPE:{mape}, RMSE:{rmse}, time:{val_time}")
        val_loader.reset()
        self.logger(eval=[eval_loss_value, mae, mape, rmse, val_time])
        self.logger.save_GNN(model, model_structure, mae)
        self.logger.update_data_units()
        self.logger.flush_log()
        # early stopping on validation MAE
        if mae < best_mae:
            best_mae = mae
            best_epoch = epoch
        if epoch - best_epoch > 10:
            print(f'early stop at epoch:{epoch}')
            break

    # test: load the parameters with the best eval metric
    model.load_params(os.path.join(os.path.join(self.logger.log_path, 'GNN'),
                                   'best_GNN_model.params'),
                      ctx=self.ctx)
    test_loss_value = 0
    test_batch_num = 0
    mae = 0
    rmse = 0
    mape = 0
    test_time = 0.
    for X in test_loader:
        y = X.label[0]
        X = X.data[0]
        test_batch_num += 1
        X, y = X.as_in_context(self.ctx), y.as_in_context(self.ctx)
        y = y.astype('float32')
        start_time = time()
        output = model(X)
        test_time += time() - start_time
        # test_loss_value_raw += loss(output, y).mean().asscalar()
        test_loss_value += loss(output, y).mean().asscalar()
        mae += MAE(y, output)
        rmse += RMSE(y, output)
        mape += masked_mape_np(y, output)
    test_loss_value /= test_batch_num
    mae /= test_batch_num
    rmse /= test_batch_num
    mape /= test_batch_num
    test_loader.reset()
    print(f" test_result: loss:{test_loss_value}, MAE:{mae}, MAPE:{mape}, RMSE:{rmse}, TIME:{test_time}")
    self.logger(test=[test_loss_value, mae, mape, rmse, test_time])
    self.logger.update_data_units()
    self.logger.flush_log()
    return [mae, mape, rmse, test_time]
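# Hedged sketches of the metric helpers used above; MAE, RMSE and
# masked_mape_np are not defined in this excerpt, so these are assumed
# NDArray-compatible implementations.
import numpy as np

def MAE(y, y_hat):
    return np.abs(y.asnumpy() - y_hat.asnumpy()).mean()

def RMSE(y, y_hat):
    return np.sqrt(np.square(y.asnumpy() - y_hat.asnumpy()).mean())

def masked_mape_np(y, y_hat, epsilon=1e-5):
    y, y_hat = y.asnumpy(), y_hat.asnumpy()
    mask = np.abs(y) > epsilon  # ignore near-zero targets
    return np.abs((y - y_hat)[mask] / y[mask]).mean() * 100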
def test_model(self, actions: list):
    # drop the [-1, -1, -1, -1] padding entries (popping while iterating over
    # range(len(actions)) would skip items and raise IndexError)
    actions = [a for a in actions if a != [-1, -1, -1, -1]]
    # fetch training_stage_action and remove it from the model structure actions
    action = actions.pop(0)
    self.action_trajectory = actions

    # action belongs to stage4: Training stage
    if action[0] == 1:  # LF1
        loss = mx.gluon.loss.L2Loss()
    else:
        loss = mx.gluon.loss.HuberLoss()
    # must set batch_size before init model
    batch_size = self.batch_size_option[action[1] - 1]
    # transformer = self.transformer[batch_size]
    self.config['batch_size'] = batch_size
    model = Model(self.action_trajectory, self.config, self.ctx, self.adj_SIPM)
    model.initialize(ctx=self.ctx)
    lr_option = [1e-3, 7e-4, 1e-4]
    opt_option = ['rmsprop', 'adam', 'adam']
    lr = lr_option[action[2] - 1]
    if action[3] == 1:
        step = max(1, self.epochs // 10)
        lr_scheduler = FactorScheduler(step, factor=0.7, base_lr=lr)
        opt = mx.gluon.Trainer(model.collect_params(), opt_option[action[3] - 1],
                               {'lr_scheduler': lr_scheduler})
    elif action[3] == 2:
        opt = mx.gluon.Trainer(model.collect_params(), opt_option[action[3] - 1],
                               {'learning_rate': lr})
    else:
        global_train_steps = self.training_samples // batch_size + 1
        max_update_factor = 1
        lr_sch = mx.lr_scheduler.PolyScheduler(
            max_update=global_train_steps * self.epochs * max_update_factor,
            base_lr=lr, pwr=2, warmup_steps=global_train_steps)
        opt = mx.gluon.Trainer(model.collect_params(), opt_option[action[3] - 1],
                               {'lr_scheduler': lr_sch})
    train_loader, val_loader, test_loader = self.data[batch_size]

    # test: load the parameters with the best eval metric
    model.load_params(
        f'./Log/{self.dataset_name.upper()}_experiment2_qlearning_2_test/GNN/best_GNN_model.params',
        ctx=self.ctx)
    test_loss_value = 0
    test_batch_num = 0
    mae = 0
    rmse = 0
    mape = 0
    test_time = 0.
    for X in test_loader:
        y = X.label[0]
        X = X.data[0]
        test_batch_num += 1
        X, y = X.as_in_context(self.ctx), y.as_in_context(self.ctx)
        y = y.astype('float32')
        start_time = time()
        output = model(X)
        test_time += time() - start_time
        # test_loss_value_raw += loss(output, y).mean().asscalar()
        test_loss_value += loss(output, y).mean().asscalar()
        mae += MAE(y, output)
        rmse += RMSE(y, output)
        mape += masked_mape_np(y, output)
    test_loss_value /= test_batch_num
    mae /= test_batch_num
    rmse /= test_batch_num
    mape /= test_batch_num
    test_loader.reset()
    print(f" test_result: loss:{test_loss_value}, MAE:{mae}, MAPE:{mape}, RMSE:{rmse}, TIME:{test_time}")
    self.logger(test=[test_loss_value, mae, mape, rmse, test_time])
    self.logger.update_data_units()
    self.logger.flush_log()
    return [mae, mape, rmse, test_time]
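# Hedged usage sketch: `actions` is a list of 4-element stage actions whose
# first entry is the training-stage action; [-1, -1, -1, -1] marks padding.
# `agent` and the action values here are hypothetical.
actions = [[1, 2, 1, 2], [3, 1, 2, 1], [-1, -1, -1, -1]]
mae, mape, rmse, test_time = agent.test_model(actions)
print(f'MAE:{mae}, MAPE:{mape}, RMSE:{rmse}, time:{test_time}')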
def train_net(args, ctx):
    logger.auto_set_dir()

    from symbols.tiny import resnet101_deeplab_new
    sym_instance = resnet101_deeplab_new()
    sym = sym_instance.get_symbol(NUM_CLASSES, is_train=True, memonger=False)
    # digraph = mx.viz.plot_network(sym, save_format='pdf')
    # digraph.render()

    # setup multi-gpu
    gpu_nums = len(ctx)
    input_batch_size = args.batch_size * gpu_nums

    train_data = get_data("train", DATA_DIR, LIST_DIR, len(ctx))
    test_data = get_data("val", DATA_DIR, LIST_DIR, len(ctx))

    # infer max shape
    max_scale = [args.crop_size]
    max_data_shape = [('data', (args.batch_size, 3,
                                max([v[0] for v in max_scale]),
                                max([v[1] for v in max_scale])))]
    max_label_shape = [('label', (args.batch_size, 1,
                                  max([v[0] for v in max_scale]),
                                  max([v[1] for v in max_scale])))]

    # infer shape
    data_shape_dict = {
        'data': (args.batch_size, 3, args.crop_size[0], args.crop_size[1]),
        'label': (args.batch_size, 1, args.crop_size[0], args.crop_size[1])
    }
    pprint.pprint(data_shape_dict)
    sym_instance.infer_shape(data_shape_dict)

    eval_sym_instance = resnet101_deeplab_new()

    # load and initialize params
    epoch_string = args.load.rsplit("-", 2)[1]
    begin_epoch = 1
    if not args.scratch:
        begin_epoch = int(epoch_string)
        logger.info('continue training from {}'.format(begin_epoch))
        arg_params, aux_params = load_init_param(args.load, convert=True)
    else:
        logger.info(args.load)
        arg_params, aux_params = load_init_param(args.load, convert=True)
        sym_instance.init_weights(arg_params, aux_params)

    # check parameter shapes
    sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict)

    data_names = ['data']
    label_names = ['label']

    mod = MutableModule(
        sym, data_names=data_names, label_names=label_names, context=ctx,
        max_data_shapes=[max_data_shape for _ in range(gpu_nums)],
        max_label_shapes=[max_label_shape for _ in range(gpu_nums)],
        fixed_param_prefix=fixed_param_prefix)

    # decide training params
    # metric
    fcn_loss_metric = metric.FCNLogLossMetric(args.frequent)
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [fcn_loss_metric]:
        eval_metrics.add(child_metric)

    # callback
    batch_end_callbacks = [callback.Speedometer(input_batch_size, frequent=args.frequent)]
    # batch_end_callbacks = [mx.callback.ProgressBar(total=train_data.size / train_data.batch_size)]
    epoch_end_callbacks = [
        mx.callback.module_checkpoint(mod,
                                      os.path.join(logger.get_logger_dir(), "mxnetgo"),
                                      period=1, save_optimizer_states=True),
    ]

    from mxnet.lr_scheduler import FactorScheduler
    # note: with the default factor=1 this scheduler never decays the rate
    lr_scheduler = FactorScheduler(800)

    # optimizer
    optimizer_params = {
        'wd': 0.0005,
        'learning_rate': 2.5e-2,
        'lr_scheduler': lr_scheduler,
        'rescale_grad': 1.0,
        'clip_gradient': None
    }

    logger.info("epoch scale = {}".format(EPOCH_SCALE))
    mod.fit(train_data=train_data,
            args=args,
            eval_sym_instance=eval_sym_instance,
            eval_data=test_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callbacks,
            batch_end_callback=batch_end_callbacks,
            kvstore=kvstore,
            optimizer='sgld',
            optimizer_params=optimizer_params,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch,
            epoch_scale=EPOCH_SCALE,
            validation_on_last=validation_on_last)
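# Hedged invocation sketch; `parse_args` and `num_gpus` are hypothetical, and
# train_net itself relies on module-level names (NUM_CLASSES, DATA_DIR,
# LIST_DIR, kvstore, end_epoch, EPOCH_SCALE, ...) defined elsewhere.
if __name__ == '__main__':
    args = parse_args()
    ctx = [mx.gpu(i) for i in range(args.num_gpus)]
    train_net(args, ctx)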
data_shapes = {
    'data': (batch_size, state_dimension),
    'policy_score': (batch_size,),
    'policy_backward_action': (batch_size, action_dimension),
    'critic_label': (batch_size,),
    'var': (batch_size, action_dimension),
}
sym = actor_critic_policy_sym(action_dimension)
net = Base(data_shapes=data_shapes, sym_gen=sym, name='ACNet',
           initializer=mx.initializer.Xavier(rnd_type='gaussian',
                                             factor_type='avg',
                                             magnitude=1.0),
           ctx=ctx)
lr_scheduler = FactorScheduler(500, 0.1)
if args.optimizer == 'sgd':
    optimizer = mx.optimizer.create(name='sgd',
                                    learning_rate=args.lr,
                                    lr_scheduler=lr_scheduler,
                                    momentum=0.9,
                                    clip_gradient=None,
                                    rescale_grad=1.0, wd=0.)
elif args.optimizer == 'adam':
    optimizer = mx.optimizer.create(name='adam',
                                    learning_rate=args.lr,
                                    lr_scheduler=lr_scheduler)
else:
    raise ValueError('optimizer must be chosen between adam and sgd')
updater = mx.optimizer.get_updater(optimizer)
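# Hedged sketch of one update step with the Base net configured above; it
# mirrors the forward/backward/update pattern used elsewhere in this codebase,
# and the batch contents are illustrative placeholders.
states = numpy.random.randn(batch_size, state_dimension)
outputs = net.forward(is_train=True, data=states)
actions = outputs[0].asnumpy()
# ...interact with the environment to obtain scores and critic targets...
net.backward(policy_score=numpy.zeros(batch_size, dtype=numpy.float32),
             policy_backward_action=actions,
             critic_label=numpy.zeros(batch_size, dtype=numpy.float32))
net.update(updater)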
def train_model(self, action):
    # action belongs to stage4: Training stage
    if action[0] == 1:  # LF1
        loss = mx.gluon.loss.L2Loss()
    else:
        loss = mx.gluon.loss.HuberLoss()
    # must set batch_size before init model
    batch_size = self.batch_size_option[action[1] - 1]
    self.config['batch_size'] = batch_size
    model = Model(self.action_trajectory, self.config, self.ctx, self.adj_SIPM)
    model.initialize(ctx=self.ctx)
    lr_option = [1e-3, 7e-4, 1e-4]
    opt_option = ['rmsprop', 'adam', 'adam']
    lr = lr_option[action[2] - 1]
    if action[3] == 1:
        step = max(1, self.epochs // 10)
        lr_scheduler = FactorScheduler(step, factor=0.7, base_lr=lr)
        opt = mx.gluon.Trainer(model.collect_params(), opt_option[action[3] - 1],
                               {'lr_scheduler': lr_scheduler})
    elif action[3] == 2:
        opt = mx.gluon.Trainer(model.collect_params(), opt_option[action[3] - 1],
                               {'learning_rate': lr})
    else:
        global_train_steps = self.training_samples // batch_size + 1
        max_update_factor = 1
        lr_sch = mx.lr_scheduler.PolyScheduler(
            max_update=global_train_steps * self.epochs * max_update_factor,
            base_lr=lr, pwr=2, warmup_steps=global_train_steps)
        opt = mx.gluon.Trainer(model.collect_params(), opt_option[action[3] - 1],
                               {'lr_scheduler': lr_sch})

    self.logger(action=self.actions)
    model_structure = deepcopy(self.actions)
    try:
        train_loader, val_loader, test_loader = self.data[batch_size]
        if self.mode == 'search' or self.mode == 'train':
            # train
            train_time = 0.
            best_mae = float('inf')
            best_epoch = 0
            best_test_mae = float('inf')
            best_test_res = None
            for epoch in range(self.config['epochs']):
                loss_value = 0
                mae = 0
                rmse = 0
                mape = 0
                train_batch_num = 0
                for X in train_loader:
                    y = X.label[0]
                    X = X.data[0]
                    train_batch_num += 1
                    X, y = X.as_in_context(self.ctx), y.as_in_context(self.ctx)
                    with autograd.record():
                        y = y.astype('float32')
                        start_time = time()
                        output = model(X)
                        train_time += time() - start_time
                        l = loss(output, y)
                        # if self.test:
                        #     return
                    l.backward()
                    opt.step(batch_size)
                    loss_value += l.mean().asscalar()
                    mae += MAE(y, output)
                    rmse += RMSE(y, output)
                    mape += masked_mape_np(y, output)
                train_loader.reset()
                loss_value /= train_batch_num
                mae /= train_batch_num
                rmse /= train_batch_num
                mape /= train_batch_num
                self.logger(train=[epoch, loss_value, mae, mape, rmse, train_time])
                print(f" epoch:{epoch}, loss:{loss_value}")
                if self.mode == 'train':
                    eval_loss_value, mae, rmse, mape, val_time = self.eval_model(
                        val_loader, model, loss)
                    self.logger(eval=[eval_loss_value, mae, mape, rmse, val_time])
                    self.logger.save_GNN(model, model_structure, mae)
                    # early stopping on validation MAE
                    if mae < best_mae:
                        best_mae = mae
                        best_epoch = epoch
                    if epoch - best_epoch > 10:
                        print(f'early stop at epoch:{epoch}')
                        break
                    mae, mape, rmse, test_time = self.test_model_without_load(
                        test_loader, model, loss)
                    if mae < best_test_mae:
                        best_test_mae = mae
                        best_test_res = [mae, mape, rmse, test_time]
                    print(f'test_res:{best_test_res}')

        if self.mode == 'search':
            eval_loss_value, mae, rmse, mape, val_time = self.eval_model(
                val_loader, model, loss)
            # get reward
            if self.time_max - val_time > 0:
                reward = -mae / 10 + np.power(np.e, -5) * np.log2(self.time_max - val_time)
            else:
                reward = -10
            if np.isnan(reward) or np.isinf(reward) or reward < -100:
                self.logger.append_log_file(f"Warning: reward={reward}")
                reward = -10
            self.logger(eval=[eval_loss_value, mae, mape, rmse, val_time])
            self.logger.save_GNN(model, model_structure,
                                 reward / len(self.action_trajectory) + 1)
            return reward, False
        elif self.mode == 'train':
            self.logger.append_log_file(f'best_test_res:{best_test_res}')
            mae, mape, rmse, test_time = self.test_model(test_loader, loss)
            return best_test_res, [mae, mape, rmse, test_time]
        elif self.mode == 'test':
            mae, mape, rmse, test_time = self.test_model(test_loader, loss)
            return None, [mae, mape, rmse, test_time]
    except Exception as e:
        self.logger.append_log_file(e.args[0])
        self.logger(train=None, eval=None, test=None)
        traceback.print_exc()
        return -10, True
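# Worked example of the search-mode reward above, with illustrative numbers:
# mae = 3.0, time_max = 100 s and val_time = 60 s give
#   reward = -3.0 / 10 + e**-5 * log2(100 - 60)
#          = -0.3 + 0.0067 * 5.32
#          ≈ -0.264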